I have writen this function to scrape top 10 results from google search:
def google_search(self,query):
"""
This function returns the urls of top 10 of google search result for a keyword
"""
params = {'q':query}
url = 'https://www.google.com/search?'+urllib.urlencode(params)
result = urlfetch.fetch(url=url)
content = result.content
soup = BeautifulSoup(content)
list = soup.findAll("li", {'class':'g'})
urls = []
for item in list:
link = item.findAll('a')[0]
url = 'https://www.google.com'+link['href']
urls.append(url.encode('utf-8'))
return urls
Then I wrote this other function that find related wikepedia articles based on google search
def wikipedia_search(self,query,language='en'):
"""
This function returns a list of urls and title of top wikepedia search result for a keyword
"""
q = query+u' site:%s.wikipedia.org' %language
urls = self.google_search(q.encode('utf-8'))
list =[]
for url in urls:
title = re.findall(r'/wiki/(.*)&s',url.encode('utf-8'))[0].replace("_"," ")
link = re.findall(r'q=(.*)&s',url)[0]
url_tag = {'url':link ,'title' :title}
list.append(url_tag)
return list
But when i try some search in arabic language I get result like this : {'title': '%25D8%25AD%25D9%2583%25D9%2588%25D9%2585%25D8%25A9', 'url': 'https://ar.wikipedia.org/wiki/%25D8%25AD%25D9%2583%25D9%2588%25D9%2585%25D8%25A9'}, {'title': '%25D8%25A8%25D9%258A%25D8%25AA %25D9%2588%25D9%258A%25D9%2586%25D8%25AF%25D8%25B3%25D9%2588%25D8%25B1', 'url': 'https://ar.wikipedia.org/wiki/%25D8%25A8%25D9%258A%25D8%25AA_%25D9%2588%25D9%258A%25D9%2586%25D8%25AF%25D8%25B3%25D9%2588%25D8%25B1'} that basically I can not explore.