
I get an error when my Python script fetches the image URL from Wikipedia. It's in the part marked `#url img for wikipedia`: not every Google Places result has a URL, so the `url` field may be empty. For those records the script fails with the following error:

Traceback (most recent call last):
  File "C:\Users\Lenovo\fetch.py", line 98, in <module>
    soup = BeautifulSoup(requests.get(urladress).text,'html.parser')
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py", line 515, in request
    prep = self.prepare_request(req)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py", line 443, in prepare_request
    p.prepare(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\models.py", line 318, in prepare
    self.prepare_url(url, params)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\models.py", line 392, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '': No scheme supplied. Perhaps you meant http://?
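
The error comes straight from requests: any empty or scheme-less string fails URL preparation before a request is even sent. A minimal reproduction, independent of the rest of the script:

import requests

requests.get('')  # raises requests.exceptions.MissingSchema: Invalid URL ''
requests.get('https://en.wikipedia.org')  # fine once a scheme is present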

How do I add an `if` check on the URL so that the error does not occur? This is the code responsible for the problem:

#url img for wikipedia
from bs4 import BeautifulSoup
urladress = url #the url is not always present
soup = BeautifulSoup(requests.get(urladress).text,'html.parser')
imglinks = soup.find_all('a', attrs = {'class':'image'})[0]
for img in imglinks.find_all('img'):
    wiki_link = (img['src'].replace('//','https://'))
if (wiki_link is not None):
    img_link = wiki_link
else:
    img_link = -1   
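
Note that `wiki_link` is only ever assigned inside the for loop, so if the page has no matching image element the `is not None` check raises NameError instead of reaching the else branch. A minimal guard, assuming the same scraping approach, would be to initialize it first:

wiki_link = None  # make sure the name exists even when no <img> is found
for img in imglinks.find_all('img'):
    wiki_link = img['src'].replace('//','https://')
img_link = wiki_link if wiki_link is not None else -1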

Script call:

python fetch.py --directory output/england/Krakow --rating 4 --reviews 5000 --operator i --query "Atrakcje w Krakowie"

My Python script:

#!/usr/bin/env python
import csv
import json
import pathlib
import operator
import requests
import argparse
import configparser
import re

import wikipedia

# Read the API key from the configuration.
config = configparser.ConfigParser()
config.read('secrets.ini')

API_KEY = ''
PLACES_TYPES = ['atrakcja']
#'park', 'point_of_interest', 'establishment', 'museum', 'library', 'church', 'art_gallery', 'political'
# Search query operators.
OPERATORS = {
    'i': operator.and_,
    'lub': operator.or_
}

def fetch_place_detail(place_id):
    place_raw = requests.get(f'https://maps.googleapis.com/maps/api/place/details/json?placeid={place_id}&key={API_KEY}')
    try:
        return json.loads(place_raw.text)['result']
    except KeyError:
        raise KeyError('Index \'result\' does not exist')
        
# Add parameters for the search query.
parser = argparse.ArgumentParser()
parser.add_argument('--query', type=str, help='Search query for Google Maps API')
parser.add_argument('--directory', type=str, help='Output directory')
parser.add_argument('--rating', type=float, help='Minimum rating of the place(s)')
parser.add_argument('--reviews', type=int, help='Minimum review count of the place(s)')
parser.add_argument('--operator', default='and', choices=OPERATORS.keys(), type=str,
                                  help='Operation to perform between ratings and reviews count.')
parser.add_argument('--exclude', '-e', choices=PLACES_TYPES, nargs='+', type=str,
                                       help='Exclude the places from the query result')
parser.add_argument('--language', default='pl', choices=['pl', 'fr', 'de'], type=str,
                                  help='Language of the Wikipedia link')
parser.add_argument('--summary-length', type=int,
                                        help='Limit the number of sentences in place summary.')

args = parser.parse_args()
# Fetch the data.
places = requests.get(f'https://maps.googleapis.com/maps/api/place/textsearch/json?query={args.query}&language=pl&key={API_KEY}')
# Convert the response to a JSON object.
places = json.loads(places.text)['results']
if not places:
    raise Exception(f'No results found for query: {args.query}')

# Create the directory if it doesn't exist.
pathlib.Path(args.directory).mkdir(parents=True, exist_ok=True)
# Make the filename more readable, as this will appear as the layer title in Google My Maps.
query = args.query.split(' ')
filename = ' '.join([q.capitalize() for q in query])
# Set Wikipedia language.
wikipedia.set_lang(args.language)

columns = ['name', 'coordinates', 'types', 'rating', 'formatted_address', 'photo_reference', 'summary', 'url', 'reviews', 'img_link']
with open(args.directory + f'/{filename}.csv', 'w', encoding='utf-8') as out_file:
    writer = csv.writer(out_file, delimiter='|')
    writer.writerow(columns)
    for place in places:
        name = place['name']
        formatted_address = place['formatted_address']
        types = place['types']
        if 'photo_reference' in place:
            photo_reference = place['photo_reference']
        else:
            photo_reference = -1
        if 'user_ratings_total' in place:
            reviews = place['user_ratings_total']
        else:
            reviews = -1
        if 'rating' in place:
            rating = place['rating']
        else:
            rating = -1                 
        try:
            if args.summary_length:
                wiki_page = wikipedia.page(name, sentences=args.summary_length)
            else:
                wiki_page = wikipedia.page(name)
            url = wiki_page.url
            summary = wiki_page.summary.replace('\n', '')
        except KeyboardInterrupt:
            exit(-1)
        except:
            url, summary = '', ''
        #url img for wikipedia
        from bs4 import BeautifulSoup
        urladress = url
        soup = BeautifulSoup(requests.get(urladress).text,'html.parser')
        imglinks = soup.find_all('a', attrs = {'class':'image'})[0]
        for img in imglinks.find_all('img'):
            wiki_link = (img['src'].replace('//','https://'))
        if (wiki_link is not None):
            img_link = wiki_link
        else:
            img_link = -1           
        #else:
        #   img_link = wiki_link        
        # If item type is from the exclude list, skip it.
        if args.exclude:
            if list(set(args.exclude) & set(types)):
                continue
        # If an item doesn't satisfy the rating and review count criteria, skip it.
        if args.rating and args.reviews:
            rating = place['rating']
            if not OPERATORS[args.operator](rating >= args.rating, reviews >= args.reviews):
                continue
        elif args.rating:
            if not rating >= args.rating:
                continue
        elif args.reviews:
            if not reviews >= args.reviews:
                continue
                            

        lat, lng = place['geometry']['location']['lat'], place['geometry']['location']['lng']
        data = [name, (lat, lng), ', '.join(types), rating, formatted_address, photo_reference, summary, url, reviews, img_link]
        print(f'{filename} -> {data}')
        writer.writerow(data)

1 Answer


You can update your code like this to skip fetching the image when there is no URL:

...
with open(args.directory + f'/{filename}.csv', 'w', encoding='utf-8') as out_file:
    writer = csv.writer(out_file, delimiter='|')
    writer.writerow(columns)
    for place in places:
        name = place['name']
        formatted_address = place['formatted_address']
        types = place['types']
        if 'photo_reference' in place:
            photo_reference = place['photo_reference']
        else:
            photo_reference = -1
        if 'user_ratings_total' in place:
            reviews = place['user_ratings_total']
        else:
            reviews = -1
        if 'rating' in place:
            rating = place['rating']
        else:
            rating = -1
        try:
            if args.summary_length:
                wiki_page = wikipedia.page(name, sentences=args.summary_length)
            else:
                wiki_page = wikipedia.page(name)
            url = wiki_page.url
            summary = wiki_page.summary.replace('\n', '')
        except KeyboardInterrupt:
            exit(-1)
        except:
            url, summary = '', ''
        #url img for wikipedia
        from bs4 import BeautifulSoup
        if url:
            urladress = url
            soup = BeautifulSoup(requests.get(urladress).text,'html.parser')
            imglinks = soup.find_all('a', attrs = {'class':'image'})[0]
            for img in imglinks.find_all('img'):
                wiki_link = (img['src'].replace('//','https://'))
            if (wiki_link is not None):
                img_link = wiki_link
            else:
                img_link = -1
        else:
            # No Wikipedia url for this place: skip the image lookup entirely.
            img_link = -1
        # If item type is from the exclude list, skip it.
        if args.exclude:
            if list(set(args.exclude) & set(types)):
                continue
        # If an item doesn't satisfy the rating and review count criteria, skip it.
        if args.rating and args.reviews:
            rating = place['rating']
            if not OPERATORS[args.operator](rating >= args.rating, reviews >= args.reviews):
                continue
        elif args.rating:
            if not rating >= args.rating:
                continue
        elif args.reviews:
            if not reviews >= args.reviews:
                continue

        lat, lng = place['geometry']['location']['lat'], place['geometry']['location']['lng']
        data = [name, (lat, lng), ', '.join(types), rating, formatted_address, photo_reference, summary, url, reviews, img_link]
        print(f'{filename} -> {data}')
        writer.writerow(data)
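
A slightly more defensive variant of the same block (a sketch along the same lines, not from the original answer) also guards against the HTTP request failing and against pages with no `<a class="image">` element, by starting from a default and only overwriting it on success:

#url img for wikipedia
from bs4 import BeautifulSoup
img_link = -1  # default when no image can be resolved
if url:
    try:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        imglinks = soup.find_all('a', attrs={'class': 'image'})
        if imglinks:  # the page may contain no image links at all
            for img in imglinks[0].find_all('img'):
                img_link = img['src'].replace('//', 'https://')
    except requests.exceptions.RequestException:
        pass  # network/HTTP problem: keep the default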
  • After adding "if url:" I get the error: C:\Users\Lenovo>python fetch.py --directory output/england/Krakow --rating 4 --reviews 5000 --operator i --query "Atrakcje w Krakowie" File "C:\Users\Lenovo\fetch.py", line 108 from bs4 import BeautifulSoup TabError: inconsistent use of tabs and spaces in indentation – Break May 06 '22 at 16:05
  • That's not related to the code change but to your editor https://stackoverflow.com/questions/5685406/inconsistent-use-of-tabs-and-spaces-in-indentation – Alex May 06 '22 at 16:11
  • Thanks for the help, Alex. I was struggling with it all day. Basically, I run the script via CMD. – Break May 06 '22 at 18:31
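
As a follow-up to the TabError discussed in the comments: the standard-library tabnanny module (a suggestion beyond the original thread) can help pinpoint which lines have ambiguous tab/space indentation:

python -m tabnanny fetch.py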