2

I'm trying to scrape IMDb: search for a particular title, follow the first link in the search results, and then print the year the movie was released (and later other info). However, I can't figure out which part of the HTML to pass to .find().

The first function works: it returns the link (href) of the first search result, which is then joined with the original URL to build the URL of the movie page.

Thanks for the help, been stuck on this for days!

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin # For joining next page url with base url

# Read the title and split it on whitespace so the words can be joined with '+'.
search_terms = input("What movie do you want to know about?\n> ").split()

# Build the search URL by placing the '+'-joined words in the q= parameter.
url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + '+'.join(search_terms) + '&s=all'

def scrape_find_next_page(url):
    """Return the href of the first link in the first ``result_text`` cell.

    Downloads ``url`` with a browser-like User-Agent header, parses the
    response body with BeautifulSoup's ``html.parser`` and looks up the first
    ``<td>`` whose CSS class is ``result_text``.  The value of the ``href``
    attribute of the first ``<a>`` inside that cell is returned.
    """
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    document = BeautifulSoup(page.text, "html.parser")

    # A string as the second argument to find() filters on the CSS class.
    first_result_cell = document.find('td', 'result_text')
    first_link = first_result_cell.find('a')
    return first_link.get('href')


# Href of the first search result for the URL built from the user's input.
next_page_url = scrape_find_next_page(url)

# Resolve that href against the search URL to get the address of the result page.
new_page = urljoin(url, next_page_url)



def scrape_movie_data(next_page_url):
    """Fetch a page and return the text of the link inside the ``titleYear`` span.

    NOTE(review): ``next_page_url`` is not used in the body; the request
    below is made to the module-level ``url``.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Requests the module-level ``url`` rather than the ``next_page_url`` argument.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # A string second argument makes find() look for a <span> whose CSS class is 'titleYear'.
    title_year = soup.find('span','titleYear').find('a').get_text()

    return title_year

# Scrape the resolved result-page URL and print whatever the function returns.
print(scrape_movie_data(new_page))
J.C. Diaz
  • 43
  • 1
  • 4
  • Use DevTools in Chrome/Firefox to find the element (this works if the page doesn't use JavaScript to load the data). – furas Nov 10 '16 at 04:39

1 Answer

2

First problem: in scrape_movie_data(next_page_url) you pass url instead of next_page_url to requests.get(), so you read the wrong page (the search results again instead of the movie page).

response = requests.get(next_page_url, headers=headers)

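Second problem: you have to use {'id': 'titleYear'} in find(). A plain string as the second argument is matched against the element's class attribute, but titleYear is the span's id.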

title_year = soup.find('span', {'id': 'titleYear'}).find('a').get_text()

Final version:

def scrape_movie_data(next_page_url, timeout=10):
    """Return the release year shown on a movie page.

    Args:
        next_page_url: Absolute URL of the movie page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.  Without a timeout the request
            can block indefinitely on a stalled connection.

    Returns:
        The text of the first ``<a>`` inside ``<span id="titleYear">``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        AttributeError: If the page has no ``<span id="titleYear">`` or the
            span contains no link (``find`` returns ``None`` in that case).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(next_page_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # titleYear is the span's id, so it must be matched through an attribute
    # dict; a bare string would be compared against the CSS class instead.
    title_year = soup.find('span', {'id': 'titleYear'}).find('a').get_text()

    return title_year

EDIT: search Google for "IMDB API". Some interesting results:

SO - IMDB API to retrieve character information

SO - Does IMDB provide an API?

and you can get results as JSON so you don't have to scrape.

Other portals:

OMDb API - The Open Movie Database

The Movie DB API


EDIT: JSON data

from urllib.parse import quote_plus

import requests

# Search endpoint that returns JSON; the {} placeholder receives the query.
url = 'http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q={}'
# Alternative endpoint kept for reference (nm=on instead of tt=on):
#url = 'http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q={}'

headers = {'User-Agent': 'Mozilla/5.0'}

title = input("Title: ").split()

# Send the whole title, not only its first word, and percent-encode it so
# characters such as '&' or '#' cannot corrupt the query string.  Joining
# also avoids an IndexError when the user enters nothing.
query = quote_plus(' '.join(title))

response = requests.get(url.format(query), headers=headers)

data = response.json()

# Each entry of data['title_popular'] is printed in short form, then in full.
for x in data['title_popular']: # data['title_approx']:
    print('title:', x['title'])
    # The first four characters of title_description are printed as the year.
    print(' year:', x['title_description'][:4])
    print('---')
    print('  id:', x['id'])
    print('name:', x['name'])
    print('        title:', x['title'])
    print('episode_title:', x['episode_title'])
    print('title_description:', x['title_description'])
    print('      description:', x['description'])
    print('------------------------------------')
Community
  • 1
  • 1
furas
  • 134,197
  • 12
  • 106
  • 148