How to Scrape Multiple Pages of a Website Using a Python Web Scraper

Originally posted on Medium

Extracting data and ensuring data quality


Where We Left Off

This was the code we used:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
headers = {"Accept-Language": "en-US, en;q=0.5"}
results = requests.get(url, headers=headers)

soup = BeautifulSoup(results.text, "html.parser")

#initiate data storage
titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

movie_div = soup.find_all('div', class_='lister-item mode-advanced')

#our loop through each container
for container in movie_div:

        #name
        name = container.h3.a.text
        titles.append(name)
        
        #year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)

        # runtime
        runtime = container.p.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else '-'
        time.append(runtime)

        #IMDb rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)

        #metascore
        m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
        metascores.append(m_score)

        #there are two NV containers, grab both of them as they hold both the votes and the grosses
        nv = container.find_all('span', attrs={'name': 'nv'})
        
        #filter nv for votes
        vote = nv[0].text
        votes.append(vote)
        
        #filter nv for gross
        grosses = nv[1].text if len(nv) > 1 else '-'
        us_gross.append(grosses)

#pandas dataframe        
movies = pd.DataFrame({
'movie': titles,
'year': years,
'timeMin': time,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes,
'us_grossMillions': us_gross,
})

#cleaning data 
movies['year'] = movies['year'].str.extract(r'(\d+)').astype(int)
movies['timeMin'] = movies['timeMin'].str.extract(r'(\d+)').astype(int)
movies['metascore'] = movies['metascore'].astype(int)
movies['votes'] = movies['votes'].str.replace(',', '').astype(int)
movies['us_grossMillions'] = movies['us_grossMillions'].map(lambda x: x.lstrip('$').rstrip('M'))
movies['us_grossMillions'] = pd.to_numeric(movies['us_grossMillions'], errors='coerce')

#add dataframe to csv file named 'movies.csv'
movies.to_csv('movies.csv')

And our results looked like this: a 50-row DataFrame with columns for title, year, runtime, IMDb rating, metascore, votes, and US gross.


What We’ll Cover

In this follow-up we’ll loop the scraper through every page of IMDb’s Top 1000 list, control our crawl rate so we don’t hammer the server, fix an error in the previous article’s metascore-cleaning code, and finish with some basic data-quality checks.


Introducing New Tools

We’ll add two tools from Python’s standard library: sleep from the time module, which pauses execution, and randint from the random module, which returns a random integer in a given range. Together they let us wait a random interval between requests.


Time to Code

Import tools

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from time import sleep
from random import randint

Initialize your storage

#initialize empty lists where you'll store your data
titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

English movie titles

As in the last article, we send an Accept-Language header with every request so IMDb returns English-language titles instead of titles localized to your IP address’s region.

headers = {"Accept-Language": "en-US, en;q=0.5"}

Analyzing our URL

Click through the search results and watch the address bar: each page’s URL carries a start parameter that grows by 50 with every page (start=1, start=51, start=101, ...), because each page lists 50 movies. That parameter is the only part of the URL we need to change.
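
As an illustration (a sketch; the tutorial itself builds the URL by string concatenation), an f-string makes the moving part explicit:

start = 51  # hypothetical value: the second page of results
url = f"https://www.imdb.com/search/title/?groups=top_1000&start={start}&ref_=adv_nxt"
print(url)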

Refresher on 'for' loops

for <variable> in <iterable>:
    <statement(s)>
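
For example, a minimal loop that prints each value it is handed:

# each pass binds one item from the list to the variable 'start'
for start in [1, 51, 101]:
    print(start)
# prints 1, then 51, then 101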

Changing the URL Parameter

NumPy’s arange(start, stop, step) generates every value our start parameter needs to take: it counts from 1 up to (but not including) 1001 in steps of 50.

pages = np.arange(1, 1001, 50)
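
A quick sanity check (worth running once) confirms the 20 start values, one per page of 50 results:

import numpy as np

pages = np.arange(1, 1001, 50)
print(pages)       # the start values: 1, 51, 101, ..., 951
print(len(pages))  # 20 pages x 50 movies = 1000 titles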

Looping Through Each Page

Everything that follows (the request, the parsing, and the scraping code itself) now lives inside a for loop over pages, so it runs once per page.

for page in pages:

Requesting the URL + 'soup' + 'movie_div'

Inside the loop we request each page, splicing the current start value into the URL, parse the response with BeautifulSoup, and collect that page's 50 movie containers. Note that the response gets its own variable, response, rather than overwriting the loop variable page.

  response = requests.get("https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page) + "&ref_=adv_nxt", headers=headers)

  soup = BeautifulSoup(response.text, 'html.parser')

  movie_div = soup.find_all('div', class_='lister-item mode-advanced')
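
One optional safeguard that isn't in the original code: requests doesn't raise an exception on a 4xx/5xx status by itself, so a blocked or missing page would silently yield empty soup. Calling raise_for_status() on the response surfaces the problem immediately. A minimal standalone sketch:

import requests

response = requests.get(
    "https://www.imdb.com/search/title/?groups=top_1000&start=1&ref_=adv_nxt",
    headers={"Accept-Language": "en-US,en;q=0.5"},
)
response.raise_for_status()  # raises requests.HTTPError if the status is 4xx/5xx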

Controlling the Crawl Rate

To avoid flooding IMDb with requests (and risking a block), we pause for a random 2 to 10 seconds after each request.

  sleep(randint(2,10))
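
Here is the pattern in isolation, if you want to see what it does (a minimal sketch):

from time import sleep
from random import randint

delay = randint(2, 10)  # a random whole number of seconds, 2 through 10 inclusive
print(f"waiting {delay} seconds before the next request")
sleep(delay)            # pauses execution for that long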

Our code should now look like this:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from time import sleep
from random import randint

headers = {"Accept-Language": "en-US,en;q=0.5"}

titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

pages = np.arange(1, 1001, 50)

for page in pages:

  response = requests.get("https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page) + "&ref_=adv_nxt", headers=headers)

  soup = BeautifulSoup(response.text, 'html.parser')

  movie_div = soup.find_all('div', class_='lister-item mode-advanced')

  sleep(randint(2, 10))

Scraping Code

The scraping code from the previous article now nests one level deeper, inside the page loop, so it runs on every container of every page.

  for container in movie_div:
    
    name = container.h3.a.text
    
    titles.append(name)
    
    year = container.h3.find('span', class_='lister-item-year').text
    years.append(year)

    runtime = container.p.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else '-'
    time.append(runtime)

    imdb = float(container.strong.text)
    imdb_ratings.append(imdb)
    
    m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
    metascores.append(m_score)

    nv = container.find_all('span', attrs={'name': 'nv'})
        
    vote = nv[0].text
    votes.append(vote)
        
    grosses = nv[1].text if len(nv) > 1 else '-'
    us_gross.append(grosses)

Pointing Out Previous Errors

There is a conflict between two pieces of the old code: the scraping line that stores a placeholder and the cleaning line that casts the column to int.

  m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
  metascores.append(m_score)
movies['metascore'] = movies['metascore'].astype(int)

What does this mean?

The scraping code appends the string '-' whenever a movie has no metascore, but astype(int) cannot convert '-' to an integer. On a single page of top-rated movies we may never encounter a missing metascore, so the old code appeared to work; across all 1,000 movies we eventually will, and the cast raises a ValueError.
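
You can reproduce the failure on a two-element toy Series:

import pandas as pd

s = pd.Series(['96', '-'])  # '-' is the placeholder our scraper appends
s.astype(int)               # ValueError: invalid literal for int() with base 10: '-'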


Fixing the Cleaning of the Metascore Data Code

# old: breaks as soon as the column contains '-'
movies['metascore'] = movies['metascore'].astype(int)

# new: keep only the digits, then coerce anything non-numeric to NaN
movies['metascore'] = movies['metascore'].str.extract(r'(\d+)')
movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')
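
Here is the fixed pipeline on toy data (a sketch; expand=False makes extract return a Series rather than a one-column DataFrame):

import pandas as pd

s = pd.Series(['96', '84', '-'])
s = s.str.extract(r'(\d+)', expand=False)  # '-' has no digits, so it becomes NaN
s = pd.to_numeric(s, errors='coerce')      # digit strings become numbers; NaN survives
print(s)  # 96.0, 84.0, NaN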

Add the DataFrame and Cleaning Code

With the page loop finished, we build the DataFrame and clean each column as before, swapping in the corrected metascore code:

movies = pd.DataFrame({
'movie': titles,
'year': years,
'timeMin': time,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes,
'us_grossMillions': us_gross,
})

movies['year'] = movies['year'].str.extract(r'(\d+)').astype(int)

movies['timeMin'] = movies['timeMin'].str.extract(r'(\d+)').astype(int)

movies['votes'] = movies['votes'].str.replace(',', '').astype(int)

movies['metascore'] = movies['metascore'].str.extract(r'(\d+)')
movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')

movies['us_grossMillions'] = movies['us_grossMillions'].map(lambda x: x.lstrip('$').rstrip('M'))
movies['us_grossMillions'] = pd.to_numeric(movies['us_grossMillions'], errors='coerce')

Save to CSV

movies.to_csv('movies.csv')
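
One optional tweak: by default to_csv also writes the DataFrame's index as an unnamed first column. Pass index=False if you don't want it:

movies.to_csv('movies.csv', index=False)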


Basic Data-Quality Best Practices (Optional)

Missing data

Before trusting the scraped data, check where values are missing and how many:

print(movies.isnull().sum())

#fill missing values with a placeholder (note: this turns the numeric columns back into strings)
movies['metascore'] = movies['metascore'].fillna("None Given")
movies['us_grossMillions'] = movies['us_grossMillions'].fillna("")

print(movies['metascore'])
print(movies['us_grossMillions'])

# note: dropna returns a new DataFrame rather than modifying movies in place,
# so assign the result (e.g. movies = movies.dropna()) if you want to keep it

# drop all rows with any NA values:
movies.dropna()

# or keep only rows with at least a specified number of non-NA values:
movies.dropna(thresh=10)

# drop all columns with any NA values:
movies.dropna(axis=1, how='any')

The Final Code

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from time import sleep
from random import randint

headers = {"Accept-Language": "en-US,en;q=0.5"}

titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

pages = np.arange(1, 1001, 50)

for page in pages:

  response = requests.get("https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page) + "&ref_=adv_nxt", headers=headers)

  soup = BeautifulSoup(response.text, 'html.parser')
  movie_div = soup.find_all('div', class_='lister-item mode-advanced')
  
  sleep(randint(2,10))

  for container in movie_div:

        name = container.h3.a.text
        titles.append(name)
        
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)

        runtime = container.p.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else ''
        time.append(runtime)

        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)

        m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else ''
        metascores.append(m_score)

        nv = container.find_all('span', attrs={'name': 'nv'})
        
        vote = nv[0].text
        votes.append(vote)
        
        grosses = nv[1].text if len(nv) > 1 else ''
        us_gross.append(grosses)

movies = pd.DataFrame({
'movie': titles,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes,
'us_grossMillions': us_gross,
'timeMin': time
})

movies['votes'] = movies['votes'].str.replace(',', '').astype(int)

movies['year'] = movies['year'].str.extract(r'(\d+)').astype(int)

movies['timeMin'] = movies['timeMin'].str.extract(r'(\d+)').astype(int)

movies['metascore'] = movies['metascore'].str.extract(r'(\d+)')
movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')

movies['us_grossMillions'] = movies['us_grossMillions'].map(lambda x: x.lstrip('$').rstrip('M'))
movies['us_grossMillions'] = pd.to_numeric(movies['us_grossMillions'], errors='coerce')


# to see your dataframe
print(movies)

# to see the datatypes of your columns
print(movies.dtypes)

# to see where you're missing data and how much data is missing 
print(movies.isnull().sum())

# to move all your scraped data to a CSV file
movies.to_csv('movies.csv')

Conclusion

We now have a scraper that walks all 20 pages of IMDb’s Top 1000, throttles its own crawl rate, cleans every column into a usable dtype, and saves the result to movies.csv, plus a few quick checks to run before trusting the data.
