Web Scraping Basics
Web scraping extracts data from websites. Python's requests + BeautifulSoup combination is the most popular approach for static pages.
# pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
# Fetch a webpage
url = "https://quotes.toscrape.com" # A practice scraping site
response = requests.get(url)
response.raise_for_status()
# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")
# Find elements
title = soup.title.text
print(title) # "Quotes to Scrape"
# Find first element with selector
first_quote = soup.find("div", class_="quote")
# Find all matching elements
all_quotes = soup.find_all("div", class_="quote")
for quote in all_quotes:
text = quote.find("span", class_="text").text
author = quote.find("small", class_="author").text
print(f"{author}: {text[:50]}...")CSS Selectors
# select() uses CSS selector syntax
quotes = soup.select("div.quote") # class selector
texts = soup.select("span.text") # All text spans
first = soup.select_one("div.quote span.text") # First match
# Common selectors
soup.select("h1") # Tag name
soup.select(".classname") # Class
soup.select("#id") # ID
soup.select("a[href]") # Has attribute
soup.select("a[href^='https']") # href starts with
soup.select("div > p") # Direct child
soup.select("div p") # Any descendant
# Accessing attributes and text
link = soup.find("a")
link["href"] # Get attribute value
link.text # Text content
link.get_text() # Text (strips HTML)
link.string # Direct string content (None if has children)Scraping Multiple Pages
# Scrape paginated content
base_url = "https://quotes.toscrape.com"
all_quotes = []
page = 1
while True:
url = f"{base_url}/page/{page}/"
response = requests.get(url)
if response.status_code == 404:
break # No more pages
soup = BeautifulSoup(response.text, "html.parser")
quotes = soup.find_all("div", class_="quote")
if not quotes:
break
for q in quotes:
all_quotes.append({
"text": q.find("span", class_="text").text,
"author": q.find("small", class_="author").text,
"tags": [t.text for t in q.find_all("a", class_="tag")]
})
# Check for next page
next_btn = soup.find("li", class_="next")
if not next_btn:
break
page += 1
import time
time.sleep(1) # Be polite โ don't hammer the server!
print(f"Scraped {len(all_quotes)} quotes")Important Considerations
Legal and Ethical Guidelines
Always check robots.txt (e.g., site.com/robots.txt). Respect rate limits — add delays between requests. Don't scrape personal data. Check the site's Terms of Service. Use official APIs when available.
# Consult robots.txt before scraping
import urllib.robotparser

robot_rules = urllib.robotparser.RobotFileParser()
robot_rules.set_url("https://example.com/robots.txt")
robot_rules.read()  # Download and parse the rules
robot_rules.can_fetch("*", "https://example.com/page")  # Can we scrape this?

# Identify your bot by setting a descriptive user agent
headers = {"User-Agent": "MyBot/1.0 (contact@example.com)"}
response = requests.get(url, headers=headers)
# Handle dynamic JS-rendered pages (need Playwright or Selenium)
# pip install playwright
# playwright install chromium
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto("https://dynamic-site.com")
page.wait_for_selector(".content")
html = page.content()
soup = BeautifulSoup(html, "html.parser")
browser.close()Key Takeaways
- requests + BeautifulSoup: the standard combo for static HTML
- soup.find() vs soup.find_all(): one vs all matching elements
- CSS selectors with soup.select(): powerful and familiar syntax
- Add delays between requests: time.sleep(1) — be a polite scraper
- Check robots.txt: understand what you're allowed to scrape
Practice Exercises
- Scrape all quotes from quotes.toscrape.com (all pages). Save to JSON.
- Scrape book titles and prices from books.toscrape.com. Filter books under £20.
- Scrape the Python changelog from docs.python.org/3/whatsnew/ and extract version numbers and dates.