Web Scraping Basics
Web scraping extracts data from websites. Python's requests + BeautifulSoup combination is the most popular approach for static pages.
# pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
# Fetch a webpage
url = "https://quotes.toscrape.com" # A practice scraping site
response = requests.get(url)
response.raise_for_status()
# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")
# Find elements
title = soup.title.text
print(title) # "Quotes to Scrape"
# Find first element with selector
first_quote = soup.find("div", class_="quote")
# Find all matching elements
all_quotes = soup.find_all("div", class_="quote")
for quote in all_quotes:
text = quote.find("span", class_="text").text
author = quote.find("small", class_="author").text
print(f"{author}: {text[:50]}...")CSS Selectors
# select() uses CSS selector syntax
quotes = soup.select("div.quote") # class selector
texts = soup.select("span.text") # All text spans
first = soup.select_one("div.quote span.text") # First match
# Common selectors
soup.select("h1") # Tag name
soup.select(".classname") # Class
soup.select("#id") # ID
soup.select("a[href]") # Has attribute
soup.select("a[href^='https']") # href starts with
soup.select("div > p") # Direct child
soup.select("div p") # Any descendant
# Accessing attributes and text
link = soup.find("a")
link["href"] # Get attribute value
link.text # Text content
link.get_text() # Text (strips HTML)
link.string # Direct string content (None if has children)Scraping Multiple Pages
# Scrape paginated content
base_url = "https://quotes.toscrape.com"
all_quotes = []
page = 1
while True:
url = f"{base_url}/page/{page}/"
response = requests.get(url)
if response.status_code == 404:
break # No more pages
soup = BeautifulSoup(response.text, "html.parser")
quotes = soup.find_all("div", class_="quote")
if not quotes:
break
for q in quotes:
all_quotes.append({
"text": q.find("span", class_="text").text,
"author": q.find("small", class_="author").text,
"tags": [t.text for t in q.find_all("a", class_="tag")]
})
# Check for next page
next_btn = soup.find("li", class_="next")
if not next_btn:
break
page += 1
import time
time.sleep(1) # Be polite โ don't hammer the server!
print(f"Scraped {len(all_quotes)} quotes")Important Considerations
Legal and Ethical Guidelines
Always check robots.txt (e.g., site.com/robots.txt). Respect rate limits — add delays between requests. Don't scrape personal data. Check the site's Terms of Service. Use official APIs when available.
# Consult robots.txt before scraping
import urllib.robotparser

robot_rules = urllib.robotparser.RobotFileParser()
robot_rules.set_url("https://example.com/robots.txt")
robot_rules.read()  # Download and parse the rules
robot_rules.can_fetch("*", "https://example.com/page")  # Can we scrape this?

# Identify your bot by setting a descriptive user agent
headers = {"User-Agent": "MyBot/1.0 (contact@example.com)"}
response = requests.get(url, headers=headers)
# Handle dynamic JS-rendered pages (need Playwright or Selenium)
# pip install playwright
# playwright install chromium
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto("https://dynamic-site.com")
page.wait_for_selector(".content")
html = page.content()
soup = BeautifulSoup(html, "html.parser")
browser.close()Key Takeaways
- requests + BeautifulSoup: the standard combo for static HTML
- soup.find() vs soup.find_all(): one vs all matching elements
- CSS selectors with soup.select(): powerful and familiar syntax
- Add delays between requests: time.sleep(1) — be a polite scraper
- Check robots.txt: understand what you're allowed to scrape
Practice Exercises
- Scrape all quotes from quotes.toscrape.com (all pages). Save to JSON.
- Scrape book titles and prices from books.toscrape.com. Filter books under £20.
- Scrape the Python changelog from docs.python.org/3/whatsnew/ and extract version numbers and dates.