Regular Expressions

Regular expressions (regex) are patterns for matching text. They are powerful tools for validation, extraction, and transformation. Python's re module provides full regex support.

import re

# Basic matching
pattern = r"hello"    # raw string (r prefix) keeps backslashes intact for the regex engine
text = "say hello to the world"

# re.search scans the whole string and returns the first match, or None
match = re.search(pattern, text)
if match is not None:
    # group() is the matched text; start()/end() are its slice bounds in text
    print(match.group(), match.start(), match.end(), sep="\n")   # hello, 4, 9 (one per line)

# re.fullmatch — succeeds only if the pattern consumes the entire string
re.fullmatch(r"\d+", "123")       # Match object spanning "123"
re.fullmatch(r"\d+", "123abc")    # None — the trailing "abc" is not digits

# re.findall — every non-overlapping match, returned as a list of strings
emails_text = "contact alice@example.com or bob@test.org"
emails = re.findall(pattern=r"[\w.]+@[\w.]+", string=emails_text)
# ['alice@example.com', 'bob@test.org']

# re.finditer — yields Match objects one by one, so positions are available too
for m in re.finditer(r"\d+", "abc 123 def 456"):
    print(f"{m.group()} {m.start()}")   # "123 4", then "456 12"

Regex Syntax Reference

Pattern	Matches
`.`	Any character except newline
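`\d`	Digit [0-9] (any Unicode decimal digit for str patterns unless re.ASCII is set)
`\w`	Word char [a-zA-Z0-9_] (plus Unicode letters and digits for str patterns unless re.ASCII is set)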
`\s`	Whitespace (space, tab, newline)
`\D, \W, \S`	Opposite of above (uppercase)
`[abc]`	Any of a, b, or c
`[^abc]`	Any character NOT a, b, c
`[a-z]`	Any lowercase letter
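`^`	Start of string (also the start of each line with re.MULTILINE)
`$`	End of string, or just before a trailing newline (also the end of each line with re.MULTILINE)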
`*`	0 or more
`+`	1 or more
`?`	0 or 1
`{n}`	Exactly n times
`{m,n}`	Between m and n times
`(abc)`	Group (capture)
`a|b`	a or b

Groups and Substitution

# Capturing groups — each (...) saves the text it matched
pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(pattern, "alice@example.com")
if match is not None:
    # match[n] is shorthand for match.group(n); index 0 is the whole match
    print(match[0])   # Full match: alice@example.com
    print(match[1])   # Group 1: alice
    print(match[2])   # Group 2: example
    print(match[3])   # Group 3: com

# Named groups — (?P<name>...) lets you fetch a group by name instead of position
pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<tld>\w+)"
match = re.search(pattern, "alice@example.com")
if match is not None:
    # group() with several names returns a tuple; unpack it so each value
    # prints on its own line
    print(*match.group("user", "domain"), sep="\n")   # alice, then example

# re.sub — replace every match of the pattern with the replacement
text = "Hello, World!"
# [^\w\s] matches anything that is neither a word character nor whitespace,
# so substituting "" removes punctuation
cleaned = re.compile(r"[^\w\s]").sub("", text)
# "Hello World"

# re.sub with function — the callable receives each Match and returns its replacement
def mask_email(match):
    """Return the matched address with all but the first user character masked.

    Reads group 1 as the user part and group 2 as the domain.
    """
    local_part, domain = match.group(1, 2)
    return f"{local_part[0]}***@{domain}"

masked = re.sub(
    r"(\w+)@(\w+\.\w+)",
    mask_email,
    "alice@example.com",
)
# "a***@example.com"

# re.split — break a string wherever the separator pattern matches
# (here: any run of commas, semicolons and/or whitespace)
parts = re.split(pattern=r"[,;\s]+", string="one, two;  three four")
# ["one", "two", "three", "four"]

Practical Patterns

# Email validation (simplified)
def is_valid_email(email):
    """Return True if email looks like user@domain.tld, otherwise False.

    This is a deliberately simplified shape check, not full RFC 5322
    validation: one or more word/dot/hyphen characters, an "@", a domain,
    and a final label of at least two ASCII letters.
    """
    pattern = r"[\w.-]+@[\w.-]+\.[a-zA-Z]{2,}"
    # fullmatch instead of match with ^...$: "$" also matches just before a
    # trailing newline, so "user@example.com\n" used to be accepted.
    return re.fullmatch(pattern, email) is not None

# Phone number extraction
def extract_phones(text):
    """Return every US-style 10-digit phone number found in text, in order.

    Accepts forms such as (555) 123-4567, 555-123-4567, 555.123.4567 and
    5551234567. Returns a list of the matched substrings (empty if none).
    """
    pattern = (
        r"(?:\(|(?<!\d))"      # an opening paren, or a spot not preceded by a digit
        r"\d{3}\)?"            # area code, optionally closed by ")"
        r"[\s.-]?\d{3}"        # optional separator + next three digits
        r"[\s.-]?\d{4}"        # optional separator + last four digits
        r"(?!\d)"              # must not run on into more digits
    )
    # The digit-boundary checks stop the pattern from carving a "phone number"
    # out of a longer digit run such as a card or order number.
    return re.findall(pattern, text)

# URL extraction
def extract_urls(text):
    """Return every http(s) URL found in text, in order of appearance.

    Punctuation that belongs to the surrounding sentence rather than the URL
    (a trailing period, comma, quote, or an unmatched closing parenthesis) is
    trimmed from each result.
    """
    pattern = r"https?://\S+"    # \S+ is greedy, so it also swallows trailing punctuation
    trailing = ".,;:!?\"'"
    urls = []
    for url in re.findall(pattern, text):
        url = url.rstrip(trailing)
        # Drop ")" only while it is unbalanced, so URLs such as
        # .../wiki/Python_(language) keep their own parenthesis.
        while url.endswith(")") and url.count(")") > url.count("("):
            url = url[:-1].rstrip(trailing)
        urls.append(url)
    return urls

# Compile for performance (when using same pattern many times)
email_re = re.compile(r"[\w.]+@[\w.]+")
phone_re = re.compile(r"\d{3}-\d{4}")

# Self-contained sample input, so the demo does not depend on whatever an
# earlier snippet left in a variable and actually produces matches
contact_text = "Reach alice@example.com at 555-1234 or bob@test.org at 555-9876"

# Compiled patterns expose the same methods as the module-level functions
emails = email_re.findall(contact_text)   # ['alice@example.com', 'bob@test.org']
phones = phone_re.findall(contact_text)   # ['555-1234', '555-9876']

# Flags
re.search(r"hello", "Hello World", re.IGNORECASE)   # Match "Hello" — case is ignored

# Sample input for the MULTILINE demo (the name was previously undefined)
multiline_text = "first line\nsecond line\nthird line"
re.findall(r"^\w+", multiline_text, re.MULTILINE)   # ^ matches each line start
# ['first', 'second', 'third'] — without the flag only ['first']

Key Takeaways

Always use raw strings: r"pattern" avoids backslash confusion
re.search vs re.match: search finds anywhere, match only at start
re.findall: returns a list of all matches (if the pattern contains capture groups, it returns the groups instead of the whole match)
Groups for extraction: parentheses capture sub-patterns
re.compile(): pre-compile patterns used repeatedly for performance

Practice Exercises

Write a function that validates a strong password: at least 8 chars, one uppercase, one lowercase, one digit, one special character.
Write a function that extracts all URLs from a text document and normalizes them (remove trailing slashes, convert to lowercase domain).
Parse log lines in the format 2024-01-15 ERROR Failed to connect to database — extract date, level, and message into a dict.

← Web Scraping Generators and Iterators →