Regular Expressions
Regular expressions (regex) are patterns for matching text. Powerful for validation, extraction, and transformation. Python's re module provides full regex support.
import re
# Basic matching
pattern = r"hello" # r prefix for raw string (essential for regex!)
text = "say hello to the world"
match = re.search(pattern, text) # Find first match anywhere
if match:
    print(match.group()) # "hello"
    print(match.start()) # 4 — start index
    print(match.end()) # 9 — end index
# re.fullmatch — entire string must match
re.fullmatch(r"\d+", "123") # Match
re.fullmatch(r"\d+", "123abc") # None — not full match
# re.findall — return all matches
emails_text = "contact alice@example.com or bob@test.org"
emails = re.findall(r"[\w.]+@[\w.]+", emails_text)
# ['alice@example.com', 'bob@test.org']
# re.finditer — iterate over Match objects
for m in re.finditer(r"\d+", "abc 123 def 456"):
    print(m.group(), m.start())
Regex Syntax Reference
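| Pattern | Matches |
|---|---|
| `.` | Any character except newline |
| `\d` | Digit (`[0-9]`; also other Unicode digits unless `re.ASCII` is set) |
| `\w` | Word char (`[a-zA-Z0-9_]`; also Unicode letters and digits unless `re.ASCII` is set) |
| `\s` | Whitespace (space, tab, newline) |
| `\D`, `\W`, `\S` | Opposite of above (uppercase) |
| `[abc]` | Any of a, b, or c |
| `[^abc]` | Any character NOT a, b, c |
| `[a-z]` | Any lowercase letter |
| `^` | Start of string (of each line with `re.MULTILINE`) |
| `$` | End of string, or just before a trailing newline |
| `*` | 0 or more |
| `+` | 1 or more |
| `?` | 0 or 1 |
| `{n}` | Exactly n times |
| `{m,n}` | Between m and n times |
| `(abc)` | Group (capture) |
| `a\|b` | a or b |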
Groups and Substitution
# Capturing groups
pattern = r"(\w+)@(\w+)\.(\w+)"
match = re.search(pattern, "alice@example.com")
if match:
    print(match.group(0)) # Full match: alice@example.com
    print(match.group(1)) # Group 1: alice
    print(match.group(2)) # Group 2: example
    print(match.group(3)) # Group 3: com
# Named groups
pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<tld>\w+)"
match = re.search(pattern, "alice@example.com")
if match:
    print(match.group("user")) # alice
    print(match.group("domain")) # example
# re.sub — substitution
text = "Hello, World!"
cleaned = re.sub(r"[^\w\s]", "", text) # Remove punctuation
# "Hello World"
# re.sub with function
def mask_email(match):
    """Replacement callback for ``re.sub`` that masks an email's user name.

    ``match`` is expected to carry the user name in group 1 and the domain
    in group 2. Only the first character of the user name is kept; the
    rest is replaced by ``***``.
    """
    user = match[1]  # subscript access is equivalent to match.group(1)
    return f"{user[0]}***@{match.group(2)}"
masked = re.sub(r"(\w+)@(\w+\.\w+)", mask_email, "alice@example.com")
# "a***@example.com"
# re.split
parts = re.split(r"[,;\s]+", "one, two; three four")
# ["one", "two", "three", "four"]
Practical Patterns
# Email validation (simplified)
def is_valid_email(email):
    """Return True if ``email`` has the shape of an email address (simplified).

    The entire string must be a local part, an ``@``, a domain, and a final
    dot followed by at least two ASCII letters. This is a loose shape
    check, not full RFC 5322 validation.

    ``re.fullmatch`` is used instead of ``re.match`` with a ``$`` anchor:
    ``$`` also matches just before a trailing newline, so the anchored form
    would accept an address that ends in a newline character.
    """
    pattern = r"[\w.-]+@[\w.-]+\.[a-zA-Z]{2,}"
    return bool(re.fullmatch(pattern, email))
# Phone number extraction
def extract_phones(text):
    """Return all US-style 10-digit phone numbers found in ``text``.

    Accepted shapes are an optionally parenthesised 3-digit area code
    followed by 3 and 4 digits, with at most one whitespace character, dot,
    or hyphen between the groups, e.g. "555-123-4567", "(555) 123-4567",
    "555.123.4567", or "5551234567".

    The digit lookarounds at both ends stop the pattern from carving a
    "phone number" out of a longer run of digits such as an order ID.
    """
    pattern = r"(?<!\d)\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}(?!\d)"
    return re.findall(pattern, text)
# URL extraction
def extract_urls(text):
    """Return every http(s) URL found in ``text``, in order of appearance.

    A URL is taken to run from its ``http://`` or ``https://`` scheme up to
    the next whitespace character. Because that also swallows punctuation
    that merely ends the surrounding sentence, trailing ``.,;:!?`` characters
    are trimmed from each result: "see https://example.com." yields
    "https://example.com".
    """
    pattern = r"https?://[^\s]+"
    return [url.rstrip(".,;:!?") for url in re.findall(pattern, text)]
# Compile for performance (when using same pattern many times)
email_re = re.compile(r"[\w.]+@[\w.]+")
phone_re = re.compile(r"\d{3}-\d{4}")
emails = email_re.findall(text)
phones = phone_re.findall(text)
# Flags
re.search(r"hello", "Hello World", re.IGNORECASE)
re.findall(r"^\w+", multiline_text, re.MULTILINE) # ^ matches each line start
Key Takeaways
- Always use raw strings: `r"pattern"` avoids backslash confusion
- re.search vs re.match: search finds anywhere, match only at start
- re.findall: returns list of all matches
- Groups for extraction: parentheses capture sub-patterns
- re.compile(): pre-compile patterns used repeatedly for performance
Practice Exercises
- Write a function that validates a strong password: at least 8 chars, one uppercase, one lowercase, one digit, one special character.
- Write a function that extracts all URLs from a text document and normalizes them (remove trailing slashes, convert to lowercase domain).
- Parse log lines in the format `2024-01-15 ERROR Failed to connect to database` — extract date, level, and message into a dict.