正则表达式是处理文本的强大工具,Python 通过 re 模块提供了正则表达式支持。下面是一些从基础到进阶的正则表达式训练内容。
1. 基础匹配
import re
# Check whether the string contains "python".
text = "I love python programming"
pattern = r"python"
# re.search scans the whole string and returns None when nothing matches,
# so guard before calling .group().
match = re.search(pattern, text)
if match:
    print("Found:", match.group())  # Found: python
2. 常用元字符
# \d matches a digit; {n} repeats the preceding token exactly n times.
text = "My number is 123-456-7890"
pattern = r"\d{3}-\d{3}-\d{4}"
match = re.search(pattern, text)
print(match[0])  # 123-456-7890

# \w matches letters, digits and the underscore.
text = "User_name: john_doe123"
pattern = r"\w+"
matches = re.findall(pattern, string=text)
print(matches)  # ['User_name', 'john_doe123']

# . matches any single character except a newline.
text = "cat bat hat mat"
pattern = r".at"
matches = re.findall(pattern, string=text)
print(matches)  # ['cat', 'bat', 'hat', 'mat']
3. 字符集和量词
# [] character set: [Ss] matches either "S" or "s".
text = "The rain in Spain falls mainly in the plain"
pattern = r"[Ss]pain"
matches = re.findall(pattern, text)
# Only "Spain" matches: "plain" contains "lain", not "spain".
print(matches)  # ['Spain']

# Quantifiers: *, +, ?, {n,m}
text = "color colour colooor"
pattern = r"colo[u]?r"  # ? means the preceding u occurs 0 or 1 times
matches = re.findall(pattern, text)
# "colooor" does not match: after "colo" comes another "o", not "u" or "r".
print(matches)  # ['color', 'colour']

pattern = r"colo[u]+r"  # + means the preceding u occurs 1 or more times
matches = re.findall(pattern, text)
# Only "colour" contains at least one "u" between "colo" and "r".
print(matches)  # ['colour']
4. 分组和捕获
# Extracting with capturing groups: with two groups in the pattern,
# re.findall returns a list of (group1, group2) tuples.
text = "John: 30, Jane: 25"
pattern = r"(\w+): (\d+)"
matches = re.findall(pattern, text)
for name, age in matches:
    print(f"{name} is {age} years old")
# John is 30 years old
# Jane is 25 years old

# Non-capturing group (?:...): groups the alternation without creating
# a numbered capture.
text = "hello world"
pattern = r"(?:hello|hi) world"
match = re.search(pattern, text)
print(match.group())  # hello world
5. 边界匹配
# ^ anchors at the start of the string, $ at the end.
text = "apple banana apple"

pattern = r"^apple"
match = re.search(pattern, string=text)
print(match[0])  # apple

pattern = r"apple$"
match = re.search(pattern, string=text)
print(match[0])  # apple

# \b is a word boundary.
text = "cat concatenate catfish"
pattern = r"\bcat\b"  # matches only the standalone word "cat"
matches = re.findall(pattern, string=text)
print(matches)  # ['cat']
6. 查找替换
# Replacing with re.sub(): \1, \2, \3 in the replacement refer to the
# captured year, month and day groups.
text = "Today is 2023-05-15"
pattern = r"(\d{4})-(\d{2})-(\d{2})"
replacement = r"\2/\3/\1"  # month/day/year

new_text = re.sub(pattern, repl=replacement, string=text)
print(new_text)  # Today is 05/15/2023
7. 进阶练习
# Validate e-mail addresses.
emails = ["test@example.com", "invalid@.com", "user.name@domain.co.uk"]
pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
for email in emails:
    # re.fullmatch requires the whole string to match. With re.match, "$"
    # would also accept an address followed by a trailing newline.
    if re.fullmatch(pattern, email):
        print(f"{email} is valid")
    else:
        print(f"{email} is invalid")

# Extract the text content of HTML tags.
html = "<h1>Title</h1><p>Paragraph 1</p><p>Paragraph 2</p>"
# (.*?) is non-greedy, so each match stops at the first closing tag.
pattern = r"<[^>]+>(.*?)</[^>]+>"
matches = re.findall(pattern, html)
print(matches)  # ['Title', 'Paragraph 1', 'Paragraph 2']
8. 实用技巧
# Compile the regular expression (worthwhile when it is reused many times).
pattern = re.compile(r"\b\w{4}\b")  # matches words of exactly 4 word characters
text = "This is a sample text with some words"
matches = pattern.findall(text)
# "sample" (6) and "words" (5) are too long; \b prevents partial matches.
print(matches)  # ['This', 'text', 'with', 'some']

# Case-insensitive matching.
text = "Python is awesome, PYTHON is powerful"
pattern = r"python"
matches = re.findall(pattern, text, flags=re.IGNORECASE)
print(matches)  # ['Python', 'PYTHON']

# Multi-line mode: with re.MULTILINE, ^ also matches at the start of each line.
text = """First line
Second line
Third line"""
pattern = r"^[A-Za-z]+"
matches = re.findall(pattern, text, flags=re.MULTILINE)
print(matches)  # ['First', 'Second', 'Third']
9. 复杂示例
# Parse log lines into (timestamp, level, message) tuples.
log = """
[2023-05-15 10:30:45] INFO: User 'admin' logged in
[2023-05-15 10:31:02] ERROR: Database connection failed
[2023-05-15 10:32:15] WARNING: Disk space low
"""
# "." does not match a newline, so the final (.*) stops at the end of each line.
pattern = r"\[(.*?)\] (.*?): (.*)"
matches = re.findall(pattern, log)
for timestamp, level, message in matches:
    # {level:7} pads the level to 7 characters so the columns line up.
    print(f"{timestamp} | {level:7} | {message}")

# Split a URL into its components.
url = "https://www.example.com:8080/path/to/resource?query=string#fragment"
# Port, query and fragment sit inside optional non-capturing groups; their
# capture groups are None when that part is absent from the URL.
pattern = r"(https?)://([^/:]+)(?::(\d+))?(/[^?#]*)(?:\?([^#]*))?(?:#(.*))?"
match = re.match(pattern, url)
if match:
    protocol, domain, port, path, query, fragment = match.groups()
    print(f"Protocol: {protocol}")
    print(f"Domain: {domain}")
    print(f"Port: {port}")
    print(f"Path: {path}")
    print(f"Query: {query}")
    print(f"Fragment: {fragment}")
要掌握正则表达式,最重要的是多练习。可以从简单的模式开始,逐步构建更复杂的表达式。在线工具如 regex101.com 可以帮助你测试和调试正则表达式。