Python 正则表达式训练-CSDN博客

本文链接：https://blog.csdn.net/2301_81882590/article/details/147029796

正则表达式是处理文本的强大工具，Python通过`re`模块提供了正则表达式支持。下面是一些基础到进阶的正则表达式训练内容。

1. 基础匹配

import re

# Check whether the string contains "python".
# re.search scans the whole string and returns a match object or None.
text = "I love python programming"
pattern = r"python"
match = re.compile(pattern).search(text)
if match is not None:
    print("Found:", match[0])  # Found: python

2. 常用元字符

# \d matches one decimal digit; {n} repeats the preceding item exactly n times.
text = "My number is 123-456-7890"
pattern = r"\d{3}-\d{3}-\d{4}"
match = re.compile(pattern).search(text)
print(match[0])  # 123-456-7890

# \w matches letters, digits and the underscore.
text = "User_name: john_doe123"
pattern = r"\w+"
# Collect the full text of every non-overlapping match, left to right.
matches = [m.group() for m in re.finditer(pattern, text)]
print(matches)  # ['User_name', 'john_doe123']

# . matches any single character except a newline.
text = "cat bat hat mat"
pattern = r".at"
matches = re.compile(pattern).findall(text)
print(matches)  # ['cat', 'bat', 'hat', 'mat']

3. 字符集和量词

# [] character set: [Ss] matches either an uppercase "S" or a lowercase "s".
text = "The rain in Spain falls mainly in the plain"
pattern = r"[Ss]pain"
matches = re.findall(pattern, text)
# Only "Spain" is found: "plain" contains neither "Spain" nor "spain".
print(matches)  # ['Spain']

# Quantifiers: *, +, ?, {n,m}
# The last word is "colouur" (two u's) so that the + example below has a
# multi-u spelling to match.
text = "color colour colouur"
pattern = r"colo[u]?r"  # ? lets the preceding u appear 0 or 1 times
matches = re.findall(pattern, text)
print(matches)  # ['color', 'colour']

pattern = r"colo[u]+r"  # + requires the preceding u to appear 1 or more times
matches = re.findall(pattern, text)
print(matches)  # ['colour', 'colouur']

4. 分组和捕获

# Extracting with groups: each (...) captures one part of the match.
text = "John: 30, Jane: 25"
pattern = r"(\w+): (\d+)"
# One (name, age) tuple per match.
matches = [m.groups() for m in re.finditer(pattern, text)]
for name, age in matches:
    print(f"{name} is {age} years old")
# John is 30 years old
# Jane is 25 years old

# Non-capturing group (?:...): groups the alternatives without capturing them.
text = "hello world"
pattern = r"(?:hello|hi) world"
match = re.compile(pattern).search(text)
print(match[0])  # hello world

5. 边界匹配

# ^ anchors the match at the start of the string, $ at the end.
text = "apple banana apple"
pattern = r"^apple"
match = re.compile(pattern).search(text)
print(match[0])  # apple

pattern = r"apple$"
match = re.compile(pattern).search(text)
print(match[0])  # apple

# \b marks a word boundary.
text = "cat concatenate catfish"
pattern = r"\bcat\b"  # matches only the standalone word "cat"
matches = [m.group() for m in re.finditer(pattern, text)]
print(matches)  # ['cat']

6. 查找替换

# re.sub() replaces every match; \1, \2, \3 refer to the captured groups.
text = "Today is 2023-05-15"
pattern = r"(\d{4})-(\d{2})-(\d{2})"
replacement = r"\2/\3/\1"  # month/day/year
new_text = re.compile(pattern).sub(replacement, text)
print(new_text)  # Today is 05/15/2023

7. 进阶练习

# Validate e-mail addresses.
emails = ["test@example.com", "invalid@.com", "user.name@domain.co.uk"]
pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
# Compiled once under its own name so the helper below keeps working even if
# `pattern` is rebound later in the script.
EMAIL_RE = re.compile(pattern)


def is_valid_email(email):
    """Return True if the whole of *email* matches the e-mail pattern.

    ``fullmatch`` is used instead of ``match`` because ``$`` also matches
    just before a trailing newline, so ``match`` would accept an address
    that is followed by a newline character.
    """
    return EMAIL_RE.fullmatch(email) is not None


for email in emails:
    if is_valid_email(email):
        print(f"{email} is valid")
    else:
        print(f"{email} is invalid")

# Extract the text between HTML tags; the lazy (.*?) stops at the first closing tag.
html = "<h1>Title</h1><p>Paragraph 1</p><p>Paragraph 2</p>"
pattern = r"<[^>]+>(.*?)</[^>]+>"
# Keep only the captured group (the tag content) of each match.
matches = [m.group(1) for m in re.finditer(pattern, html)]
print(matches)  # ['Title', 'Paragraph 1', 'Paragraph 2']

8. 实用技巧

# Compile the regular expression (more efficient when it is used many times).
pattern = re.compile(r"\b\w{4}\b")  # matches words of exactly 4 word characters
text = "This is a sample text with some words"
matches = pattern.findall(text)
# "words" has five characters, so it is not matched.
print(matches)  # ['This', 'text', 'with', 'some']

# Case-insensitive matching.
text = "Python is awesome, PYTHON is powerful"
pattern = r"python"
matches = re.compile(pattern, flags=re.IGNORECASE).findall(text)
print(matches)  # ['Python', 'PYTHON']

# Multiline mode: ^ matches at the start of every line, not just the string.
text = """First line
Second line
Third line"""
pattern = r"^[A-Za-z]+"
matches = [m.group() for m in re.finditer(pattern, text, flags=re.MULTILINE)]
print(matches)  # ['First', 'Second', 'Third']

9. 复杂示例

# Parse log lines of the form "[timestamp] LEVEL: message".
log = """
[2023-05-15 10:30:45] INFO: User 'admin' logged in
[2023-05-15 10:31:02] ERROR: Database connection failed
[2023-05-15 10:32:15] WARNING: Disk space low
"""

pattern = r"\[(.*?)\] (.*?): (.*)"
# One (timestamp, level, message) tuple per log line.
matches = [m.groups() for m in re.finditer(pattern, log)]
for timestamp, level, message in matches:
    # {level:7} pads the level to 7 characters so the columns line up.
    print(f"{timestamp} | {level:7} | {message}")

# Split a URL into its parts: scheme, host, optional port, path,
# optional query string and optional fragment.
url = "https://www.example.com:8080/path/to/resource?query=string#fragment"
pattern = r"(https?)://([^/:]+)(?::(\d+))?(/[^?#]*)(?:\?([^#]*))?(?:#(.*))?"
match = re.compile(pattern).match(url)
if match is not None:
    protocol, domain, port, path, query, fragment = match.groups()
    print(
        f"Protocol: {protocol}",
        f"Domain: {domain}",
        f"Port: {port}",
        f"Path: {path}",
        f"Query: {query}",
        f"Fragment: {fragment}",
        sep="\n",
    )

要掌握正则表达式，最重要的是多练习。可以从简单的模式开始，逐步构建更复杂的表达式。在线工具如 regex101.com 可以帮助你测试和调试正则表达式。