1. 正则表达式
正则表达式就是字符串的匹配规则,在多数编程语言里都有相应的支持,python里对应的模块是re
1.1 字符相关
匹配文本中的指定文本
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("Py小白雨", text)
print(data_list)
# ['Py小白雨', 'Py小白雨'] 可用于计算字符串中某个字符出现的次数
[abc] 匹配a或b或c 字符
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("[Py]", text)
print(data_list)
# ['P', 'y', 'P', 'y']
data_list = re.findall("P[y]", text)
print(data_list)
# ['Py', 'Py']
[^abc] 匹配除了 a、b、c 以外的其他字符。
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("[^y]", text)
print(data_list)
# ['你', '好', '我', '是', 'P', '小', '白', '雨', ',', '我', '的', 'c', 's', 'd', 'n', '名', '称', '也', '是', 'P', '小', '白', '雨']
[a-z] 匹配a~z的任意字符( [0-9]也可以 )。
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("[a-z]", text)
print(data_list)
# ['y', 'c', 's', 'd', 'n', 'y']
. 代指除换行符以外的任意字符。
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("P.小", text)
print(data_list)
# ['Py小', 'Py小']
data_list = re.findall("P.+小", text) # # 贪婪匹配
print(data_list)
# ['Py小白雨,我的csdn名称也是Py小']
data_list = re.findall("P.+?小", text) # # 非贪婪匹配
print(data_list)
# ['Py小', 'Py小']
\w 代指字母或数字或下划线(默认的 Unicode 模式下也能匹配汉字;加上 re.ASCII 后只匹配 [a-zA-Z0-9_])。
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("P\w+雨", text)
print(data_list)
# ['Py小白雨', 'Py小白雨']
\d 代指数字
import re
text = "root-ad888min-add999-admd99in"
data_list = re.findall("d\d", text)
print(data_list)
# ['d8', 'd9', 'd9']
data_list = re.findall("d\d+", text)
print(data_list)
# ['d888', 'd999', 'd99']
\s 代指任意的空白符,包括空格、制表符等。
import re
text = "root admin add admin"
data_list = re.findall("a\w+\s\w+", text)
print(data_list)
# ['admin add']
1.2 数量相关
* 重复0次或更多次
import re
text = "你好我是Py小白雨,我的csdn名称也是Py小白雨"
data_list = re.findall("Py*小白雨", text)
print(data_list)
# ['Py小白雨', 'Py小白雨']
+ 重复1次或更多次
import re
text = "他是大佬啊,确实是个大佬1啊,大佬2啊,大佬666啊。"
data_list = re.findall("大佬\d+啊", text)
print(data_list)
# ['大佬1啊', '大佬2啊', '大佬666啊']
? 重复0次或1次
import re
text = "他是大佬啊,确实是个大佬1啊,大佬2啊,大佬666啊。"
data_list = re.findall("大佬\d?啊", text)
print(data_list)
# ['大佬啊', '大佬1啊', '大佬2啊']
{n} 重复n次
import re
text = "有串带码12131216789"
data_list = re.findall("121312\d{5}", text)
print(data_list)
# ['12131216789']
{n,} 重复n次或更多次
import re
text = "两串随机数字111111111,5588996625544"
data_list = re.findall("\d{9,}", text)
print(data_list)
# ['111111111', '5588996625544']
{n,m} 重复n到m次
import re
text = "两串随机数字111111111,5588996625544"
data_list = re.findall("\d{10,15}", text)
print(data_list) # [ '5588996625544']
1.3 括号(分组)
提取数据区域
import re
text = "两串随机数字111111111,55889966255441"
data_list = re.findall("55889966(2\d{5})", text)
print(data_list)
#['255441']
data_list = re.findall("55(88)9966(2\d{5})", text)
print(data_list)
#[('88', '255441')]
data_list = re.findall("(55889966(2\d{5}))", text)
print(data_list)
# [('55889966255441', '255441')]
获取指定区域 + 或条件
import re
text = "楼主15131root太牛15131sdsf,在sadadadadwwwwrrweq,手机号也可151312111111,搞起来呀"
data_list = re.findall("15131(2\d{5}|r\w+太)", text)
print(data_list)
# ['root太', '211111']
data_list = re.findall("(15131(2\d{5}|r\w+太))", text)
print(data_list)
# [('15131root太', 'root太'), ('15131211111', '211111')]
1.4 起始和结束
如果要求用户输入的内容必须以指定的内容开头和结尾,就需要用到如下两个字符。
^开始$结束
import re
text = "啊111111111@werd.adssw‘"
email_list = re.findall("^\w+@\w+.\w+$", text, re.ASCII)
print(email_list)
# []
text = "111111111@werd.ad"
email_list = re.findall("^\w+@\w+.\w+$", text, re.ASCII)
print(email_list)
# ['111111111@werd.ad']
这种写法多用于校验用户输入的数据格式(注意:示例正则中的 . 没有转义,会匹配任意字符;严格校验邮箱时应写成 \.),例如:
import re
text = input("请输入邮箱:")
email = re.findall("^\w+@\w+.\w+$", text, re.ASCII)
if not email:
    print("邮箱格式错误")
else:
    print(email)
1.5 特殊字符
由于正则表达式中 * . \ { } ( ) 等都具有特殊的含义,所以如果想要在正则中匹配这种指定的字符,需要转义,例如:
import re
text = "我是Py{5}小白雨"
data = re.findall("Py{5}小白雨", text)
print(data)
# []
data = re.findall("Py\{5\}小白雨", text)
print(data)
# ['Py{5}小白雨']
2. re模块
可以处理正则表达式并对文本进行处理。
获取匹配到的所有数据:findall()
import re
text = "dsf130429191912015219k13042919591219521Xkk"
data_list = re.findall("(\d{6})(\d{4})(\d{2})(\d{2})(\d{3})([0-9]|X)", text)
print(data_list)
# [('130429', '1919', '12', '01', '521', '9'), ('130429', '1959', '12', '19', '521', 'X')]
从起始位置开始匹配,匹配成功返回一个对象,未匹配成功返回None:match()
import re
text = "大小逗2B最逗3B欢乐"
data = re.match("逗\dB", text)
print(data)
# None
data = re.match("逗\dB", text)
if data:
    content = data.group()
    print(content)
# 无输出:match() 只从起始位置匹配,而 text 以"大小"开头,data 为 None;只有当 text 以"逗2B"开头时才会输出 "逗2B"
浏览整个字符串去匹配第一个,未匹配成功返回None:search()
import re
text = "大小逗2B最逗3B欢乐"
data = re.search("逗\dB", text)
if data:
    print(data.group())
# "逗2B"
替换匹配成功的位置:sub()
import re
text = "逗2B最逗3B欢乐"
data = re.sub("\dB", "沙雕", text)
print(data)
# 逗沙雕最逗沙雕欢乐
data = re.sub("\dB", "沙雕", text, 1)
print(data)
# 逗沙雕最逗3B欢乐
根据匹配成功的位置分割:split()
import re
text = "逗2B最逗3B欢乐"
data = re.split("\dB", text)
print(data)
# ['逗', '最逗', '欢乐']
data = re.split("\dB", text, 1)
print(data)
# ['逗', '最逗3B欢乐']
在输入字符串中找到所有匹配内容,返回可迭代的对象:finditer()
import re
text = "逗2B最逗3B欢乐"
data = re.finditer("\dB", text)
for item in data:
    print(item.group())
# 2B
# 3B
data = re.finditer("(?P<xx>\dB)", text) # 命名分组
for item in data:
    print(item.groupdict())
# {'xx': '2B'}
# {'xx': '3B'}
text = "dsf130429191912015219k13042919591219521Xkk"
data_list = re.finditer("\d{6}(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})\d{3}[\d|X]", text)
for item in data_list:
    info_dict = item.groupdict()
    print(info_dict)
# {'year': '1919', 'month': '12', 'day': '01'}
# {'year': '1959', 'month': '12', 'day': '19'}
本文详细介绍了Python中的正则表达式,包括字符相关匹配、数量相关、括号分组、起始和结束等概念,并通过实例演示了如何使用re模块进行文本匹配、查找、替换和分割等操作,适用于数据处理和字符串校验场景。
4万+

被折叠的 条评论
为什么被折叠?



