1、介绍
Python 使用 re 模块提供了正则表达式处理的能力。
- 使用 | 位或运算开启多种选项
2、方法
2.1 编译
Signature: re.compile(pattern, flags=0)
- Docstring: Compile a regular expression pattern, returning a Pattern object.
- 设定 flags,编译模式,返回正则表达式对象 regex,pattern 就是正则表达式字符串。
- 正则表达式需要被编译,为了提高效率,这些编译后的结果被保存,下次使用同样的 pattern 的时候,就不需要再次编译
- re 的其它方法为了提高效率都调用了编译方法,就是为了提速
import re
regex = re.compile('b\w+')
print(regex, type(regex))
# re.compile('b\\w+') <class 're.Pattern'>
2.2 单次匹配
Signature: re.match(pattern, string, flags=0)
- Docstring:Try to apply the pattern at the start of the string, returning a Match object, or None if no match was found.
s = 'bottle\nbag\nbig\napple'
result = re.match('\Ab+', s)
print(result, result.span(), result.group())
print(result.string.encode())
Out:
<re.Match object; span=(0, 1), match='b'> (0, 1) b
b'bottle\nbag\nbig\napple'
Signature: re.search(pattern, string, flags=0)
- Docstring:Scan through string looking for a match to the pattern, returning a Match object, or None if no match was found.
Signature: re.fullmatch(pattern, string, flags=0)
- Docstring:Try to apply the pattern to all of the string, returning a Match object, or None if no match was found.
import re
s = """bottle\nbag\nbig\napple"""
print(s)
Out:
bottle
bag
big
apple
for i,c in enumerate(s, 1):
    print((i-1, c), end='\n' if i%5==0 else ' ')
print()
Out:
(0, 'b') (1, 'o') (2, 't') (3, 't') (4, 'l')
(5, 'e') (6, '\n') (7, 'b') (8, 'a') (9, 'g')
(10, '\n') (11, 'b') (12, 'i') (13, 'g') (14, '\n')
(15, 'a') (16, 'p') (17, 'p') (18, 'l') (19, 'e')
# '---match---'
print(re.match('b', s)) # <re.Match object; span=(0, 1), match='b'>
print(re.match('a', s)) # None
print(re.match('^a', s, re.M)) # None
print(re.match('^a', s, re.S)) # None
# 先编译再使用正则表达式
regex = re.compile('a')
print(regex.match(s)) # None
print(regex.match(s, 15)) # <re.Match object; span=(15, 16), match='a'>
print(regex.match(s, 8)) # <re.Match object; span=(8, 9), match='a'>
# '---search---'
print(re.search('a', s)) # <re.Match object; span=(8, 9), match='a'>
regex = re.compile('b')
print(regex.search('b')) # <re.Match object; span=(0, 1), match='b'>
print(regex.search(s, 1)) # <re.Match object; span=(7, 8), match='b'>
print(regex.search(s, 8)) # <re.Match object; span=(11, 12), match='b'>
regex = re.compile('^b', re.S)
print(regex.search(s)) # <re.Match object; span=(0, 1), match='b'>
print(regex.search(s, 8)) # None
regex = re.compile('^b', re.M)
print(regex.search(s)) # <re.Match object; span=(0, 1), match='b'>
print(regex.search(s, 8)) # <re.Match object; span=(11, 12), match='b'>
# ---fullmatch---
regex = re.compile('bag')
print(regex.fullmatch(s)) # None
print(regex.fullmatch(s, 7)) # None
print(regex.fullmatch(s, 7, 9)) # None
print(regex.fullmatch(s, 7, 10)) # <re.Match object; span=(7, 10), match='bag'>
# 需要完全匹配,多了少了都不行
2.3 全文搜索
- Signature: re.findall(pattern, string, flags=0)
- Docstring: Return a list of all non-overlapping matches in the string. If one or more capturing groups are present in the pattern, return a list of groups; this will be a list of tuples if the pattern has more than one group
- 对整个字符串,从左至右匹配,返回所有匹配项的列表
# ---findall---
regex = re.compile('b')
print(regex.findall(s)) # ['b', 'b', 'b']
print(regex.findall(s, 7)) # ['b', 'b']
regex = re.compile('^b')
print(regex.findall(s)) # ['b']
print(regex.findall(s, 7)) # []
regex = re.compile('^b', re.M)
print(regex.findall(s)) # ['b', 'b', 'b']
print(regex.findall(s, 7)) # ['b', 'b']
print(regex.findall(s, 7, 10)) # ['b']
regex = re.compile('^b', re.S)
print(regex.findall(s)) # ['b']
print(regex.findall(s, 7)) # []
- Signature: re.finditer(pattern, string, flags=0)
- Docstring: Return an iterator over all non-overlapping matches in the string. For each match, the iterator returns a Match object.
- 对整个字符串,从左至右匹配,返回所有匹配项,返回迭代器,注意每次迭代返回的是 match 对象
# ---finditer---
regex = re.compile('b')
result = regex.finditer(s)
print(result) # <callable_iterator object at 0x000002A5481652B0>
print(type(result)) # <class 'callable_iterator'>
r = next(result)
print(type(r), r) # <class 're.Match'> <re.Match object; span=(0, 1), match='b'>
print(r.start(), r.end(), s[r.start():r.end()]) # 0 1 b
r = next(result)
print(type(r), r) # <class 're.Match'> <re.Match object; span=(7, 8), match='b'>
print(r.start(), r.end(), s[r.start():r.end()]) # 7 8 b
list(re.finditer('[abc]', s))
Out:
[<re.Match object; span=(0, 1), match='b'>,
 <re.Match object; span=(7, 8), match='b'>,
 <re.Match object; span=(8, 9), match='a'>,
 <re.Match object; span=(11, 12), match='b'>,
 <re.Match object; span=(15, 16), match='a'>]
2.4 匹配替换
- Signature: re.sub(pattern, repl, string, count=0, flags=0)
- 使用 pattern 对字符串 string 进行匹配,对匹配项使用 repl 替换
- Signature: re.subn(pattern, repl, string, count=0, flags=0)
- 同 sub,但返回一个元组 (new_string, number_of_subs_made)
for i,c in enumerate(s, 1):
    print((i-1, c), end='\n' if i%5==0 else ' ')
print()
Out:
(0, 'b') (1, 'o') (2, 't') (3, 't') (4, 'l')
(5, 'e') (6, '\n') (7, 'b') (8, 'a') (9, 'g')
(10, '\n') (11, 'b') (12, 'i') (13, 'g') (14, '\n')
(15, 'a') (16, 'p') (17, 'p') (18, 'l') (19, 'e')
regex = re.compile('b\wg')
print(regex.sub('sybil', s))
# bottle
# sybil
# sybil
# apple
print(regex.sub('sybil', s, 1))
# bottle
# sybil
# big
# apple
regex = re.compile('\s+')
print(regex.subn('\t', s)) # ('bottle\tbag\tbig\tapple', 3)
print(regex.subn('+', s)) # ('bottle+bag+big+apple', 3)
regex = re.compile('a')
print(regex.subn('A', s)) # ('bottle\nbAg\nbig\nApple', 2)
2.5 分割字符串
- 字符串的分割函数 split 太难用,不能指定多个字符进行分割
- Signature: re.split(pattern, string, maxsplit=0, flags=0)
- re.split 可以使用正则表达式指定多个分隔符来分割字符串
s = """
os.apth.abspath(path)
normpath(join(os.getcwd(), path))
"""
print(s.split()) # ['os.apth.abspath(path)', 'normpath(join(os.getcwd(),', 'path))']
print(re.split('[\.()\s,]+', s)) # ['', 'os', 'apth', 'abspath', 'path', 'normpath', 'join', 'os', 'getcwd', 'path', '']
2.6 分组
- 使用小括号的 pattern 捕获的数据被放到了组 group 中
- match / search 函数可以返回 match 对象
- findall 返回字符串列表;如果 pattern 中有多个分组,则返回元组列表
- finditer 返回 一个个 match 对象
- 如果 pattern 中使用了分组,如果有匹配的结果,会在 match 对象中
- 使用 group(N)方式返回对应分组,1 到 N 是对应的分组, 0 返回整个匹配的字符串, N 不写缺省为 0
- 如果使用了命名分组,可以使用 group('name') 的方式获取分组
- 也可以使用 groups()返回所有组
- 使用 groupdict()返回所有命名的分组
s = """bottle\nbag\nbig\napple"""
print(s)
Out:
bottle
bag
big
apple
regex = re.compile('(b\w+)')
result = regex.match(s)
print(1, type(result), result)
print(2, 'match', result.groups())
result = regex.search(s, 1)
print(3, 'search', result.groups())
# name the group
regex = re.compile('(b\w+)\n(?P<name2>b\w+)\n(?P<name3>b\w+)')
result = regex.match(s)
print(4, 'match', result)
print(5, result.group(3), result.group(2), result.group(1))
print(6, result.group(0).encode())
print(7, result.group('name2'))
print(8, result.groups())
print(9, result.groupdict())
Out:
1 <class 're.Match'> <re.Match object; span=(0, 6), match='bottle'>
2 match ('bottle',)
3 search ('bag',)
4 match <re.Match object; span=(0, 14), match='bottle\nbag\nbig'>
5 big bag bottle
6 b'bottle\nbag\nbig'
7 bag
8 ('bottle', 'bag', 'big')
9 {'name2': 'bag', 'name3': 'big'}
result = regex.findall(s)
for x in result:
    print(type(x), x)
Out:
<class 'tuple'> ('bottle', 'bag', 'big')
regex = re.compile('(?P<name_b>b\w+)')
result = regex.finditer(s)
print(result)
for x in result:
    print(type(x), x, x.group(), x.group('name_b'))
Out:
<callable_iterator object at 0x000002A54899E490>
<class 're.Match'> <re.Match object; span=(0, 6), match='bottle'> bottle bottle
<class 're.Match'> <re.Match object; span=(7, 10), match='bag'> bag bag
<class 're.Match'> <re.Match object; span=(11, 14), match='big'> big big