In [1]: test ='hello my name is insane'
In [2]:import re
In [3]: result = re.search('hello (.*) name is (.*)', test)
In [4]: result.groups()
Out[4]:('my','insane')
In [5]: result.group(1)
Out[5]:'my'
In [6]: result.group(2)
Out[6]:'insane'
In [10]: result = re.match('hello', data)
In [11]: result.groups()
Out[11]:()
In [12]: result.group()
Out[12]:'hello'
re模块-compile
compile(pattern, flags=0)
定义一个匹配规则的对象
In [13]: data='hello my email is insane@loaf.com i like python'
In [14]: re_obj = re.compile('email is (.*?) i')
In [15]: result = re_obj.findall(data)
In [16]: result
Out[16]:['insane@loaf.com']
re的额外匹配要求
实战
#!/usr/bin/python3# -*- coding: utf-8 -*-# @Time : 2021/8/28 22:13# @Author : InsaneLoafer# @File : re_test2.pyimport re
defcheck_url(url):"""
判断url是否合法
:param url:
:return:
"""
re_g = re.compile('[a-zA-Z]{4,5}://\w*\.*\w+\.\w+')print(re_g)
result = re_g.findall(url)iflen(result)!=0:returnTrueelse:returnFalsedefget_url(url):"""
通过组获取url中的某一部分
:param url:
:return:
"""
re_g = re.compile('[https://|http://](\w*\.*\w+\.\w+)')
result = re_g.findall(url)iflen(result)!=0:return result[0]else:return''defget_email(data):# result = re.findall('[0-9a-zA-Z_]+@[0-9a-zA-Z]+\.[a-zA-Z]+', data)
re_g = re.compile('.+@.+\.[a-zA-Z]+')
result = re_g.findall(data)return result
html =('<div class="s-top-nav" style="display:none;">''</div><div class="s-center-box"></div>')defget_html_data(data):"""
获取style中的display:
使用非贪婪模式
"""
re_g = re.compile('style="(.*?)"')
result = re_g.findall(data)return result
defget_all_data_html(data):"""
获取html中所有等号后双引号内的字符
:param data:
:return:
"""
re_g = re.compile('="(.+?)"')
result = re_g.findall(data)return result
if __name__ =='__main__':
result = check_url('https://www.baidu.com')print(result)
result = get_url('https://www.baidu.com')print(result,'https')
result = get_url('http://www.baidu.com')print(result,'http')
result = get_email('insane@163.net')print(result)
result = get_html_data(html)print(result)
result = get_all_data_html(html)print(result)
re_g = re.compile('\s')
result = re_g.split(html)print(result)
re_g = re.compile('<div class="(.*?)"')
result = re_g.match(html)print(result)print(result.span())# 返回结果的前后索引print(html[:22])