re模块使用
with open('re模块使用.html',mode='r',encoding='utf-8') as f:
# 读取内容,并去除换行以空串代替
context = re.sub('\n','',f.read())
#定义正则
pattern_1 = '<div class="email">(.*?)</div>'
ret_1 = re.findall(pattern_1,context)
print(ret_1[0].strip())#去除空格strip
#正则表达式前面添加一个小写的r表示防止转义
password_pattern = r'^[a-zA-Z]{1}[a-zA-Z0-9_]{5,15}$'
pass1 = '1234567'
pass2 = 'k123456'
pass3 = 'k123'
# 打印测试结果,匹配成功返回re.Match对象,不成功返回None
print(re.match(password_pattern, pass1))
print(re.match(password_pattern, pass2))
print(re.match(password_pattern, pass3))
f.close()
#re模块
import re
with open('static/html/index.html','r',encoding='utf-8') as f:
html = re.sub('\n','',f.read())#获取文件,去掉换行符
section_pattern = '<section class="main_section">(.*?)</section>'
section_s = re.findall(section_pattern,html)
print(section_s)
print(len(section_s))
category_pattern = '<h1>(.*?)</h1>' #
course_pattern = '<span class="course_name">(.*?)</span>'
data_s = []
for section in section_s:
print(section)
category = re.findall(category_pattern,section)[0]
course_s = re.findall(course_pattern,section)
data_s.append(
{
'category':category,
'course_s':course_s
}
)
print(data_s)
for data in data_s:
print(data.get('category'))
for course in data.get('course_s'):
print(' ',course)
#对于使用js隐藏并显示的html内容 使用xpath不能完成需要使用正则表达式
with open('meiju1.html','r',encoding='utf-8') as f:
html = re.sub('\n','',f.read())
title_pattern = '<div class="threadlist_title pull_left j_th_tit ">.*?<a.*?>(.*?)</a>'
title_s = re.findall(title_pattern,html)
for title in title_s:
print(title)
#json对象和pyThon对象转换
import json
python_data = [
{
'username':'name1',
'vip':True,
},
{
'username':None,
'vip':False,
},
]
# dumps 用于把Python对象转换成Json对象
json_data = json.dumps(python_data)
print(json_data)
print(type(json_data))
# loads 用于把Json对象转换成Python对象
python_data2 = json.loads(json_data)
print(python_data2)
print(type(python_data2))
# dump 把Python类型的数据转化成Json类型的字符串,然后保存到本地
json.dump(python_data,open('json.txt','w'))
# load 用于读取本地的json数据,并转换成Python对象
python_data2 = json.load(open('json.txt'))
print(python_data2)
#xPath
import lxml.etree as le
with open('edu.html','r',encoding='utf-8') as f:
html = f.read()
# print(html)
html_x = le.HTML(html)
# print(html_x)
div_x_s = html_x.xpath('//div[@class="classify_cList"]')
data_s = []
for div_x in div_x_s:
category1 = div_x.xpath('./h3/a/text()')[0]
category2_s = div_x.xpath('./div/span/a/text()')
data_s.append(
dict(
category1 = category1,
category2_s = category2_s
)
)
print(data_s)
for data in data_s:
print(data.get('category1'))
for category2 in data.get('category2_s'):
print(' ',category2)