数据清洗和提取策略

原创 · 最新推荐文章于 2024-11-22 09:58:31 发布 · 9.5k 阅读
CC 4.0 BY-SA版权
文章标签：
re模块使用
# Demo of the re module: extract a value from a local HTML file, then check a
# few sample passwords against a validation pattern.
# `re` is imported here because this snippet runs before the `import re`
# statement that appears further down the file.
import re

with open('re模块使用.html', mode='r', encoding='utf-8') as f:
    # Remove newlines: by default `.` does not match '\n', so the non-greedy
    # capture below would otherwise fail on content that spans several lines.
    context = f.read().replace('\n', '')
# The with-statement closes the file on exit; no explicit f.close() is needed.

# Non-greedy capture of whatever sits inside the email <div>.
pattern_1 = r'<div class="email">(.*?)</div>'
ret_1 = re.findall(pattern_1, context)
if ret_1:
    # strip() drops surrounding whitespace from the captured text.
    print(ret_1[0].strip())
else:
    # Avoid an IndexError when the page contains no matching <div>.
    print('no <div class="email"> element found')

# The r prefix makes this a raw string so backslashes are not treated as
# Python escape sequences.
# Rule: one leading ASCII letter followed by 5-15 letters, digits or underscores.
password_pattern = r'^[a-zA-Z]{1}[a-zA-Z0-9_]{5,15}$'
pass1 = '1234567'
pass2 = 'k123456'
pass3 = 'k123'
# Print the test results: a re.Match object on success, None on failure.
# fullmatch is used instead of match because `$` also matches just before a
# trailing newline, which would let 'k123456\n' pass validation.
for candidate in (pass1, pass2, pass3):
    print(re.fullmatch(password_pattern, candidate))


#re模块
import re

# Pull course categories and course names out of a local HTML page with regexes.
with open('static/html/index.html', 'r', encoding='utf-8') as f:
    # Read the file and drop newlines so `.*?` can span what were separate lines.
    html = f.read().replace('\n', '')

section_pattern = '<section class="main_section">(.*?)</section>'
category_pattern = '<h1>(.*?)</h1>'
course_pattern = '<span class="course_name">(.*?)</span>'

# Each captured section is the inner HTML of one main_section element.
section_s = re.findall(section_pattern, html)
print(section_s)
print(len(section_s))

data_s = []
for section in section_s:
    print(section)
    # The first <h1> in the section is used as its category; every
    # course_name span in the same section is collected as a course.
    entry = dict(
        category=re.findall(category_pattern, section)[0],
        course_s=re.findall(course_pattern, section),
    )
    data_s.append(entry)
print(data_s)

# Print each category followed by its indented courses.
for entry in data_s:
    print(entry['category'])
    for course in entry['course_s']:
        print('    ', course)







# Per the original author's note: this HTML content is hidden and revealed by
# JavaScript and could not be extracted with XPath, so a regex is used instead.
with open('meiju1.html', 'r', encoding='utf-8') as f:
    # Drop newlines so `.*?` can span what were separate lines.
    html = f.read().replace('\n', '')

# Capture the text of the first <a> that follows each thread-title <div>.
title_pattern = '<div class="threadlist_title pull_left j_th_tit ">.*?<a.*?>(.*?)</a>'
title_s = re.findall(title_pattern, html)
for thread_title in title_s:
    print(thread_title)


#json对象和pyThon对象转换
import json

# Sample data exercising the Python values that map to JSON true/false/null.
python_data = [
    {
        'username': 'name1',
        'vip': True,
    },
    {
        'username': None,
        'vip': False,
    },
]

# dumps serializes a Python object into a JSON-formatted str.
json_data = json.dumps(python_data)
print(json_data)
print(type(json_data))

# loads parses a JSON-formatted str back into Python objects.
python_data2 = json.loads(json_data)
print(python_data2)
print(type(python_data2))


# dump serializes a Python object as JSON text and writes it to a file.
# The with-block guarantees the file is flushed and closed before it is
# read back below, instead of relying on garbage collection to do so.
with open('json.txt', 'w', encoding='utf-8') as json_file:
    json.dump(python_data, json_file)

# load reads JSON text from a file and converts it into Python objects.
with open('json.txt', 'r', encoding='utf-8') as json_file:
    python_data2 = json.load(json_file)
print(python_data2)


#xPath
import lxml.etree as le

# Extract first-level categories and their second-level entries from edu.html
# using lxml XPath queries.
with open('edu.html', 'r', encoding='utf-8') as f:
    html = f.read()

# Parse the markup into an element tree that supports .xpath().
html_x = le.HTML(html)

# One classify_cList <div> per first-level category.
div_x_s = html_x.xpath('//div[@class="classify_cList"]')
data_s = [
    {
        # Text of the first ./h3/a link is used as the first-level category.
        'category1': div_x.xpath('./h3/a/text()')[0],
        # Texts of all ./div/span/a links are the second-level categories.
        'category2_s': div_x.xpath('./div/span/a/text()'),
    }
    for div_x in div_x_s
]
print(data_s)

# Print each first-level category followed by its indented sub-categories.
for entry in data_s:
    print(entry['category1'])
    for category2 in entry['category2_s']:
        print('   ', category2)