Fixing garbled HTML and garbled extracted text while scraping

This post explains how to fix the garbled HTML and garbled extracted text that can show up while scraping: re-encoding response.text with iso-8859-1 recovers the page's original bytes so the content parses correctly. The whole flow is demonstrated with Python's requests and lxml libraries.


The original request:
response1 = requests.get(url=detail_url, headers=headers)
responseText1 = response1.text
The HTML obtained this way contains garbled characters, so the text XPath extracts from it is garbled too.

The fix:

responseText1 = response1.text.encode('iso-8859-1')

utf-8 does not work here; it has to be iso-8859-1. The reason: when the server's HTTP headers declare no charset, requests falls back to decoding the body as ISO-8859-1. Because ISO-8859-1 maps every byte value to a character, re-encoding the mis-decoded text with the same codec restores the original bytes exactly, and etree.HTML can then detect the page's real encoding (GBK, per the <meta> tag on these 51job pages) from those bytes. Re-encoding with utf-8 would produce different bytes and leave the text garbled.
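An equivalent and arguably cleaner route is to skip the string round-trip altogether: either hand lxml the raw bytes so it reads the charset from the page's own <meta> tag, or tell requests the right encoding before touching .text. A minimal sketch under the same setup (detail_url and headers are the same names used in the full script below):

# Option 1: parse the raw bytes; lxml picks up the encoding from the <meta> tag
response1 = requests.get(url=detail_url, headers=headers)
html_str1 = etree.HTML(response1.content)

# Option 2: re-decode with the encoding that requests' charset detection reports
response1.encoding = response1.apparent_encoding
html_str1 = etree.HTML(response1.text)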

# coding=utf-8
import requests
from lxml import etree
import pandas as pd
import time
import csv

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}

#  Write the CSV header row
header = ['company', 'position', 'salary', 'address', 'experience', 'education', 'number_people', 'date', 'welfare', 'position_type']
with open('./beijing.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)


page = 1
while True:

    # List-page URL for the current page
    print('Scraping page {}'.format(page))
    list_url = 'https://search.51job.com/list/010000,000000,7500,38,9,99,%2B,2,{}.html'.format(page)
    response = requests.get(url=list_url, headers=headers)
    responseText = response.text

    # print(responseText)

    html_str = etree.HTML(responseText)
    #  Collect the detail-page URLs from the list page
    detailUrl_list = html_str.xpath("//p[@class='t1 ']/span/a/@href")
    print(detailUrl_list)

    #  Stop once a page comes back empty, otherwise the loop never terminates
    if not detailUrl_list:
        break

    #  Request each detail-page URL and parse it with XPath
    for detail_url in detailUrl_list:
        response1 = requests.get(url=detail_url, headers=headers)
        responseText1 = response1.text.encode('iso-8859-1')
        html_str1 = etree.HTML(responseText1)

        #  Parse the fields

        #  Job title
        position_list = html_str1.xpath("//div[@class='cn']/h1/@title")
        position = position_list[0] if position_list else None
        print(position)

        #  Company
        company_list = html_str1.xpath("//p[@class='cname']/a/@title")
        #  Guard against missing data
        company = company_list[0] if company_list else None
        print(company)

        #  Salary
        salary_list = html_str1.xpath("//div[@class='cn']/strong/text()")
        salary = salary_list[0] if salary_list else None
        print(salary)

        #  Basic info: location / experience / education / headcount / posting date
        try:
            other_list = html_str1.xpath("//p[@class='msg ltype']//text()")
            print(other_list)
            #  Strip the '|' separators and split into the individual fields
            Other = ''.join(other_list).replace('|', '').split()
            print(Other)

            address, experience, education, number_people, date = Other[:5]
            print(address, experience, education, number_people, date)
        except ValueError:
            #  The page exposed fewer than five fields
            address, experience, education, number_people, date = None, None, None, None, None

        #  Benefits (xpath returns an empty list when the block is absent, so no try/except is needed)
        welfare_list = html_str1.xpath("//div[@class='t1']/span/text()")
        welfare = ','.join(welfare_list) if welfare_list else 'benefits not disclosed'
        print(welfare)

        #  Job category
        position_type_list = html_str1.xpath("//p[@class='fp']/a/text()")
        position_type = ','.join(position_type_list) if position_type_list else 'no information'
        print(position_type)

        #  Append the row to the CSV (the header row was already written above)
        data_tuple = (company, position, salary, address, experience, education, number_people, date, welfare, position_type)

        df = pd.DataFrame([data_tuple], columns=header)
        df.to_csv('beijing.csv', mode='a', header=False, index=False)

        #  Throttle the request rate; adjust as needed (sleep time is in seconds)
        time.sleep(1)

    #  Move on to the next page
    page += 1
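To confirm the diagnosis on your own run, compare the encoding requests guessed from the HTTP headers with the one its charset detection reports for the body bytes; the mismatch between the two is what produces the garbled text. A quick check against the same list URL as above:

# coding=utf-8
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
r = requests.get('https://search.51job.com/list/010000,000000,7500,38,9,99,%2B,2,1.html', headers=headers)

print(r.encoding)           # guessed from the HTTP headers -- typically ISO-8859-1 here
print(r.apparent_encoding)  # detected from the body bytes -- GBK for these pages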


