#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
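@des: Fixing garbled (mis-decoded) text caused by encoding problems in Python web scraping.
By default, requests tries to work out the encoding from the response headers and the response body; when that fails it falls back to the default ISO-8859-1, which produces garbled text for pages encoded in GBK and similar encodings. The get_encoding function below obtains the encoding on the same principle, with a noticeably higher success rate.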
"""
import chardet
import requests
import re
# Matches the charset parameter of a Content-Type value, e.g.
# 'text/html; charset="GBK"; boundary=x' -> 'GBK'. Optional quotes are
# skipped and the value ends at the first quote, semicolon, comma or
# whitespace, so trailing parameters are never captured.
_CHARSET_PATTERN = re.compile(r'charset\s*=\s*["\']?([^\s"\';,]+)', re.I)


def get_encoding(response_obj: object) -> str:
    """Work out the text encoding of an HTTP response.

    The charset declared in the ``Content-Type`` response header is used
    when present; otherwise the encoding is guessed from the raw body
    with ``chardet``.

    :param response_obj: response object exposing ``headers`` (a mapping)
        and ``content`` (the raw body bytes), e.g. a ``requests.Response``
    :return: the encoding name; when falling back to detection this is
        whatever ``chardet.detect`` reports under ``"encoding"``
    """
    # A missing Content-Type header must not raise; treat it as
    # "no charset declared" so the body-sniffing fallback still runs.
    content_type = response_obj.headers.get("Content-Type") or ""
    match = _CHARSET_PATTERN.search(content_type)
    if match is not None:
        return match.group(1)
    # No charset in the header: detect it from the body bytes instead.
    return chardet.detect(response_obj.content)["encoding"]
# Demo: fetch a page, apply the encoding reported by get_encoding, and
# print the decoded text.
url = "http://news.inewsweek.cn/society/2022-05-30/15753.shtml"
# requests does not time out unless told to; bound the wait so a stalled
# server cannot hang the script indefinitely.
res = requests.get(url, timeout=10)
# Set the encoding before reading .text, which decodes the body with it.
res.encoding = get_encoding(res)
print(res.text)
# Python requests: handling garbled response data
# Latest recommended article published on 2024-07-12 16:58:11