Python Crawler in Practice: Scraping and Analyzing Car Page Information

This article shows how to crawl data from car-related web pages with Python and then analyze it, covering the crawler implementation steps and data-analysis techniques.

Code:
import requests
from bs4 import BeautifulSoup
import re
import random
import time

# Main crawler function
def mm(url):
    # Set the target URL and issue the request with requests
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
    req0 = requests.get(url=url, headers=header)
    req0.encoding = "gb18030"  # fix garbled characters (the site uses a GBK-family encoding)
    html0 = req0.text
    # Parse the HTML into a BeautifulSoup instance, stored as soup0
    soup0 = BeautifulSoup(html0, "html.parser")
    # Get the last-page number: the second-to-last <a> in the pager holds it
    # (compare with the previous subsection on fetching the last page)
    total_page = int(soup0.find("div", class_="pagers").findAll("a")[-2].get_text())
    myfile = open("aika_qc_gn_1_1_1.txt", "a", encoding="gb18030", errors="ignore")  # same encoding fix for the output file
    # Column headers: user, source, helpful-vote count, type, comment
    print("user", " 来源", " 认为有用人数", " 类型", " comment")
    NAME = "user" + " 来源" + " 认为有用人数" + " 类型" + " comment"
    myfile.write(NAME + "\n")
    for i in range(1, total_page + 1):
        # Pick a random pause length between requests
        stop = random.uniform(1, 3)
        url = "http://newcar.xcar.com.cn/257/review/0/0_" + str(i) + ".htm"
        req = requests.get(url=url, headers=header)
        req.encoding = "gb18030"  # fix garbled characters
        html = req.text
        soup = BeautifulSoup(html, "html.parser")
        contents = soup.find("div", class_="review_comments").findAll("dl")
        l = len(contents)  # number of review blocks on this page
        for content in contents:
            # The per-field parsing of each <dl> is cut off in the source;
            # as a minimal stand-in, write the whitespace-collapsed text of
            # the whole review block as one line.
            myfile.write(" ".join(content.get_text().split()) + "\n")
        time.sleep(stop)  # pause before fetching the next page
    myfile.close()
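To run the crawler, call mm() on the first review page. The entry URL below is an assumption inferred from the URL pattern built inside the loop:

if __name__ == "__main__":
    # entry page assumed from the 0_<page>.htm pattern used in mm()
    mm("http://newcar.xcar.com.cn/257/review/0/0_1.htm")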
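Once the crawl finishes, the saved file can be pulled into pandas for the analysis step. A minimal sketch, assuming the one-review-per-line layout written by mm() above (pandas is an extra dependency, not used in the original code):

import pandas as pd

# Load the crawled file: GB18030-encoded, header line first,
# then one review per line.
with open("aika_qc_gn_1_1_1.txt", encoding="gb18030") as f:
    lines = [line.strip() for line in f if line.strip()]

df = pd.DataFrame({"raw": lines[1:]})  # skip the header line
print("reviews crawled:", len(df))
print(df.head())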