汽车之家的数据爬取还是比较简单的,遇到的坑如下:
页面编码格式:汽车之家的页面编码格式有三种,分别是 "GB2312"、"ISO-8859-1" 和 "UTF-8-SIG"。每次使用 requests 模块获取页面的 html 时,会随机出现其中的一种。其中页面编码格式为 "GB2312" 或 "ISO-8859-1" 时,可以正常显示数据;而当编码格式为 "UTF-8-SIG" 时,会获取不到一些关键数据。
解决办法:使用第三方库 chardet 的 detect() 方法,返回页面的编码格式,再据此选择正确的解码方式。
其余基本上没什么难度,简单的代码实现如下:
"""
1.获取汽车的id;
2.拼接api地址;
3.抓取并保存数据.
"""
import hashlib
import random
import chardet
import pymysql
import redis
import requests
from lxml import etree
import time
class CarSpider:
    """Scrape used-car data from che168.com and store it in MySQL.

    Workflow (driven by main()): fetch a listing page, pull every car's
    ``specid`` out of the HTML, call the parameter API for that specid,
    deduplicate records through a Redis set, and insert new records into
    the ``car_spider_table`` MySQL table.
    """

    # Parameter API; format with a car's specid.
    api_url = "https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}"
    # Listing page; format with a 1-based page number.
    start_url = "https://www.che168.com/weifang/a0_0msdgscncgpi1ltocsp{}exx0/?pvareaid=102179#currengpostion"
    # NOTE(review): fill in a real User-Agent / Referer / Cookie before
    # running, otherwise the site is likely to serve the anti-crawl page.
    headers = {
        'User-Agent': "",
        'Referer': "",
        'Cookie': ""
    }

    # Index of each field inside result.paramtypeitems[0].paramitems.
    _FIELD_INDEX = {
        'car_name': 0,
        'car_price': 1,
        'car_type': 4,
        'car_standard': 5,
        'car_power': 7,
        'car_motor': 9,
        'car_gearbox': 10,
        'car_structure': 12,
        'car_speed': 13,
    }

    def __init__(self):
        # MySQL holds the scraped records; the Redis set is the dedupe filter.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='123456', db='spider')
        self.cursor = self.conn.cursor()
        self.r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    def __del__(self):
        # __init__ may have failed before these attributes existed, and
        # __del__ must never raise — guard every close.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            try:
                cursor.close()
            except Exception:
                pass
        conn = getattr(self, 'conn', None)
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass

    def create_table(self):
        """Create the destination table if it does not exist yet."""
        sql = """
            create table if not exists car_spider_table(
            id int primary key auto_increment,
            car_name varchar(100) not null,
            car_price varchar(100),
            car_type varchar(100),
            car_standard varchar(100),
            car_power varchar(50),
            car_motor varchar(100),
            car_gearbox varchar(100),
            car_structure varchar(100),
            car_speed varchar(100)
            )
        """
        try:
            self.cursor.execute(sql)
            print('----创建表成功----')
        except Exception as e:
            print('----创建表失败----!', e)

    def get_one_html(self, url):
        """Fetch one listing page, decode it correctly, and parse it.

        che168 randomly serves GB2312 / ISO-8859-1 / UTF-8-SIG; the first
        two are really GBK content, while UTF-8-SIG marks the anti-crawl
        page that is missing the key data.
        """
        html = requests.get(url=url, headers=self.headers,
                            allow_redirects=False, timeout=10)
        encoding = chardet.detect(html.content)['encoding']
        if encoding in ('GB2312', 'ISO-8859-1'):
            response = html.content.decode('gbk')
        else:
            # chardet may return None — fall back to utf-8 then.
            response = html.content.decode(encoding or 'utf-8')
            print('----遭遇反爬----')
        self.parse_one_html(response)

    def parse_one_html(self, response):
        """Extract every car's specid from the listing HTML and follow it."""
        tree = etree.HTML(response)
        c_list = tree.xpath("//div[@id='goodStartSolrQuotePriceCore0']/ul/li")
        for item in c_list:
            specid = item.xpath("./@specid")
            if specid:
                url = self.api_url.format(specid[0])
                self.get_two_html(url)

    def get_two_html(self, url):
        """Fetch one car's parameter API response (JSON) and parse it."""
        response = requests.get(url=url, headers=self.headers,
                                timeout=10).json()
        self.parse_two_html(response)

    def parse_two_html(self, response):
        """Pick the interesting fields out of the API JSON and save them.

        A field whose index is missing from ``paramitems`` becomes the
        placeholder '暂无数据' instead of aborting the whole record (the
        original ternary fallback was unreachable because it re-tested
        the already-truthy ``car_info_list``).
        """
        try:
            car_info_list = response['result'].get('paramtypeitems')
            if not car_info_list:
                return
            params = car_info_list[0]['paramitems']

            def value_at(i):
                try:
                    return params[i]['value']
                except (IndexError, KeyError, TypeError):
                    return '暂无数据'

            item = {field: value_at(i)
                    for field, i in self._FIELD_INDEX.items()}
            self.check_data(item)
        except Exception as e:
            print('----数据不存在----', e)

    def check_data(self, item):
        """Save *item* only if its MD5 fingerprint is new to the Redis set.

        sadd returns 1 when the member was actually added (i.e. unseen),
        0 when it already existed.
        """
        md5 = hashlib.md5()
        md5.update(str(item).encode('utf-8'))
        value = md5.hexdigest()
        redis_result = self.r.sadd('car_spider_set:filter', value)
        if redis_result:
            self.save_data(item)
        else:
            print('----数据已存在----')

    def save_data(self, item):
        """Insert one car record into MySQL (parameterized, best-effort)."""
        sql = """
            insert into car_spider_table(
            car_name,
            car_price,
            car_type,
            car_standard,
            car_power,
            car_motor,
            car_gearbox,
            car_structure,
            car_speed
            )values(%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        values = (item['car_name'], item['car_price'], item['car_type'],
                  item['car_standard'], item['car_power'], item['car_motor'],
                  item['car_gearbox'], item['car_structure'], item['car_speed'])
        try:
            self.cursor.execute(sql, values)
            self.conn.commit()
            print('----插入数据成功----')
        except Exception as e:
            # Roll back so a failed INSERT does not leave the
            # transaction open for the next record.
            self.conn.rollback()
            print('----插入数据失败----', e)

    def main(self):
        """Crawl listing pages 20..30 with a polite random delay."""
        for page in range(20, 31):
            url = self.start_url.format(page)
            self.get_one_html(url)
            time.sleep(random.randint(2, 3))
            print('----第{}页数据爬取完毕----'.format(page))
if __name__ == '__main__':
    # Script entry point: create the table, then crawl.
    # Fixed: the original had stray pasted text ("在这里插入代码片")
    # fused onto the last call, which is a syntax error.
    car_spider = CarSpider()
    car_spider.create_table()
    car_spider.main()
306

被折叠的 条评论
为什么被折叠?



