Preface
Four days to pick up Python (course link on 慕课网), three days to pick up basic crawling (course link on 慕课网), and eight days to grind out 180 lines of code, stumbling from bug to bug the whole way... but in the end my girlfriend's request got handled fairly quickly (the scraped data feeds a short paper she is writing).
Here is the code up front; I'll annotate it in detail when I find the time...:
# -*- coding: utf-8 -*-
# Scrape the name, price, class, driving range, and battery capacity of every
# "pure electric" car on 爱卡汽车 (newcar.xcar.com.cn), and write the results out as HTML
from bs4 import BeautifulSoup
import re
from urllib import request

# URLs of the 8 top-level listing pages for electric cars
first_urls = []
for i in range(1, 9):
    first_urls.append('http://newcar.xcar.com.cn/car/0-0-0-0-0-0-0-0-0-0-0-' + str(i) + '-1-0/')
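# Hedged aside (my addition, not part of the original script): fetching dozens
# of pages in a tight loop risks getting throttled. Sending a Request with a
# browser-like User-Agent and pausing briefly between fetches is one common
# courtesy; the helper name polite_open and the header value are illustrative.
import time

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def polite_open(page_url, delay=1.0):
    time.sleep(delay)  # space the requests out
    return request.urlopen(request.Request(page_url, headers=HEADERS))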
# Detail-page URL of every car model listed on those 8 pages
nodes_num = []
for url in first_urls:
    response = request.urlopen(url)
    html_cont = response.read()
    soup = BeautifulSoup(html_cont.decode('gb2312'), 'html.parser')
    # link_nodes is a list of the matching <a> tags
    link_nodes = soup.find_all('a', class_="car_search_ps_list_a")
    for i in link_nodes:
        nodes_num.append('http://newcar.xcar.com.cn' + str(i['href']))
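# Hedged aside (my addition): the site serves GB-family encodings, and
# decode('gb2312') or decode('gbk') can raise UnicodeDecodeError on stray
# characters. Decoding with 'gb18030' (a strict superset of both gb2312 and
# gbk) plus errors='replace' is one defensive alternative; fetch_soup is my
# own illustrative helper, not part of the original script.
def fetch_soup(page_url):
    raw = request.urlopen(page_url).read()
    return BeautifulSoup(raw.decode('gb18030', errors='replace'), 'html.parser')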
# Build datas ------------------------------------------------------------
datas = []
nodes_rev = []  # useful URLs, rewritten into review-page ("口碑") URLs and collected here
count = 0
for url_num in nodes_num:
    count += 1
    response_num = request.urlopen(url_num)
    html_cont_num = response_num.read()
    soup_num = BeautifulSoup(html_cont_num.decode('gbk'), 'html.parser')
    url_rev = url_num + 'review.htm'
    response_rev = request.urlopen(url_rev)
    html_cont_rev = response_rev.read()
    soup_rev = BeautifulSoup(html_cont_rev.decode('gbk'), 'html.parser')
    # one dict per car; each res_data holds 13 keys (车名/name, 价格/price, ...)
    res_data = {}
    try:
        link_node01 = soup_num.find("span", class_="lt_f1").get_text()
        link_node02 = soup_num.find('div', class_="tt_h1").find("h1").get_text()
        res_data['车名'] = link_node01 + link_node02  # car name
        link_node03 = soup_num.find('a', class_="com_price_menu").get_text()
        # print("价格:", link_node03, "万")  # price, in units of 10,000 RMB
        res_data['价格'] = link_node03  # price
        link_node04 = soup_num.find_all('li',