工具包:
Python 的 requests、lxml(提供 XPath 查询)、pandas 包
IDE:Anaconda
代码:
import requests
from lxml import etree
import pandas as pd
# Write the scraped data to a file.
def write_to_csv(*dd, path='test.csv'):
    """Append the given values as a single row to a CSV file.

    Args:
        *dd: Field values of the row, in column order.  Any number of
            fields is accepted; the row is as wide as the arguments.
        path: Destination CSV file.  Defaults to ``test.csv`` in the
            current working directory.

    The file is opened in append mode and neither a header row nor an
    index column is written.  Calling with no values writes nothing.
    """
    if not dd:
        # Nothing to record; avoid appending a blank line.
        return
    # Build the one-row frame straight from the arguments so the column
    # count always matches the number of fields supplied.
    row = pd.DataFrame([list(dd)])
    # mode='a' appends to the end of the file; index/header are disabled
    # so only the raw values are stored.
    row.to_csv(path, encoding='utf-8', index=False, header=False, mode='a')
def _first_text(node, path, *, deep=False):
    """Return the text of the first element matching ``path`` under ``node``.

    With ``deep=True`` the text of the whole subtree is returned (XPath
    ``string(.)``); otherwise only the element's own ``.text`` is used.
    An empty string is returned when nothing matches or the element has
    no text.
    """
    found = node.xpath(path)
    if not found:
        return ''
    if deep:
        return found[0].xpath('string(.)')
    return found[0].text or ''


def justgo():
    """Scrape the ticket listing page and append one CSV row per item.

    Fetches the page at ``uri``, selects every ``div`` whose class
    contains ``item`` and, for each one that has all four child-ticket
    spans, passes seven fields to ``write_to_csv``: name, popularity,
    type, child ticket, market price, Xinxin price and pickup method.
    Items missing any of the four child-ticket spans are skipped.

    Raises:
        requests.RequestException: if the request fails, times out or
            returns an HTTP error status.
    """
    uri = 'https://dongguan.cncn.com/dingpiao/'
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors instead of
    # silently parsing an error page.
    res = requests.get(uri, timeout=30)
    res.raise_for_status()
    # NOTE(review): res.text relies on the encoding requests detects for
    # this page -- confirm it decodes the Chinese text correctly.
    html = etree.HTML(res.text)
    if html is None:
        # Empty/unparseable body: there is nothing to extract.
        return

    # Second <li> of the ticket list; the original comments label it as
    # the child ticket row.
    row_path = './div[@class="rTxt"]/div[@class="rooms"]/ul/li[2]'

    for item in html.xpath('//div[contains(@class,"item")]'):
        # Popularity (hotel_hot).
        hotel_hot = _first_text(
            item, './div[@class="lPic"]/div[@class="hotel_hot"]', deep=True)
        # Item name.
        tour_name = _first_text(
            item, './div[@class="rTxt"]/div/h3/a', deep=True)
        # Item type.
        tour_type = _first_text(item, './div[@class="rTxt"]/div/h3/span')

        # Child ticket.
        child_tickt = item.xpath(row_path + '/span[@class="w1"]')
        # Market price.
        child_sale_tickt = item.xpath(row_path + '/span[2]')
        # Xinxin price.
        child_sale_xin = item.xpath(row_path + '/span[3]')
        # Ticket pickup method.
        child_get_ways = item.xpath(row_path + '/span[4]')

        # Logical `and`, not bitwise `&`: with `&`, match counts such as
        # 1 and 2 combine to 0 and a valid row would be dropped.
        if not (child_tickt and child_sale_tickt
                and child_sale_xin and child_get_ways):
            continue

        write_to_csv(
            tour_name,
            hotel_hot,
            tour_type,
            child_tickt[0].text,
            child_sale_tickt[0].text,
            child_sale_xin[0].text,
            child_get_ways[0].text,
        )
# Run the scraper; this call sits at module level, so it executes
# whenever the file is run or imported.
justgo()