# Thailand travel information scraper
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json
class BangkokTravel:
    """Scrape Thailand travel-product listings from mafengwo.cn.

    ``getPage`` downloads one listing page, extracts the fields of every
    product card and hands each card to ``write_Content``, which appends it
    to ``泰国.json`` as one JSON object per line.
    """

    def __init__(self):
        """Create a scraper; no instance state is kept."""
        pass

    @staticmethod
    def _first_or_default(values, default=""):
        """Return the first element of *values*, or *default* if it is empty.

        XPath queries return a list; indexing ``[0]`` directly raises
        ``IndexError`` whenever a card is missing the queried element.
        """
        return values[0] if values else default

    def getPage(self):
        """Fetch the listing page, parse every product card and save it.

        For each ``<a class="item clearfix">`` node the detail link, title,
        price, image link, sold count and departure schedule are extracted,
        printed, and passed to ``write_Content``. A field that is absent
        from a card is recorded as an empty string instead of aborting the
        whole run.
        """
        url = "https://www.mafengwo.cn/sales/0--M11045P泰国-0-0-0-0-0.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        }
        # Without a timeout requests may block indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = "utf-8"
        tree = etree.HTML(response.text)

        first = self._first_or_default
        # One node per product card on the listing page.
        for node in tree.xpath('//a[@class="item clearfix"]'):
            # Link to the product's detail page.
            detail_url = first(node.xpath('./@href'))
            print(detail_url)
            # Title, with embedded spaces removed.
            tittle = first(node.xpath('.//h3/text()')).replace(" ", "")
            print(tittle)
            # Price.
            price = first(node.xpath('.//span[@class="price"]/strong/text()'))
            print(price)
            # Image link.
            image_url = first(node.xpath('.//img/@src'))
            print(image_url)
            # Sold count text.
            yet_sale = first(node.xpath('.//div[@class="info"]/p[1]/text()'))
            print(yet_sale)
            # Departure schedule; named so it does not shadow the ``time`` module.
            departure = first(node.xpath('.//span[@class="t"]/text()'))
            print(departure)
            self.write_Content(
                detail_url, tittle, price, image_url, yet_sale, departure
            )

    def write_Content(self, detail_url, tittle, price, image_url, yet_sale, time):
        """Append one scraped record to ``泰国.json`` as a JSON line.

        Args:
            detail_url: Link to the product's detail page.
            tittle: Product title.
            price: Product price text.
            image_url: Link to the product image.
            yet_sale: Sold-count text.
            time: Departure schedule text.
        """
        items = {
            "详情连接": detail_url,
            "标题": tittle,
            "价格": price,
            "图片链接": image_url,
            "已经售出": yet_sale,
            "启程日期": time,
        }
        # The ``with`` block closes the file; no explicit close() is needed.
        with open("泰国.json", "a", encoding="utf8") as f:
            f.write(json.dumps(items, ensure_ascii=False) + "\n")
if __name__ == "__main__":
    # Script entry point: build a scraper and run a single scrape of the
    # listing page.
    BangkokTravel().getPage()