爬取携程上成都双流机场出发、到达信息

最新推荐文章于 2023-09-25 09:43:02 发布

原创最新推荐文章于 2023-09-25 09:43:02 发布 · 1.1k 阅读

7 ·

CC 4.0 BY-SA版权

爬虫专栏收录该内容

1 篇文章

订阅专栏

导入模块：

re：Regular Expression（正则表达式），用于从字符串中提取满足要求的信息

requests：发送网络请求，返回响应数据（连接到对方网站）

traceback：打印详细的异常堆栈信息，便于定位出错位置

BeautifulSoup:Python解析html或xml的库

code:

import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or an empty string if the request
        failed (any ``requests.RequestException``, which includes the
        HTTP error raised for a 4xx/5xx status).
    """
    try:
        # timeout is expressed in seconds.
        response = requests.get(url, timeout=30)
        # Turn 4xx/5xx status codes into an exception.
        response.raise_for_status()
        # Decode using the encoding guessed from the body instead of the
        # one taken from the response headers.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "page unavailable".
        return ""

def getStockList(lst, stockURL):
    """Collect flight numbers from the listing page into ``lst``.

    Downloads ``stockURL``, looks at every ``<a>`` tag and appends the
    value of its ``title`` attribute to ``lst`` when that value consists
    solely of ASCII letters and digits (the shape of a flight number).

    Args:
        lst: List that is extended in place with the matches found.
        stockURL: URL of the listing page to scan.

    Returns:
        None. Results are delivered through ``lst``.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        title = anchor.get("title")
        # Anchors without a (string) title cannot carry a flight number.
        if not isinstance(title, str):
            continue
        # The pattern is anchored at both ends, so a match from the start
        # of the string is the only possible match.
        match = re.match(r"^[A-Za-z0-9]+$", title)
        if match:
            lst.append(match.group(0))

def getStockInfo(lst, stockURL, fpath):
    """Download the detail page of every flight and append a record to a file.

    For each flight number in ``lst`` the page ``stockURL + number + ".html"``
    is fetched and parsed. The record is a list holding the flight number
    label, the text of every ``time`` element and a three-character slice
    of the second ``strong`` element (the boarding gate, per the original
    author's notes). Each record is printed and appended to ``fpath`` as
    one line.

    Args:
        lst: Iterable of flight-number strings.
        stockURL: URL prefix of the per-flight detail page.
        fpath: Path of the UTF-8 text file that records are appended to.

    Returns:
        None. Flights whose page cannot be fetched are skipped silently;
        flights whose page cannot be parsed or written are skipped after
        the traceback is printed.
    """
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        if html == "":
            # Nothing was downloaded for this flight.
            continue
        try:
            record = []
            soup = BeautifulSoup(html, 'html.parser')
            detail = soup.find('div', attrs={'class': 'detail-info'})

            # The second "ml5" element holds the flight number text.
            name = detail.find_all(attrs={'class': 'ml5'})[1]
            record.append('航班号' + ":" + name.text.split()[0])

            times = detail.find_all(attrs={'class': 'time'})
            gate = detail.find_all(attrs={'class': 'strong'})[1]

            # One entry per time element (departure and arrival, according
            # to the original author's notes).
            record.extend(node.text for node in times)

            # Skip the first character and keep the next three; the
            # original notes say the text starts with a newline.
            record.append(gate.text[1:4])
            print(record)

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(record) + '\n')
        except Exception:
            # A page with an unexpected layout must not stop the whole run,
            # but Ctrl-C / SystemExit are allowed to propagate.
            traceback.print_exc()
            continue

def main(depth=46, output_file='D://11.txt'):
    """Crawl the departure listing and store every flight's details.

    The un-paged listing URL is processed first, followed by the paged
    URLs ``.p1`` through ``.p{depth - 1}``. For each listing page the
    flight numbers are collected and their detail records are appended
    to ``output_file``.

    Args:
        depth: Exclusive upper bound of the page numbers to visit.
        output_file: Path of the text file that records are appended to.
    """
    stock_list_url = 'http://flights.ctrip.com/actualtime/depart-ctu/'
    stock_info_url = 'http://flights.ctrip.com/actualtime/fno--'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

    for i in range(1, depth):
        try:
            print("第" + str(i) + "页")
            # Start from an empty list so each page is handled on its own.
            slist = []
            stock_list_url = 'http://flights.ctrip.com/actualtime/depart-ctu' + '.p' + str(i) + '/'
            getStockList(slist, stock_list_url)
            getStockInfo(slist, stock_info_url, output_file)
        except Exception:
            # Keep going with the next page, but report what went wrong
            # and let KeyboardInterrupt stop the crawl.
            traceback.print_exc()
            continue

# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger network requests.
if __name__ == "__main__":
    main()

截图：