爬取某个旅游网站的信息:
Python 3.6 的环境,安装了 BeautifulSoup(bs4)和 requests 这两个第三方库(代码使用 lxml 作为解析器,因此还需要安装 lxml);time 属于标准库,无需另外安装。
from bs4 import BeautifulSoup
import requests
import time
# Address of the saved-places page; passed to get_favs() when that is used.
url_saves = 'http://www.tripadvisor.com/Saves#37685322'

# First page of the New York City attraction listing.
url = 'https://www.tripadvisor.cn/Attractions-g60763-Activities-New_York_City_New_York.html#ATTRACTION_SORT_WRAPPER'

# Follow-up listing pages: the "oa" path segment takes the values
# 30, 60, ..., 900, which yields 30 addresses in total.
urls = [
    'https://cn.tripadvisor.com/Attractions-g60763-Activities-oa{}-New_York_City_New_York.html#ATTRACTION_LIST'.format(offset)
    for offset in range(30, 930, 30)
]
# Request headers sent by get_favs(): a desktop Chrome User-Agent plus the
# cookies of a logged-in browser session.
#
# The Cookie value is assembled from adjacent string literals (one per cookie
# pair) which Python concatenates into a single string, so the header never
# contains a line break.
#
# NOTE(review): this is a hard-coded session cookie captured from a browser.
# It will expire, and credentials like this should not live in source
# control; consider loading it from the environment or a local config file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Cookie': (
        'TAUnique=%1%enc%3AWaN7giJIUjVkSiBGWVYYDUaI8hoVhsqa3Iq4qT4Z9%2Fo%3D; '
        'TASSK=enc%3AAJJTDwU2yMrMcAEQ%2FYNElzUOb684JE1FPZDmbJJyKZ8%2BGGhyFoRoydkLp4MywJGhefm4QbuQEU3l79Vd5lYZWvtoZCI4d35uOJQzqUAtHwYD%2F4tOO8G%2B23NZOTQD4dfjgg%3D%3D; '
        '_smt_uid=58ff435b.2715deef; '
        '_jzqckmp=1; '
        '__gads=ID=8102c0a926e4efc2:T=1493123949:S=ALNI_MYl8n1gjvqexIrLr5oTxSlYRxdMpw; '
        'ServerPool=A; '
        'TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RVL.60763_116*RS.1; '
        'TAReturnTo=%1%%2FAttractions-g60763-Activities-New_York_City_New_York.html; '
        'TASession=%1%V2ID.7D8DFA1F5874018BF86792C82094E377*SQ.8*LP.%2FLangRedirect%3Fauto%3D3%26origin%3Dzh%26pool%3DA%26returnTo%3D%252FAttractions-g60763-Activities-New_York_City_New_York%5C.html*LS.Attractions*GR.96*TCPAR.45*TBR.26*EXEX.13*ABTR.3*PHTB.20*FS.18*CPU.39*HS.popularity*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*FA.1*DF.0*IR.3*OD.zh*RT.0*FLO.60763*TRA.true*LD.60763; '
        'CM=%1%HanaPersist%2C%2C-1%7CHanaSession%2C%2C-1%7CFtrSess%2C%2C-1%7CRCPers%2C%2C-1%7CHomeAPers%2C%2C-1%7CWShadeSeen%2C%2C-1%7CRCSess%2C%2C-1%7CFtrPers%2C%2C-1%7CTheForkMCCPers%2C%2C-1%7CHomeASess%2C2%2C-1%7CLaFourchette+MC+Banners%2C%2C-1%7CPremiumMCSess%2C%2C-1%7Csh%2C%2C-1%7CLastPopunderId%2C137-1859-null%2C-1%7Cpssamex%2C%2C-1%7CTheForkMCCSess%2C%2C-1%7CCpmPopunder_1%2C1%2C1493295984%7CCpmPopunder_2%2C1%2C-1%7CViatorMCPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Csesssticker%2C%2C-1%7C%24%2C%2C-1%7CViatorMCSess%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS2%2C%2C-1%7CPremiumMCPers%2C%2C-1%7CMC_IB_UPSELL_IB_LOGOS%2C%2C-1%7CLaFourchette+Banners%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7CSaveFtrPers%2C%2C-1%7CSaveFtrSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CRBASess%2C%2C-1%7Cperssticker%2C%2C-1%7CMetaFtrSess%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; '
        'TAUD=LA-1493123860689-1*RDD-1-2017_04_25*LG-85718820-2.1.F.*LD-85718821-.....; '
        'roybatty=TNI1625!AKSp0VaUBz8ms7cd73GTjBTFgiKkchq%2BGnuRvr7Ca98I6D56eZ94IuP3ZF681ZOvenkpBVZ8c4KB8GY%2BaCEYdnikKZBchq62Hghs04vTUmVQUf2OBoLPaGq32ykr9FHGoBbj2KfexdODCLPgTiFKNoOUCvZF7Zuzc6aeEJFGki6l%2C1; '
        '_ga=GA1.2.646625780.1493123888; '
        'Hm_lvt_2947ca2c006be346c7a024ce1ad9c24a=1493123935,1493205990; '
        'Hm_lpvt_2947ca2c006be346c7a024ce1ad9c24a=1493209587; '
        '_jzqa=1.150809094252425180.1493123938.1493205993.1493209114.3; '
        '_jzqc=1; '
        '_qzja=1.2016907279.1493123950764.1493205992769.1493209113909.1493209113909.1493209588048..0.0.7.3; '
        '_qzjb=1.1493209113908.2.0.0.0; '
        '_qzjc=1; '
        '_qzjto=5.2.0; '
        '_jzqb=1.2.10.1493209114.1; '
        'ki_t=1493123951705%3B1493205994856%3B1493209598582%3B2%3B6; '
        'ki_r=; '
        'EVT=gac.STANDARD_PAGINATION*gaa.page*gal.2*gav.0*gani.false*gass.Attractions*gasl.60763*gads.Attractions*gadl.60763*gapu.WQCR638AAAEAAGBedtIAAAAU*gams.0'
    ),
}
def get_attractions(url, data=None):
    """Download one attraction-listing page and print every attraction on it.

    Args:
        url: Address of the listing page to fetch.
        data: Leave as ``None`` to parse and print the page. When any other
            value is passed, the page is still fetched but nothing is
            extracted or printed.

    Returns:
        A list with one dict per attraction, each holding ``title`` (link
        text), ``img`` (the image's ``src`` attribute) and ``cate`` (list of
        stripped text fragments). The list is empty when nothing matched or
        when ``data`` was supplied.
    """
    wb_data = requests.get(url)
    print(wb_data)
    # Fixed 4-second pause after each download; presumably to avoid
    # hammering the site when called in a loop.
    time.sleep(4)

    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('div.property_title > a[target="_blank"]')
    imgs = soup.select('img[width="160"]')
    cates = soup.select('div.p13n_reasoning_v2')

    attractions = []
    if data is None:
        # The three selections are paired purely by position; zip() stops at
        # the shortest one, so unmatched trailing elements are dropped.
        for title, img, cate in zip(titles, imgs, cates):
            record = {
                'title': title.get_text(),
                'img': img.get('src'),
                'cate': list(cate.stripped_strings),
            }
            print(record)
            attractions.append(record)
    return attractions
def get_favs(url, data=None):
    """Download a saved-places page and print every saved location on it.

    The request is sent with the module-level ``headers`` mapping
    (User-Agent and Cookie).

    Args:
        url: Address of the page to fetch.
        data: Leave as ``None`` to parse and print the page. When any other
            value is passed, the page is still fetched but nothing is
            extracted or printed.

    Returns:
        A list with one dict per saved location, each holding ``title``
        (link text), ``img`` (the image's ``src`` attribute) and ``meta``
        (list of stripped text fragments of the address element). The list
        is empty when nothing matched or when ``data`` was supplied.
    """
    wb_data = requests.get(url, headers=headers)

    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('a.location-name')
    imgs = soup.select('div.photo > div.sizedThumb > img.photo_image')
    metas = soup.select('span.format_address')

    favs = []
    if data is None:
        # The three selections are paired purely by position; zip() stops at
        # the shortest one, so unmatched trailing elements are dropped.
        for title, img, meta in zip(titles, imgs, metas):
            record = {
                'title': title.get_text(),
                'img': img.get('src'),
                'meta': list(meta.stripped_strings),
            }
            print(record)
            favs.append(record)
    return favs
# Crawl every paginated listing page in turn. This runs as soon as the
# module is executed or imported.
for single_url in urls:
    get_attractions(single_url)
# Alternative approach using the mobile web site, kept for reference only.
# The snippet is wrapped in a bare triple-quoted string, so it is never
# executed. It would request `url` with a mobile-device User-Agent (left
# blank here), select the `div.thumb.thumbLLR.soThumb > img` elements and
# print each image's `src` attribute.
'''
headers = {
'User-Agent':'', #mobile device user agent from chrome
}
mb_data = requests.get(url,headers=headers)
soup = BeautifulSoup(mb_data.text,'lxml')
imgs = soup.select('div.thumb.thumbLLR.soThumb > img')
for i in imgs:
print(i.get('src'))
'''