import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from urllib.parse import quote
import json
# NOTE: in the original post these entries had no ':' between header name and value
# and no ',' between lines, so Python concatenated the adjacent string literals and
# parsed the braces as a *set*, which is what triggers
# "'set' object has no attribute 'items'" inside requests.
# They also look like *response* headers copied from DevTools; a real request would
# normally send things like User-Agent or Cookie instead, but they are kept as posted.
headers = {
    'access-control-allow-credentials': 'true',
    'access-control-allow-headers': 'content-type,x-ctx-currency,x-ctx-locale,x-ctx-ubt-pageid,x-ctx-ubt-pvid,x-ctx-ubt-sid,x-ctx-ubt-vid',
    'access-control-allow-methods': 'POST',
    'access-control-allow-origin': 'https://you.ctrip.com',
    # the three expose-headers lines are merged, since a dict cannot repeat a key
    'access-control-expose-headers': 'x-service-call, x-gate-region, slb-http-protocol-version',
    'content-length': '0',
    'content-type': 'text/html',
    'date': 'Mon, 19 May 2025 04:26:13 GMT',
    'slb-http-protocol-version': 'HTTP/2.0',
    'x-gate': 'ctrip-gate',
    'x-gate-region': 'SHAXY',
    'x-gate-root-id': '100025527-0a739a0f-485452-1305989',
    'x-originating-url': 'https://m.ctrip.com/restapi/soa2/28967/reportInjectFnInfo?_fxpcqlniredt=09031077312951608849&x-traceID=09031077312951608849-1747628780641-5951399',
}
def search_attraction(attraction_name, city_code='30'):
    # NOTE: this "reportInjectFnInfo" endpoint looks like a telemetry URL copied from
    # DevTools rather than Ctrip's actual search API; it is kept exactly as posted.
    search_url = "https://m.ctrip.com/restapi/soa2/28967/reportInjectFnInfo?_fxpcqlniredt=09031077312951608849&x-traceID=09031077312951608849-1747628780641-5951399"
    try:
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        data = response.json()
        # print(type(data['data']['content']))  # debug line from the original post
        # Guard before indexing, otherwise a missing key raises KeyError.
        if 'data' in data and 'content' in data['data']:
            for item in data['data']['content']:
                if item.get('type') == 'sight':
                    return item.get('id'), item.get('name')
        return None, None
    except Exception as e:
        print(f"Error while searching for the attraction: {e}")
        return None, None
def get_attraction_hotels(attraction_id, attraction_name, city_code='30', radius=2, page=1):
    # NOTE: this URL appears to be a Baidu Maps tracking pixel (blank.gif), not a Ctrip
    # hotel-list endpoint; it is kept as in the original post, but it will not return
    # hotel data, so the real hotel-search API would have to be substituted here.
    hotel_url = "https://api.map.baidu.com/images/blank.gif?product=jsapi&sub_product=jsapi&v=2.0&sub_product_v=2.0&t=26639547&code=5054&da_src=5054&pic=16"
    # Duplicate keys from the original payload (needRoomQuotas, needBasicPoint, ...) are
    # removed; repeating a key in a dict literal only keeps the last value anyway.
    payload = {
        "bizFilter": {
            "radius": radius,
            "landmarkId": attraction_id,
            "landmarkType": 1,
            "landmarkName": attraction_name
        },
        "cityId": city_code,
        "sortType": 9,
        "pageIndex": page,
        "pageSize": 20,
        "sourceFrom": 2,
        "arrivalDate": "2025-05-19",
        "departureDate": "2025-05-20",
        "includeSoldOut": True,
        "needRoomMessage": False,
        "travelPurpose": 0,
        "supportDigitalCheckIn": False,
        "needRoomQuotas": False,
        "priceRange": "0,0",
        "star": "0,5",
        "facility": [],
        "brand": [],
        "group": [],
        "promotion": [],
        "paymentType": [],
        "bedType": [],
        "cancelPolicy": [],
        "prepay": [],
        "specialOffer": [],
        "hotelName": "",
        "isOnlyHotel": False,
        "requestTravelMoney": False,
        "needHotelRemarks": False,
        "needMemberPrice": True,
        "currency": "CNY",
        "needSatisfaction": False,
        "supportMultipleRooms": False,
        "isFlashSale": False,
        "showFlashSaleTag": False,
        "needBasicPoint": False,
        "needCancelPolicy": False,
        "needVipPrice": False,
        "needGroupRoom": False,
        "isPackage": False,
        "isJustConfirm": False,
        "isJustOrder": False,
        "needHotelPromotion": False,
        "needPriceTips": False,
        "needCancelFee": False,
        "needRoomTags": False,
        "needPrepayment": False
    }
    try:
        response = requests.post(hotel_url, headers=headers, json=payload)
        response.raise_for_status()
        data = response.json()
        hotels = []
        if 'hotelPositionJSON' in data:
            for hotel in data['hotelPositionJSON']:
                hotel_info = {
                    'hotel_id': hotel.get('hotelId'),
                    'hotel_name': hotel.get('name'),
                    'distance': hotel.get('distance'),
                    'price': hotel.get('lowestPrice'),
                    'rating': hotel.get('hotelStar'),
                    'address': hotel.get('address'),
                    'latitude': hotel.get('lat'),
                    'longitude': hotel.get('lng'),
                    'landmark_name': attraction_name,
                    'landmark_id': attraction_id
                }
                hotels.append(hotel_info)
        return hotels, data.get('totalCount', 0)
    except Exception as e:
        print(f"Error while fetching hotel data: {e}")
        return [], 0
def main():
    query_name = input("Enter the attraction name to search for: ")
    city_code = input("Enter the city code (default 30 for Shanghai): ") or "30"
    radius = int(input("Enter the search radius in km (default 2): ") or "2")
    max_pages = int(input("Enter the maximum number of pages to crawl (default 3): ") or "3")
    # Look up the attraction; keep the typed name separately, because the lookup
    # returns (None, None) when nothing is found.
    attraction_id, attraction_name = search_attraction(query_name, city_code)
    if not attraction_id:
        print(f"Attraction not found: {query_name}")
        return
    print(f"Found attraction: {attraction_name}, ID: {attraction_id}")
    print(f"Crawling hotels within {radius} km of {attraction_name}...")
    all_hotels = []
    current_page = 1
    total_count = 0
    while current_page <= max_pages:
        print(f"Crawling page {current_page}...")
        hotels, total_count = get_attraction_hotels(
            attraction_id, attraction_name, city_code, radius, current_page
        )
        all_hotels.extend(hotels)
        if not hotels:
            print("No more hotel data")
            break
        print(f"Fetched {len(hotels)} hotels on this page, {len(all_hotels)} so far")
        delay = random.uniform(2, 5)
        print(f"Waiting {delay:.2f} seconds before the next page...")
        time.sleep(delay)
        current_page += 1
    if all_hotels:
        df = pd.DataFrame(all_hotels)
        filename = f"{attraction_name}_nearby_hotels.csv"
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Data saved to file: {filename}")
        print(f"Crawled {len(all_hotels)} hotels in total")
        print("\nFirst 5 hotel records:")
        print(df.head().to_csv(sep='\t', na_rep='nan'))
    else:
        print("No hotel data was crawled")

if __name__ == "__main__":
    main()

Why does running this produce: Error while searching for the attraction: 'set' object has no attribute 'items'?
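The error comes from the headers block: because the original entries had no ':' between name and value and no ',' between lines, Python concatenated the adjacent string literals and parsed the braces as a one-element set instead of a dict. While preparing a request, requests eventually calls .items() on whatever was passed as headers, which fails for a set. A minimal standalone reproduction sketch (https://example.com is only a placeholder; the exception is raised while the request is being prepared, before anything is sent):

import requests

broken_headers = {
    'content-type text/html'      # no ':' between name and value -> just a string
    'x-gate ctrip-gate'           # no ',' either, so the two literals are concatenated
}
print(type(broken_headers))       # <class 'set'>, not <class 'dict'>

try:
    requests.get("https://example.com", headers=broken_headers)
except AttributeError as e:
    print(e)                      # 'set' object has no attribute 'items'

fixed_headers = {
    'content-type': 'text/html',  # 'name': 'value' pairs make a real dict
    'x-gate': 'ctrip-gate',
}
print(type(fixed_headers))        # <class 'dict'> -- this is what requests expects

Fixing the headers as shown in the corrected script above removes the error, although the two hard-coded URLs would still need to point at real Ctrip endpoints for the crawl to return data.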