import csv  # read/write CSV files
import os  # filesystem paths
import random  # randomized request delays
import time  # delays between requests
from datetime import datetime  # timestamps

import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
from requests.adapters import HTTPAdapter  # mount per-protocol transport adapters
from urllib3.util.retry import Retry  # retry strategy (the requests.packages.urllib3 path is deprecated)
# ========================== Global configuration ==========================
# Request headers that mimic a desktop browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0',
    'Referer': 'https://member.anjuke.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
}
# Cookies required by the site. ajkAuthTicket is a login ticket and typically
# expires; refresh it from a logged-in browser session if requests start failing.
COOKIES = {
    'ajkAuthTicket': 'TT=3f67c23d85c369b7018fcb4e1418466f&TS=1738219179437&PBODY=IotzzfNhkTJKGH_LuUrSfcNHUGin1wBsHjAQYBL3k0USZDHrUxL6RQUv1ZsFPDHjxvQl0uvU2zSgIEdSFCHUc7wYEf4slKV2U2F9rwNnp6xHgufTxMgdYWZEob_Tep-poDqBMbQQgayOQhsaRgVjw8K8ut3QqqMfPgYGpKJJBHw&VER=2&CUID=fzgJGetduRhII81NXadF-HKyO1Hvr8W-',
    'ctid': '24',
}
# Retry strategy shared by every request made through the session.
RETRY_STRATEGY = Retry(
    total=3,  # at most 3 retries per request
    backoff_factor=1,  # exponential backoff between attempts
    status_forcelist=[500, 502, 503, 504],  # retry on these server errors
    allowed_methods=frozenset(['GET', 'POST'])  # only retry the methods this script uses
)
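# Timing note (an addition, not from the original comments): with backoff_factor=1,
# urllib3 sleeps roughly backoff_factor * 2**(retries_so_far - 1) seconds between
# attempts, i.e. on the order of 1s, 2s, 4s; the exact schedule varies slightly
# across urllib3 versions. Status-based retries fire only for the codes listed in
# status_forcelist; connection errors are also retried under the same total budget.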
BASE_URL = 'https://nanjing.anjuke.com/community/p{page}/'  # list-page URL template; {page} is replaced with the page number
MAX_RETRIES = 3  # maximum attempts per community detail page

def request_delay():
    """Return a fresh random delay of 0.5-1.5 seconds, sampled on every call."""
    return random.uniform(0.5, 1.5)

# CSV column headers, kept in Chinese because they label fields scraped from a
# Chinese-language site. Their order must match the row layout built in main():
# 4 list fields, 14 detail fields, 5 extra fields, sale/rent counts, timestamp.
CSV_HEADERS = [
    '小区名称', '价格', '地址', '小区链接',
    '物业类型', '权属类别', '竣工时间', '产权年限', '总户数', '总建筑面积', '容积率',
    '绿化率', '建筑类型', '所属商圈', '统一供暖', '供水供电', '停车位', '物业费',
    '停车费', '车位管理费', '物业公司', '小区地址', '开发商', '在售房源', '在租房源',
    '抓取时间'
]
# ========================== Utility functions ==========================
def create_session():
    """Create a requests session with the retry strategy mounted."""
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=RETRY_STRATEGY)  # adapter that applies RETRY_STRATEGY
    session.mount('https://', adapter)  # use the adapter for HTTPS URLs
    session.mount('http://', adapter)   # ...and for HTTP URLs
    return session

def safe_get_text(element, selector, default='N/A'):
    """Safely extract an element's text: return `default` if the selector matches nothing."""
    target = element.select_one(selector)  # first element matching the CSS selector
    return target.get_text(strip=True) if target else default

def get_output_path():
    """Build a timestamped output file path, creating the directory if needed."""
    output_dir = os.path.join("D:\\", '安居客数据')  # Windows-specific output directory ("Anjuke data")
    os.makedirs(output_dir, exist_ok=True)  # create the directory if it does not exist
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')  # e.g. 20250131_153000
    return os.path.join(output_dir, f'communities_by_page_{timestamp}.csv')
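# Portability note (an assumption, not from the original script): the hard-coded
# D:\ drive exists only on typical Windows setups. A cross-platform fallback could
# build the directory from the user's home folder instead, e.g.:
#   output_dir = os.path.join(os.path.expanduser('~'), '安居客数据')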

def process_community_page(session, page_url):
    """Fetch and parse one community detail page; return None on any failure."""
    try:
        response = session.get(page_url, headers=HEADERS, cookies=COOKIES, timeout=15)
        response.raise_for_status()  # raise on HTTP error statuses
        if "验证" in response.text:  # "验证" (verification) marks the anti-bot challenge page
            print("⚠️ Anti-bot verification detected")
            return None
        detail_soup = BeautifulSoup(response.text, 'html.parser')
        # The 14 standard detail fields live in div.value.value_0 ... div.value.value_13.
        details = [safe_get_text(detail_soup, f'div.value.value_{index}') for index in range(14)]
        # Extra fields, keyed by the Chinese labels used on the page (parking fee,
        # parking-space management fee, property manager, address, developer).
        extra_info = {
            '停车费': 'N/A',
            '车位管理费': 'N/A',
            '物业公司': 'N/A',
            '小区地址': 'N/A',
            '开发商': 'N/A'
        }
        for column in detail_soup.find_all('div', class_='column-1'):
            label = safe_get_text(column, 'div.label')
            value = safe_get_text(column, 'div.value')
            for key in extra_info:
                if key in label:  # the label text contains the field name
                    extra_info[key] = value
        # Counts of listings currently for sale and for rent.
        sale = detail_soup.find('div', class_='sale')
        rent = detail_soup.find('div', class_='rent')
        sale_info = f"{safe_get_text(sale, 'i.source-number')} {safe_get_text(sale, 'i.source-unit')}" if sale else 'N/A'
        rent_info = f"{safe_get_text(rent, 'i.source-number')} {safe_get_text(rent, 'i.source-unit')}" if rent else 'N/A'
        return details, extra_info, sale_info, rent_info
    except Exception as e:
        print(f"⚠️ Failed to process community detail page: {e}")
        return None

# ========================== Main program ==========================
def main():
    try:
        # Prompt for the page range to scrape.
        while True:
            try:
                start_page = int(input("Enter the start page (from 1): "))
                end_page = int(input("Enter the end page (>= start page): "))
                if 1 <= start_page <= end_page:
                    break
                print("Please enter a valid page range!")
            except ValueError:
                print("Please enter valid numbers!")
        # Set up the session and the output file.
        session = create_session()
        output_file = get_output_path()
        print(f"\nData will be saved to: {output_file}")
        # utf-8-sig writes a BOM so Excel recognizes the encoding.
        with open(output_file, mode='w', newline='', encoding='utf-8-sig') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(CSV_HEADERS)
            total_collected = 0
            # Scrape page by page.
            for current_page in range(start_page, end_page + 1):
                print(f"\n➤ Processing page {current_page}...")
                page_url = BASE_URL.format(page=current_page)
                try:
                    response = session.get(page_url, headers=HEADERS, cookies=COOKIES, timeout=10)
                    response.raise_for_status()
                    if "验证" in response.text:  # anti-bot challenge page
                        print("⚠️ Anti-bot verification detected, skipping this page")
                        continue
                    list_soup = BeautifulSoup(response.text, 'html.parser')
                    communities = list_soup.find_all('a', class_='li-row')  # one <a> per community
                    page_collected = 0
                    for community in communities:
                        name = safe_get_text(community, 'div.li-title')
                        price = safe_get_text(community, 'div.community-price')
                        address = safe_get_text(community, 'div.props')
                        link = community.get('href', '')
                        print(f" ▌ Processing community: {name}")
                        # Fetch the detail page, retrying with a growing delay.
                        detail_data = None
                        for attempt in range(MAX_RETRIES):
                            result = process_community_page(session, link)
                            if result:
                                details, extra_info, sale_info, rent_info = result
                                detail_data = [
                                    name, price, address, link,
                                    *details,
                                    *extra_info.values(),  # insertion order matches CSV_HEADERS
                                    sale_info, rent_info,
                                    datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                ]
                                break
                            time.sleep(request_delay() * (attempt + 1))  # back off before retrying
                        if detail_data:
                            writer.writerow(detail_data)
                            page_collected += 1
                            total_collected += 1
                            print(f" ✅ Saved (this page: {page_collected}, total: {total_collected})")
                        else:
                            print(f" ⚠️ Skipping community: {name}")
                        time.sleep(request_delay())  # polite delay between communities
                    print(f" ✔ Page {current_page} done, collected {page_collected} records")
                    time.sleep(request_delay() * 2)  # slightly longer delay between pages
                except Exception as e:
                    print(f"⚠️ Failed to process page {current_page}: {e}")
                    continue
        print(f"\n🎉 Scraping complete! Collected {total_collected} records, saved to: {output_file}")
    except KeyboardInterrupt:
        print("\n⚠️ Program interrupted by user")
    except Exception as e:
        print(f"\n❌ Unexpected error: {e}")
    finally:
        if 'session' in locals():  # close the session only if it was created
            session.close()

if __name__ == '__main__':
    main()

Question: which part of the code controls how many pages are read?
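Answer: the page range is controlled in two places inside main(). The input() prompts
read start_page and end_page, and the page loop consumes them, requesting one list
page per iteration (excerpt from the script above):

    start_page = int(input("Enter the start page (from 1): "))
    end_page = int(input("Enter the end page (>= start page): "))
    ...
    for current_page in range(start_page, end_page + 1):
        page_url = BASE_URL.format(page=current_page)

So the script reads end_page - start_page + 1 pages; nothing else in the script
limits the page count.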