Straight to the code. Note that the cookies below have long since expired, so swap in your own before running.
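If you are unsure how to rebuild the cookies dict, one option is to copy the whole Cookie header from your browser's DevTools (Network tab) and split it yourself. This is only a sketch; cookie_header_from_devtools is a placeholder for whatever string you copied, not part of the script below:
def parse_cookie_header(header_value):
    """Turn a DevTools 'k1=v1; k2=v2' Cookie header into a dict usable by requests."""
    cookies = {}
    for pair in header_value.split(';'):
        if '=' in pair:
            key, _, value = pair.strip().partition('=')
            cookies[key] = value
    return cookies
# cookies = parse_cookie_header(cookie_header_from_devtools)  # placeholder variable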
import os
import time
import requests
from lxml import etree
import pandas as pd
import random
import logging
from pypinyin import lazy_pinyin
def save_progress(zy_i, page, index):
    """Persist the current cuisine index, page number and item index to the progress file."""
    with open('大众点评progress.txt', 'w') as file:
        file.write(f'{zy_i},{page},{index}')
def convert_to_pinyin(word):
    """Convert a Chinese city name into its pinyin spelling for the URL."""
    return ''.join(lazy_pinyin(word))
def load_progress():
    """Read the saved cuisine index, page number and item index from the progress file."""
    if os.path.exists('大众点评progress.txt'):
        with open('大众点评progress.txt', 'r') as file:
            progress = file.read().strip().split(',')
            if len(progress) == 3:
                return int(progress[0]), int(progress[1]), int(progress[2])
    # No usable progress file: start from the first cuisine, first page, first item
    return 1, 1, 1
cookies = {
's_ViewType': '10',
'_lxsdk_cuid': '18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc',
'_hc.v': 'b00d90cb-04e8-043f-b4f4-8e40941a1a79.1695003625',
'WEBDFPID': 'w0y632653u7z5115y74y380yzww10u7181zz383x2xy979586uxzwyw8-2010363625352-1695003625352IEEEUIEfd79fef3d01d5e9aadc18ccd4d0c95071850',
'ctu': 'a8e4f13e85debd2d19d82f50a8e45f64af15a2c240e087345be28a799ba96c74',
'uuid': '0F11FB071A44C7EEED5C38E0D500558068FB62041B48FC0DEF43D9B0B893A431',
'iuuid': '0F11FB071A44C7EEED5C38E0D500558068FB62041B48FC0DEF43D9B0B893A431',
'_lxsdk': '0F11FB071A44C7EEED5C38E0D500558068FB62041B48FC0DEF43D9B0B893A431',
'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1699952737,1700536885,1701241211,1701747273',
'qruuid': '70eeac5c-9b24-442b-8d37-4fb80bbfdd74',
'dper': '0202db235b086ca7c3db7e44b17f5cd4c1b4d706671c061b91dc4e0d9bca84e7dc3c4ecd499e8eaf4564b55b104bb7a1097d754cb5bede268a100000000051200000afb1f3773265f51bef46284decffc240f6cf817ddf5cea294ec681407e34a307cf8ccfaf9193fe388d785d626abbb64c',
'_lxsdk_s': '18fb58ae809-3cd-e34-4ab%7C%7C7',
'll': '7fd06e815b796be3df069dec7836c3df',
'fspop': 'test',
'cy': '224',
'cye': 'nanning',
}
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'https://www.dianping.com/guangzhou/ch10/g112o3',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
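# Untested alternative: a requests.Session can hold the cookies and headers once,
# so every request reuses them and keeps any new cookies the server sends back:
#   session = requests.Session()
#   session.cookies.update(cookies)
#   session.headers.update(headers)
#   response = session.get(url, timeout=20)
# The functions below stick to the original per-call style.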
def crawl(url):
    """Fetch the category index page and iterate over every cuisine listed on it."""
    response = requests.get(url=url, cookies=cookies, headers=headers)
    time.sleep(random.uniform(5, 10))  # random pause to look less like a bot
    if response.status_code == 403:
        logging.info('Blocked by the site (HTTP 403)')
        return None
    txt = etree.HTML(response.text)
    start_cx, start_page, start_index = load_progress()  # resume from the last saved position
    for i in range(start_cx, 37):  # the category bar lists 36 cuisines
        name = txt.xpath(f'//*[@id="classfy"]/a[{i}]/span/text()')[0]
        print(f'Now crawling cuisine: {name}')
        fl = txt.xpath(f'//*[@id="classfy"]/a[{i}]/@href')[0]
        List_index(fl, i, start_page, start_index)
        start_page, start_index = 1, 1  # later cuisines always start from page 1, item 1
def List_index(fl, start_cx, start_page, start_index):
    """Walk the result pages of one cuisine. Takes the cuisine link, its index, and the page/item to resume from, so progress can be persisted."""
    www = []
    del_flag = False
    for page in range(start_page, 51):  # Dianping shows at most 50 result pages
        page_url = fl + f'p{page}'  # build the page URL fresh each time instead of appending to fl repeatedly
        response1 = requests.get(url=page_url, cookies=cookies, headers=headers, timeout=20)
        if response1.status_code == 403:
            logging.info('Blocked by the site (HTTP 403)')
            return None
        time.sleep(random.uniform(3, 6))
        txt1 = etree.HTML(response1.text)
        for a in range(start_index, 16):  # each page lists at most 15 shops
            w = Data_analysis(txt1, a, page)
            if all(item == 'NULL' or item == '' for item in w):
                # an all-empty row means the page has run out of shops: stop this cuisine
                del_flag = True
                break
            www.append(w)
            down_load(www)
            www.clear()
            save_progress(start_cx, page, a + 1)  # persist cuisine index, page and item number
        if del_flag:
            break
        start_index = 1  # pages after the resumed one start from the first item
def Data_analysis(txt1, a, page):
    """Pull the fields of the a-th shop card out of a parsed result page."""
    try:
        Restaurant_name = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[1]/a/h4/text()')[0]  # shop name
    except Exception:
        Restaurant_name = "NULL"
    try:
        pj_count = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[2]/a[1]/b/text()')[0]  # review count
        pj_count = pj_count + " 条评论"
    except Exception:
        pj_count = "NULL"
    try:
        per_price = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[2]/a[2]/b/text()')[0]  # average spend per person
        per_price = "人均 " + per_price
    except Exception:
        per_price = "NULL"
    try:
        main_shell = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[3]/a[1]/span/text()')[0]  # main dish type
    except Exception:
        main_shell = "NULL"
    try:
        Business_district = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[3]/a[2]/span/text()')[0]  # business district
    except Exception:
        Business_district = "NULL"
    try:
        Recommended_dish = txt1.xpath(f'string(//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[4])')  # recommended dishes
        qwer = convert_list_to_string(remove_newlines(Recommended_dish))
    except Exception:
        qwer = "NULL"
    try:
        Tg = txt1.xpath(f'string(//*[@id="shop-all-list"]/ul/li[{a}]/div[3]/div)')  # group-buy deals
        Tg2 = convert_list_to_string(remove_newlines(Tg))
    except Exception:
        Tg2 = 'NULL'
    try:
        Detailed_link = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[1]/a/@href')[0]  # detail page link
    except Exception:
        Detailed_link = "NULL"
    w = [Restaurant_name, pj_count, per_price, main_shell, Business_district, qwer, Tg2, Detailed_link]
    print(f'page {page}, item {a}:', w[0])
    return w
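# The try/except blocks above all follow the same "first xpath hit or NULL"
# pattern; for the fields that take the first hit, a small helper (sketch only,
# not wired into the function above) could shrink them considerably:
#   def first_or_null(tree, expr):
#       hits = tree.xpath(expr)
#       return hits[0] if hits else "NULL"
#   Restaurant_name = first_or_null(txt1, f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[1]/a/h4/text()')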
def remove_newlines(text):
    """Split the scraped text on whitespace (spaces and newlines) and strip each piece."""
    return [dish.strip() for dish in text.split() if dish != ""]
def convert_list_to_string(list_data):
    """Join the non-empty pieces with the Chinese enumeration comma."""
    return "、".join([dish for dish in list_data if dish != ""])
def down_load(wkb):
    """Append the scraped rows to the CSV, writing the header only when the file does not exist yet."""
    columns = ['店铺名称', '评论数量', '人均消费', '售卖主食', '商圈信息', '推荐菜系', '团购', '详情链接']
    df = pd.DataFrame(wkb, columns=columns)
    file_name = '大众点评.csv'
    write_header = not os.path.isfile(file_name)
    df.to_csv(file_name, index=False, mode='a', header=write_header)
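# Note: pandas writes UTF-8 by default; if the Chinese columns look garbled when
# the CSV is opened in Excel on Windows, passing encoding='utf-8-sig' usually helps:
#   df.to_csv(file_name, index=False, mode='a', header=write_header, encoding='utf-8-sig')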
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # make the 403 log messages visible
    city = input('Enter the city name (in Chinese): ')
    city = convert_to_pinyin(city)
    url = f'https://www.dianping.com/{city}/ch10/g112o3'  # the listing sorted by best reviews
    crawl(url)
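To sanity-check the output after a run, the CSV can be read back with pandas (this assumes the default file name used above):
import pandas as pd
df = pd.read_csv('大众点评.csv')
print(df.shape)
print(df.head())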