Straight to the code. Note that the cookies below have long since expired, so swap in your own before running.
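If you are unsure how to rebuild the cookies dict, one option is to copy the whole Cookie header from your browser's DevTools (Network tab) and split it yourself. This is only a sketch; cookie_header_from_devtools is a placeholder for whatever string you copied, not part of the script below:
def parse_cookie_header(header_value):
    """Turn a DevTools 'k1=v1; k2=v2' Cookie header into a dict usable by requests."""
    cookies = {}
    for pair in header_value.split(';'):
        if '=' in pair:
            key, _, value = pair.strip().partition('=')
            cookies[key] = value
    return cookies
# cookies = parse_cookie_header(cookie_header_from_devtools)  # placeholder variable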
import os
import time
import requests
from lxml import etree
import pandas as pd
import random
import logging
from pypinyin import lazy_pinyin
def save_progress(zy_i, page, index):
    """Persist the current cuisine index, page number and item index to the progress file."""
    with open('大众点评progress.txt', 'w') as file:
        file.write(f'{zy_i},{page},{index}')
def convert_to_pinyin(word):
    """Convert a Chinese city name into its pinyin spelling for the URL."""
    return ''.join(lazy_pinyin(word))
def load_progress():
    """Read the saved cuisine index, page number and item index from the progress file."""
    if os.path.exists('大众点评progress.txt'):
        with open('大众点评progress.txt', 'r') as file:
            progress = file.read().strip().split(',')
            if len(progress) == 3:
                return int(progress[0]), int(progress[1]), int(progress[2])
    # No usable progress file: start from the first cuisine, first page, first item
    return 1, 1, 1
cookies = {
's_ViewType': '10',
'_lxsdk_cuid': '18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc',
'_hc.v': 'b00d90cb-04e8-043f-b4f4-8e40941a1a79.1695003625',
'WEBDFPID': 'w0y632653u7z5115y74y380yzww10u7181zz383x2xy979586uxzwyw8-2010363625352-1695003625352IEEEUIEfd79fef3d01d5e9aadc18ccd4d0c95071850',
'ctu': 'a8e4f13e85debd2d19d82f50a8e45f64af15a2c240e087345be28a799ba96c74',
'uuid': '0F11FB071A44C7EEED5C38E0D500558068FB62041B48FC0DEF43D9B0B893A431',
'iuuid': '0F11FB071A44C7EEED5C38E0D500558068FB62041B48FC0DEF43D9B0B893A431',
'_lxsdk': '0F11FB071A44C7EEED5C38E0D500558068FB62041B48FC0DEF43D9B0B893A431',
'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1699952737,1700536885,1701241211,1701747273',
'qruuid': '70eeac5c-9b24-442b-8d37-4fb80bbfdd74',
'dper': '0202db235b086ca7c3db7e44b17f5cd4c1b4d706671c061b91dc4e0d9bca84e7dc3c4ecd499e8eaf4564b55b104bb7a1097d754cb5bede268a100000000051200000afb1f3773265f51bef46284decffc240f6cf817ddf5cea294ec681407e34a307cf8ccfaf9193fe388d785d626abbb64c',
'_lxsdk_s': '18fb58ae809-3cd-e34-4ab%7C%7C7',
'll': '7fd06e815b796be3df069dec7836c3df',
'fspop': 'test',
'cy': '224',
'cye': 'nanning',
}
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'https://www.dianping.com/guangzhou/ch10/g112o3',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
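# Untested alternative: a requests.Session can hold the cookies and headers once,
# so every request reuses them and keeps any new cookies the server sends back:
#   session = requests.Session()
#   session.cookies.update(cookies)
#   session.headers.update(headers)
#   response = session.get(url, timeout=20)
# The functions below stick to the original per-call style.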
def crawl(url):
    """Fetch the category index page and iterate over every cuisine listed on it."""
    response = requests.get(url=url, cookies=cookies, headers=headers)
    time.sleep(random.uniform(5, 10))  # random pause to look less like a bot
    if response.status_code == 403:
        logging.info('Blocked by the site (HTTP 403)')
        return None
    txt = etree.HTML(response.text)
    start_cx, start_page, start_index = load_progress()  # resume from the last saved position
    for i in range(start_cx, 37):  # the category bar lists 36 cuisines
        name = txt.xpath(f'//*[@id="classfy"]/a[{i}]/span/text()')[0]
        print(f'Now crawling cuisine: {name}')
        fl = txt.xpath(f'//*[@id="classfy"]/a[{i}]/@href')[0]
        List_index(fl, i, start_page, start_index)
        start_page, start_index = 1, 1  # later cuisines always start from page 1, item 1
def List_index(fl, start_cx, start_page, start_index):
    """Walk the result pages of one cuisine. Takes the cuisine link, its index, and the page/item to resume from, so progress can be persisted."""
    www = []
    del_flag = False
    for page in range(start_page, 51):  # Dianping shows at most 50 result pages
        page_url = fl + f'p{page}'  # build the page URL fresh each time instead of appending to fl repeatedly
        response1 = requests.get(url=page_url, cookies=cookies, headers=headers, timeout=20)
        if response1.status_code == 403:
            logging.info('Blocked by the site (HTTP 403)')
            return None
        time.sleep(random.uniform(3, 6))
        txt1 = etree.HTML(response1.text)
        for a in range(start_index, 16):  # each page lists at most 15 shops
            w = Data_analysis(txt1, a, page)
            if all(item == 'NULL' or item == '' for item in w):
                # an all-empty row means the page has run out of shops: stop this cuisine
                del_flag = True
                break
            www.append(w)
            down_load(www)
            www.clear()
            save_progress(start_cx, page, a + 1)  # persist cuisine index, page and item number
        if del_flag:
            break
        start_index = 1  # pages after the resumed one start from the first item
def Data_analysis(txt1, a, page):
    """Pull the fields of the a-th shop card out of a parsed result page."""
    try:
        Restaurant_name = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[1]/a/h4/text()')[0]  # shop name
    except Exception:
        Restaurant_name = "NULL"
    try:
        pj_count = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[2]/a[1]/b/text()')[0]  # review count
        pj_count = pj_count + " 条评论"
    except Exception:
        pj_count = "NULL"
    try:
        per_price = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[2]/a[2]/b/text()')[0]  # average spend per person
        per_price = "人均 " + per_price
    except Exception:
        per_price = "NULL"
    try:
        main_shell = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[3]/a[1]/span/text()')[0]  # main dish type
    except Exception:
        main_shell = "NULL"
    try:
        Business_district = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[3]/a[2]/span/text()')[0]  # business district
    except Exception:
        Business_district = "NULL"
    try:
        Recommended_dish = txt1.xpath(f'string(//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[4])')  # recommended dishes
        qwer = convert_list_to_string(remove_newlines(Recommended_dish))
    except Exception:
        qwer = "NULL"
    try:
        Tg = txt1.xpath(f'string(//*[@id="shop-all-list"]/ul/li[{a}]/div[3]/div)')  # group-buy deals
        Tg2 = convert_list_to_string(remove_newlines(Tg))
    except Exception:
        Tg2 = 'NULL'
    try:
        Detailed_link = txt1.xpath(f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[1]/a/@href')[0]  # detail page link
    except Exception:
        Detailed_link = "NULL"
    w = [Restaurant_name, pj_count, per_price, main_shell, Business_district, qwer, Tg2, Detailed_link]
    print(f'page {page}, item {a}:', w[0])
    return w
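# The try/except blocks above all follow the same "first xpath hit or NULL"
# pattern; for the fields that take the first hit, a small helper (sketch only,
# not wired into the function above) could shrink them considerably:
#   def first_or_null(tree, expr):
#       hits = tree.xpath(expr)
#       return hits[0] if hits else "NULL"
#   Restaurant_name = first_or_null(txt1, f'//*[@id="shop-all-list"]/ul/li[{a}]/div[2]/div[1]/a/h4/text()')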
def remove_newlines(text):
    """Split the scraped text on whitespace (spaces and newlines) and strip each piece."""
    return [dish.strip() for dish in text.split() if dish != ""]
def convert_list_to_string(list_data):
    """Join the non-empty pieces with the Chinese enumeration comma."""
    return "、".join([dish for dish in list_data if dish != ""])
def down_load(wkb):
    """Append the scraped rows to the CSV, writing the header only when the file does not exist yet."""
    columns = ['店铺名称', '评论数量', '人均消费', '售卖主食', '商圈信息', '推荐菜系', '团购', '详情链接']
    df = pd.DataFrame(wkb, columns=columns)
    file_name = '大众点评.csv'
    write_header = not os.path.isfile(file_name)
    df.to_csv(file_name, index=False, mode='a', header=write_header)
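# Note: pandas writes UTF-8 by default; if the Chinese columns look garbled when
# the CSV is opened in Excel on Windows, passing encoding='utf-8-sig' usually helps:
#   df.to_csv(file_name, index=False, mode='a', header=write_header, encoding='utf-8-sig')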
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # make the 403 log messages visible
    city = input('Enter the city name (in Chinese): ')
    city = convert_to_pinyin(city)
    url = f'https://www.dianping.com/{city}/ch10/g112o3'  # the listing sorted by best reviews
    crawl(url)
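To sanity-check the output after a run, the CSV can be read back with pandas (this assumes the default file name used above):
import pandas as pd
df = pd.read_csv('大众点评.csv')
print(df.shape)
print(df.head())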