大众点评店铺信息及好评、中评、差评爬虫程序(2024年5月最新版):店铺详情信息采集

import time
from lxml import etree
import requests
import cx_Oracle as cx
import random
# Cookies passed to requests.get() in index_load().
# NOTE: per the author's note accompanying this script, these cookie values have
# expired; replace them with cookies from your own session before running.
# NOTE(review): hard-coding session cookies in source is risky (some of these,
# e.g. 'dper', look like login tokens - confirm); consider loading them from the
# environment or a local, untracked file instead.
cookies = {
    '_lxsdk_cuid': '18aa19bf875c8-086838f131a88c-7c54647e-144000-18aa19bf875c8',
    '_lxsdk': '18aa19bf875c8-086838f131a88c-7c54647e-144000-18aa19bf875c8',
    '_hc.v': 'fc37de4c-a51c-f6cb-1ced-12b911e2282f.1695957264',
    'WEBDFPID': 'u2vu0vw01uyw50u3y32868xz7513v0v681zwvvv7223979584w0yy40u-2011317264892-1695957264265KOIWSOEfd79fef3d01d5e9aadc18ccd4d0c95073177',
    's_ViewType': '10',
    'ctu': 'af4593d00687df8095e60c8d3aabcd27340bc693d0991a6f92a24d2e8c51db89',
    'fspop': 'test',
    'cy': '4',
    'cye': 'guangzhou',
    'qruuid': '3ee9aeff-bafb-4c47-9855-bca8e6c59b96',
    'dper': '130076071edbb043ed7c98016b5c6bd1982c10ab7b52c874c8891e6990ee4694fb1bc8c0b3409d8a2e02a632b55270b7fa037bd29e3dd7dbba7cae2d010ee1b2',
    'll': '7fd06e815b796be3df069dec7836c3df',
    '_lx_utm': 'utm_source%3Dbing%26utm_medium%3Dorganic',
    'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1697090344,1697092664,1697432446,1697442501',
    'Hm_lpvt_602b80cf8079ae6591966cc70a3940e7': '1697442512',
    '_lxsdk_s': '18b37750f73-ce1-68c-a3e%7C%7C58',
}

# HTTP request headers passed to requests.get() in index_load().
# The User-Agent and sec-ch-ua values identify the client as Microsoft Edge 115
# on Windows, and the Sec-Fetch-* values describe a same-origin page navigation.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # No 'Cookie' header here: cookies are supplied separately through the
    # module-level `cookies` dict.
    'Pragma': 'no-cache',
    'Referer': 'https://www.dianping.com/guangzhou/ch10',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}



def _first_match(tree, path, default='无'):
    """Return the first result of XPath *path* on *tree*, or *default* if there is none."""
    matches = tree.xpath(path)
    return matches[0] if matches else default


def _string_value(tree, path, default='无'):
    """Evaluate a ``string(...)`` XPath on *tree*; return *default* when the result is blank."""
    value = tree.xpath(path)
    # string() yields '' (it does not raise) when the node is absent, so the
    # fallback has to be decided by inspecting the value itself.
    return value if value.strip() else default


def index_load(url, timeout=15):
    """Fetch a shop detail page and extract its basic information.

    Args:
        url: Address of the shop detail page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        On HTTP 200, a 5-tuple ``(xq_score, phone, dz, bm, yysj)``: detail
        score, phone, address, alias and opening hours. Any field that cannot
        be found on the page is the placeholder string ``'无'``.
        On any other status code the response body is printed and ``0`` is
        returned, so callers must check the result before unpacking it.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url=url, headers=headers, cookies=cookies,
                            timeout=timeout)
    # Pause a random 5-8 seconds after every request, successful or not.
    time.sleep(random.uniform(5, 8))

    if response.status_code != 200:
        print(response.text)
        return 0

    tree = etree.HTML(response.text)
    if tree is None:
        # NOTE(review): etree.HTML appears to return None for an empty document
        # in some lxml versions - treat that as "nothing found".
        return '无', '无', '无', '无', '无'

    xq_score = _string_value(tree, 'string(//*[@id="comment_score"])')  # detail score
    phone = _string_value(tree, 'string(//*[@id="basic-info"]/p)')  # phone
    dz = _first_match(tree, '//*[@id="address"]/text()')  # address
    bm = _first_match(
        tree, '//*[@id="basic-info"]/div[4]/p[1]/span[2]/text()')  # alias
    yysj = _first_match(
        tree, '//*[@id="basic-info"]/div[4]/p[2]/span[2]/text()')  # opening hours
    return xq_score, phone, dz, bm, yysj


def main():
    """Fill in detail fields for every pending row of table ``dzdp147``.

    Prompts for an ``is_check`` value, selects the rows with ``status = 0`` and
    that ``is_check``, downloads each row's page (column ``a7``) through
    ``index_load`` and writes the extracted fields back, setting ``status`` to 1.
    Each row is committed individually so progress survives an interruption.

    Rows whose page could not be fetched are skipped and keep ``status = 0``,
    so they are picked up again on the next run. The cursor and connection are
    always closed, even if an error interrupts the loop.
    """
    # Placeholder credentials; cx_Oracle.connect takes user, password and DSN
    # in this positional order. Replace with real values before running.
    con = cx.connect('a', 'b', 'c')
    try:
        cursor = con.cursor()
        try:
            num = int(input("请输入is_check值(1):"))
            # Bind variable instead of string interpolation, matching the
            # UPDATE statement below.
            cursor.execute(
                "select a1, a7 from dzdp147 where status = 0 "
                "and is_check = :is_check order by a7 desc",
                is_check=num)
            rows = cursor.fetchall()
            for value, url in rows:
                result = index_load(url)
                if not result:
                    # index_load returns 0 on a non-200 response; unpacking
                    # that would raise TypeError and abort the whole run.
                    print(f"跳过 {url}: 页面获取失败")
                    continue
                xq_score, phone, dz, bm, yysj = result
                print(value, xq_score, phone, dz, bm, yysj)
                cursor.execute(
                    "UPDATE dzdp147 SET xq_score = :a, phone = :b, dz = :c, bm = :d, yysj = :e, status = :f WHERE a7 = :u_id",
                    a=xq_score, b=phone, c=dz, d=bm, e=yysj, f=1, u_id=url)
                con.commit()
        finally:
            cursor.close()  # close the cursor
    finally:
        con.close()  # close the database connection

# Run the crawl only when this file is executed as a script, then print a
# completion message once main() returns.
if __name__ == "__main__":
    main()
    print("爬取完成")

不多废话,代码直接放在上面了。注意其中的 cookie 已经失效,运行前记得换成你自己的。本文承接上一篇。

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值