import time
from lxml import etree
import requests
import cx_Oracle as cx
import random
# Session cookies sent with every request made by index_load().
# These values were captured from a browser session and are expired;
# replace them with cookies from your own session before running.
cookies = {
    '_lxsdk_cuid': '18aa19bf875c8-086838f131a88c-7c54647e-144000-18aa19bf875c8',
    '_lxsdk': '18aa19bf875c8-086838f131a88c-7c54647e-144000-18aa19bf875c8',
    '_hc.v': 'fc37de4c-a51c-f6cb-1ced-12b911e2282f.1695957264',
    'WEBDFPID': 'u2vu0vw01uyw50u3y32868xz7513v0v681zwvvv7223979584w0yy40u-2011317264892-1695957264265KOIWSOEfd79fef3d01d5e9aadc18ccd4d0c95073177',
    's_ViewType': '10',
    'ctu': 'af4593d00687df8095e60c8d3aabcd27340bc693d0991a6f92a24d2e8c51db89',
    'fspop': 'test',
    # City selection: id 4 / 'guangzhou' (matches the Referer used in the request headers).
    'cy': '4',
    'cye': 'guangzhou',
    'qruuid': '3ee9aeff-bafb-4c47-9855-bca8e6c59b96',
    # NOTE(review): 'dper' looks like the login/session token -- confirm, and avoid committing real values.
    'dper': '130076071edbb043ed7c98016b5c6bd1982c10ab7b52c874c8891e6990ee4694fb1bc8c0b3409d8a2e02a632b55270b7fa037bd29e3dd7dbba7cae2d010ee1b2',
    'll': '7fd06e815b796be3df069dec7836c3df',
    '_lx_utm': 'utm_source%3Dbing%26utm_medium%3Dorganic',
    'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1697090344,1697092664,1697432446,1697442501',
    'Hm_lpvt_602b80cf8079ae6591966cc70a3940e7': '1697442512',
    '_lxsdk_s': '18b37750f73-ce1-68c-a3e%7C%7C58',
}
# Browser-like request headers (Edge 115 on Windows) sent with every request
# made by index_load().
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # No 'Cookie' header here on purpose: cookies are supplied separately
    # through the module-level `cookies` dict.
    'Pragma': 'no-cache',
    'Referer': 'https://www.dianping.com/guangzhou/ch10',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
def index_load(url, timeout=30):
    """Fetch one shop detail page and extract its basic information.

    Args:
        url: Address of the detail page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        On HTTP 200, a 5-tuple ``(xq_score, phone, dz, bm, yysj)``: detail
        rating, phone, address, alias and business hours. Any field that
        cannot be found on the page is ``'无'``.
        On any other status code the response body is printed and ``0`` is
        returned, so callers must check the result before unpacking it.

    Raises:
        requests.RequestException: If the request itself fails or times out.
    """
    response = requests.get(url=url, headers=headers, cookies=cookies, timeout=timeout)
    # Pause 5-8 seconds after every request (successful or not);
    # presumably to stay below the site's rate limiting.
    time.sleep(random.uniform(5, 8))

    if response.status_code != 200:
        print(response.text)
        return 0

    tree = etree.HTML(response.text)
    if tree is None:
        # NOTE(review): lxml may hand back None when the body holds no
        # parsable markup -- confirm. Report every field as missing, which is
        # what the previous catch-all handlers produced in that situation.
        return '无', '无', '无', '无', '无'

    xq_score = _xpath_string(tree, 'string(//*[@id="comment_score"])')  # detail rating
    phone = _xpath_string(tree, 'string(//*[@id="basic-info"]/p)')  # phone
    dz = _xpath_first(tree, '//*[@id="address"]/text()')  # address
    bm = _xpath_first(tree, '//*[@id="basic-info"]/div[4]/p[1]/span[2]/text()')  # alias
    yysj = _xpath_first(tree, '//*[@id="basic-info"]/div[4]/p[2]/span[2]/text()')  # business hours
    return xq_score, phone, dz, bm, yysj


def _xpath_string(tree, expression, default='无'):
    """Evaluate a ``string(...)`` XPath; return *default* when the text is blank."""
    value = tree.xpath(expression)
    # string() yields '' for a missing node instead of raising, so a missing
    # field has to be detected by looking for blank text.
    return value if value.strip() else default


def _xpath_first(tree, expression, default='无'):
    """Return the first item of an XPath node-set result, or *default* if it is empty."""
    matches = tree.xpath(expression)
    return matches[0] if matches else default
def main():
    """Crawl detail pages for pending rows of ``dzdp147`` and store the results.

    Prompts for an ``is_check`` value, selects every row with ``status = 0``
    and that ``is_check`` (ordered by ``a7`` descending), passes each row's
    ``a7`` to ``index_load`` as the page URL, and writes the extracted fields
    back with ``status = 1``. Each row is committed individually so progress
    survives an interrupted run.

    Rows whose page could not be fetched (``index_load`` did not return a
    tuple) are skipped and keep ``status = 0`` so a later run can retry them.

    Raises:
        ValueError: If the value typed at the prompt is not an integer.
    """
    # Placeholder credentials. Per the original note: 'a' is the database
    # (user) name, 'b' the password, and 'c' is localhost plus the database name.
    con = cx.connect('a', 'b', 'c')
    try:
        cursor = con.cursor()
        try:
            num = int(input("请输入is_check值(1):"))
            # Bind the user-supplied value instead of formatting it into the SQL text.
            cursor.execute(
                "select a1, a7 from dzdp147 where status = 0 and is_check = :num order by a7 desc",
                num=num)
            data = cursor.fetchall()
            for value, uid in data:
                result = index_load(uid)
                if not isinstance(result, tuple):
                    # index_load returns 0 (after printing the response body)
                    # on a non-200 answer; unpacking that would raise TypeError.
                    continue
                xq_score, phone, dz, bm, yysj = result
                print(value, xq_score, phone, dz, bm, yysj)
                cursor.execute(
                    "UPDATE dzdp147 SET xq_score = :a, phone = :b, dz = :c, bm = :d, yysj = :e, status = :f WHERE a7 = :u_id",
                    a=xq_score, b=phone, c=dz, d=bm, e=yysj, f=1, u_id=uid)
                con.commit()
        finally:
            cursor.close()  # always release the cursor
    finally:
        con.close()  # always release the database connection
if __name__ == "__main__":
    # Script entry point: run the crawl, then report that it has finished.
    main()
    print("爬取完成")
# 不多废话,代码已直接放在上面。注意:其中的 cookie 已经失效,运行前请换成你自己的。本篇承接上一篇。