信息爬取
import requests
from lxml import etree
import time
import json
import pandas as pd
# Scrape shop names and shop IDs from Dianping's Haikou "ch10" channel
# listing pages 1-50. (Original comment: 获取商户名称和ID — "get merchant
# name and ID".)
# NOTE(review): indentation was lost in the source paste; restored here to
# the loop structure the code implies. The block is also cut off
# mid-statement on its last line — the remainder of the inner loop body
# (and the append to `result`) is not visible in this chunk.
result = []
for i in range(1,51):
    # Progress indicator: current page number (1-based).
    print(i)
    # Listing-page URL; `page` is interpolated into the path (/p1 .. /p50).
    url = r'http://www.dianping.com/haikou/ch10/p{page}'.format(page=i)
    # Browser-like headers. The Cookie value is a session copied from a
    # logged-in browser — presumably needed to get past Dianping's
    # anti-scraping/login wall; it will expire and must be refreshed.
    headers = {
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "navCtgScroll=1; _lxsdk_cuid=17a1dc4c18e1d-0a7ec7a9c3737a-45410429-240000-17a1dc4c18fc8; _hc.v=028ffd92-97cc-7fa9-e64f-74408f8c9421.1623997072; s_ViewType=10; aburl=1; _dp.ac.v=771d9bb8-a7f2-4d76-8bf3-a13639ae2217; ctu=6f9ef9a624f2bb02349e3412ec41e048b9d1e204d4b5ace5828e7fef8adc319b; uuid=FABC533C9DFB7697EA69101F0E9BA3433B4C21FA22195EF9294F2EB256C50DBD; iuuid=FABC533C9DFB7697EA69101F0E9BA3433B4C21FA22195EF9294F2EB256C50DBD; _lxsdk=FABC533C9DFB7697EA69101F0E9BA3433B4C21FA22195EF9294F2EB256C50DBD; _ga=GA1.2.2029444719.1624263718; ua=dpuser_8723703376; fspop=test; cy=23; cye=haikou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1638601245; dper=92b45344e065dccd73f49a5f35c3199caf0700028fb482052cbdc6dd828e7788da6496adb22783938714abee41ab3b737cb23486307598f8c7321ca1afabdc689d54c6084be55dd55ade40bc9d3270e87f91e3e9a0e3df9456baff3b84e57454; ll=7fd06e815b796be3df069dec7836c3df; uamo=18508921021; dplet=51f7f8eb52ef4c18661bd3a1955e648f; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1638603180; _lxsdk_s=17d843ebc0f-d5-044-030%7C%7C193"}
    response = requests.get(url=url,headers=headers)
    # Parse the page with lxml's lenient HTML parser.
    html = etree.HTML(response.text)
    # Each shop entry is an <a data-click-name="shop_title_click"> somewhere
    # under the #shop-all-list container; the two xpath calls pull the
    # parallel lists of data-shopid and title attributes.
    shop_id = html.xpath('//*[@id="shop-all-list"]//ul//li//a[@data-click-name="shop_title_click"]/@data-shopid')
    shop_name = html.xpath('//*[@id="shop-all-list"]//ul//li//a[@data-click-name="shop_title_click"]/@title')
    # NOTE(review): this inner loop variable shadows the outer page index
    # `i`; harmless only because `i` is not used again after this point in
    # the visible code, but worth renaming.
    for i in zip(shop_id,shop_name):
        info={
        }
        # '店名' = "shop name"; i[1] is the title attribute.
        # NOTE(review): the source is TRUNCATED here — the statement below
        # is incomplete (unclosed bracket) and the rest of the loop body is
        # missing from this chunk.
        info['店名'] = i[1

该博客介绍了使用Python进行网络爬虫,从大众点评网站抓取海口地区商铺名称和ID,并尝试通过API接口获取详细信息。由于API反爬限制,作者转向解析详情页以获取地址和电话。同时,博客提到了文字反解析技术,用于处理详情页中包含的特殊字符。最后,数据被存储为Excel文件并进行了预处理。
最低0.47元/天 解锁文章
2699

被折叠的评论
为什么被折叠?



