爬取百度糯米信息——深圳所有甜品店

最新推荐文章于 2021-06-08 14:17:40 发布

原创最新推荐文章于 2021-06-08 14:17:40 发布 · 549 阅读

0 ·

CC 4.0 BY-SA版权

爬虫专栏收录该内容

1 篇文章

订阅专栏

使用Python网络爬虫技术，通过requests库从百度糯米网站抓取深圳甜品店的详细信息，包括店铺名、评分及人均消费，并将数据整理成结构化格式存储。

最近在看崔庆才的书《Python3网络爬虫开发实战》，学习爬虫。学习了前三章后，将书中的代码执行了一遍，做了第一个爬虫。主要使用requests库，爬取百度糯米网深圳所有甜品店的店铺名、评分、人均消费和所在地区。

import re
import requests
import json
import time
from requests.exceptions import RequestException
'''网页为https://sz.nuomi.com/880-page1?#j-sort-bar,https://sz.nuomi.com/880-page2?#j-sort-bar...一共有8页，需要分页爬取，偏移量为1'''
def get_one_page(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response text when the server answers with status 200;
        ``None`` for any other status code or when the request fails
        (connection error, timeout, ...).
    """
    # Send a browser-like User-Agent instead of the default one from requests.
    # NOTE(review): the string contains typos ("KHTNML", missing spaces);
    # kept unchanged here -- confirm whether the site cares before fixing.
    headers = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebkit/537.36(KHTNML,like Gecko)Chrome/52.0.2743.116 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Timeouts are RequestException subclasses, so they land here too.
        return None
    if response.status_code != 200:
        return None
    # Force UTF-8 decoding rather than relying on the detected encoding.
    response.encoding = 'utf-8'
    return response.text

# Compiled once at import time. re.S lets '.' span newlines, because each shop
# entry in the listing markup stretches over several lines.
_SHOP_PATTERN = re.compile(
    '<li class="shop-infoo-list-item clearfix.*?h3.*?shop-infoo-list-item-title">(.*?)</h3>'
    '.*?</span><span class.*?gold.*?style.*?>(.*?)</span>'
    '.*?<a href.*?span class.*?target="_blank">.*?item-line-label">(.*?)</span>'
    '.*?item-line-label">(.*?)</span>'
    '.*?</a>.*?</li>',
    re.S,
)


def parse_one_page(html):
    """Extract shop records from the HTML of one listing page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) is accepted and produces no
            records instead of raising ``TypeError``.

    Yields:
        One dict per matched shop entry with the keys '店铺名' (shop name),
        '评分' (rating), '人均消费（元）' (average spend per person, in yuan)
        and '地区' (district), filled from the four capture groups in order.
    """
    if not html:
        return
    for name, rating, avg_cost, district in _SHOP_PATTERN.findall(html):
        yield {
            '店铺名': name,
            '评分': rating,
            '人均消费（元）': avg_cost,
            '地区': district,
        }



def write_to_file(content):
    """Append ``content`` to the output file as one JSON document per line.

    Non-ASCII characters are written as-is (not ``\\uXXXX``-escaped) and the
    file is encoded as UTF-8.
    """
    with open('百度糯米-甜品店.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')

def main(page_number):
    """Scrape one listing page: download it, print and store every record.

    Args:
        page_number: Page index inserted into the listing URL.

    A page that cannot be downloaded is reported and skipped; previously the
    ``None`` returned by ``get_one_page`` was handed to the parser, which
    crashed the whole run with ``TypeError``.
    """
    url = 'https://sz.nuomi.com/880-page' + str(page_number) + '?#j-sort-bar'
    html = get_one_page(url)
    if html is None:
        print('page %s: download failed, skipped' % page_number)
        return
    # parse_one_page is a generator; each yielded dict is written as one JSON line.
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # Walk listing pages 1 through 8, pausing one second after each page.
    for page in range(1, 9):
        main(page_number=page)
        time.sleep(1)