Python3 Crawler Template

A backup of my personal crawler template, for reference only.

import json
import os
import random
import re
import time
from urllib import parse

import httpx
import requests
import urllib3
from bs4 import BeautifulSoup

# silence urllib3 warnings (e.g. InsecureRequestWarning on unverified HTTPS)
urllib3.disable_warnings()

# pool of browser User-Agent strings, rotated per request to reduce blocking
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]

'''
Crawler template

Example: scraping Bilibili's search result list

'''
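# the search URL embeds the keyword percent-encoded via urllib.parse.quote;
# for illustration (UTF-8 percent-encoding of the placeholder keyword):
#   parse.quote('目标关键词') == '%E7%9B%AE%E6%A0%87%E5%85%B3%E9%94%AE%E8%AF%8D'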
# search keyword
key = '目标关键词'  # placeholder: replace with the target keyword

base_url = '基准url'  # placeholder: the entry-point (base) search URL

stop_cnt = 0  # resume support: rows with index <= stop_cnt are skipped

cnt = 1
# create the CSV with a header row on first run; the columns mirror the
# output row assembled below
if not os.path.exists('output_' + key + '.csv'):
    with open('output_' + key + '.csv', "a+", encoding='utf-8') as f:
        f.write('index,fans,title,like,coin,collect,share,tags\n')
print('index,fans,title,like,coin,collect,share,tags')
for num in range(1, 26):
    # each result page holds 20 items, so the running index for page num
    # starts at 1 + 20 * (num - 1): page 1 -> 1, page 2 -> 21, ...
    cnt = 1 + 20 * (num - 1)
    # build the page URL fresh each iteration; appending to base_url itself
    # would make the URL grow longer every pass
    page_url = base_url + parse.quote(key) + "&page=" + str(num)
    headers = {
        'Cookie': '此处粘贴cookie',  # placeholder: paste your Cookie here
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Cache-Control': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Pragma': 'no-cache',
    }
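    # the header set mirrors a real Chrome navigation request (Sec-Fetch-*,
    # sec-ch-ua, Accept-*) so the crawler is less likely to be flagged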

    res = requests.get(url=page_url, headers=headers)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.find_all('li', {'class': 'video-item'})
    # parsing method 1: CSS class lookup, e.g. <a class="title">target text</a>
    titles = soup.find_all('a', {'class': 'title'})
    # parsing method 2: extract the aid with a regular expression
    aid = soup.find_all('script')
    aid = re.findall(r'aid":(.*?),', str(aid[7]), re.M | re.I)[0]
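    # illustration (hypothetical snippet): if the eighth <script> tag holds
    # '..."aid":12345,...', the non-greedy pattern r'aid":(.*?),' captures
    # '12345'; the hard-coded aid[7] index depends on the page layout and
    # breaks if the script order changes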

    # iterate over the items on this page
    for i in range(len(items)):
        if cnt <= stop_cnt:
            cnt += 1
            continue
        # print(cnt, end=", ")
        # print(titles[i].text, end=", ")
        # print(titles[i]['href'])
        # fetch the uploader's follower count over HTTP/2
        try:
            # placeholders: the user-info API URL (a path like
            # '/x/space/acc/info?mid=') plus the user id
            user_url = '第二部分url' + '第二部分关键词'
            # rotate the User-Agent per detail request; the other browser
            # headers built above are reused unchanged
            headers['user-agent'] = random.choice(user_agent_list)
            fans = ' '
            # httpx supplies the HTTP/2 pseudo-headers (:scheme, :authority,
            # :path, :method) by itself; they must not be set manually, and
            # the `headers` dict is left intact for the requests.get() below
            with httpx.Client(headers=headers, http2=True) as client:
                user_res = client.get(user_url)
                # decode the JSON payload
                fans = str(user_res.json()['data']['follower'])
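                # assumes a response body shaped roughly like
                # {"data": {"follower": 12345, ...}} (values hypothetical)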
                # print(fans, end=", ")
            video_url = 'http:' + titles[i]['href']  # href is protocol-relative ('//...')
            # print(video_url)
            res = requests.get(url=video_url, headers=headers)
            soup = BeautifulSoup(res.text, 'html.parser')
            like = soup.find_all('span', {'class': 'like'})
            coin = soup.find_all('span', {'class': 'coin'})
            collect = soup.find_all('span', {'class': 'collect'})
            share = soup.find_all('span', {'class': 'share'})
            tags = soup.find_all('li', {'class': 'tag'})
            tags = [t.text.strip() for t in tags]
            tag = '|'.join(tags)

            title = "\"" + titles[i].text.strip() + "\""
            lik = like[0].text.strip()
            coi = coin[0].text.strip()
            if coi == '投币':
                coi = '0'
            collec = collect[0].text.strip()

            shar = share[0].text.strip()
            if shar == '分享':
                shar = '0'

            output = ','.join([str(cnt), fans, title, lik, coi, collec, shar, tag])
            print(output)
            with open('output_' + key + '.csv', "a+", encoding='utf-8') as f:
                f.write(output)
                f.write("\n")
            cnt += 1
        except Exception as e:
            # reassigning i cannot rewind a range-based for loop, so a failed
            # item is logged and skipped; see the retry sketch after this
            # script for one way to actually retry
            print('[error', cnt, ']', e)
        time.sleep(0.5)
    time.sleep(0.5)
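
The except branch above only logs and skips a failed item. Below is a minimal sketch of a retry helper with exponential backoff; fetch_with_retry, fetch_item, max_retries, and backoff are hypothetical names, not part of the original template:

import time


def fetch_with_retry(fetch_item, max_retries=3, backoff=1.0):
    # call fetch_item() until it returns or max_retries is exhausted;
    # the sleep doubles after every failed attempt
    for attempt in range(1, max_retries + 1):
        try:
            return fetch_item()
        except Exception as e:
            print('[retry', attempt, '/', max_retries, ']', e)
            if attempt == max_retries:
                raise
            time.sleep(backoff * 2 ** (attempt - 1))

For example, the detail-page request could be wrapped as fetch_with_retry(lambda: requests.get(video_url, headers=headers)).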

