# Personal crawler template backup, for reference only
import json
import os
import random
import time
import requests
import re
from bs4 import BeautifulSoup
from urllib import parse
import urllib3
import httpx
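# Silence urllib3 warnings (e.g. InsecureRequestWarning, if any request is later sent with verify=False)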
urllib3.disable_warnings()
user_agent_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]
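# One User-Agent is picked at random from this pool for each detail request (see random.choice below).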
'''
Crawler template.
Example: scraping Bilibili's search result list.
'''
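# Overall flow implemented below:
#   1. Walk the search result pages for `key` and collect the item list.
#   2. For each item, fetch the uploader's follower count over HTTP/2 with httpx.
#   3. Fetch the video page and parse likes / coins / favourites / shares / tags with BeautifulSoup.
#   4. Append one CSV row per video; `stop_cnt` allows resuming an interrupted run.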
# Search keyword
key = 'target keyword'
base_url = 'base url'  # URL entry point of the search page
stop_cnt = 0  # resume point: rows with cnt <= stop_cnt are skipped
cnt = 1  # running row number written to the CSV
# Write the CSV header once if the output file does not exist yet
if not os.path.exists('output_' + key + '.csv'):
    with open('output_' + key + '.csv', "a+") as f:
        f.write('No.,...\n')
print('No.,...')
for num in range(1, 26):
    # Each page (num) returns 20 items, so this is the global index of the first item on the page
    cnt = 1 + 20 * (num - 1)
    # Build the page URL in a separate variable; reusing base_url here would make the query string grow every iteration
    page_url = base_url + parse.quote(key) + "&page=" + str(num)
    headers = {
        'Cookie': 'paste cookie here',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Cache-Control': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Pragma': 'no-cache',
    }
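    # The Cookie above should be copied from a logged-in browser session if the target pages require one.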
    res = requests.get(url=page_url, headers=headers)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.find_all('li', {'class': 'video-item'})
    # Parsing approach 1: grab the target content from <a class="title">...</a> directly
    titles = soup.find_all('a', {'class': 'title'})
    # Parsing approach 2: pull the value out of an inline <script> block with a regular expression
    # (the script index 7 is specific to this page layout)
    scripts = soup.find_all('script')
    aid = re.findall(r'aid":(.*?),', str(scripts[7]), re.M | re.I)[0]
    # Iterate over every result item on this page
    for i in range(len(items)):
        if cnt <= stop_cnt:
            # Already scraped in a previous run: skip it but keep the counter in sync
            cnt += 1
            continue
        # print(cnt, end=", ")
        # print(names[i].text, end=", ")
        # print(names[i]['href'])
        # Fetch the uploader info over HTTP/2
        try:
            user_url = 'second-part url' + 'second-part keyword'
            # Rotate the User-Agent on every request
            headers['user-agent'] = random.choice(user_agent_list)
            fans = ' '
            headers['sec-fetch-mode'] = 'navigate'
            headers['accept-encoding'] = 'gzip, deflate, br'
            headers['accept-language'] = 'zh-CN,zh;q=0.9'
            headers['sec-ch-ua'] = '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"'
            headers['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
            with httpx.Client(headers=headers, http2=True) as client:
                # httpx sets the HTTP/2 pseudo-headers (:scheme, :authority, :path, :method) itself;
                # keep these values in a separate dict so the main `headers` dict is not overwritten.
                pseudo_headers = {'scheme': 'https',
                                  'authority': 'xxx.com',
                                  'path': '/x/space/acc/info?mid=' + 'keyword',
                                  'method': 'GET'}
                user_res = client.get(user_url, headers=pseudo_headers)
            # Read the follower count out of the JSON payload
            fans = str(json.loads(user_res.text)['data']['follower'])
            # print(fans, end=", ")
            # Search results use protocol-relative links, so prepend the scheme
            video_url = 'http:' + titles[i]['href']
            # print(video_url)
            res = requests.get(url=video_url, headers=headers)
            soup = BeautifulSoup(res.text, 'html.parser')
            like = soup.find_all('span', {'class': 'like'})
            coin = soup.find_all('span', {'class': 'coin'})
            collect = soup.find_all('span', {'class': 'collect'})
            share = soup.find_all('span', {'class': 'share'})
            tags = soup.find_all('li', {'class': 'tag'})
            tags = [t.text.strip() for t in tags]
            tag = '|'.join(tags)
            # Quote the title so commas inside it do not break the CSV row
            title = "\"" + titles[i].text.strip() + "\""
            lik = like[0].text.strip()
            coi = coin[0].text.strip()
            if coi == '投币':
                # The bare button label ("coin") means the count is zero
                coi = '0'
            collec = collect[0].text.strip()
            shar = share[0].text.strip()
            if shar == '分享':
                # Likewise, the bare "share" label means zero shares
                shar = '0'
            output = ','.join([str(cnt), fans, title, lik, coi, collec, shar, tag])
            print(output)
            with open('output_' + key + '.csv', "a+") as f:
                f.write(output)
                f.write("\n")
            cnt += 1
        except Exception as e:
            # Note: the original `i -= 1` here was a no-op (the for loop resets i on the next pass),
            # so a failed item is simply reported and skipped rather than retried.
            print('[error ', cnt, ']', e)
        time.sleep(0.5)
    time.sleep(0.5)
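# Optional extension (a sketch, not part of the original template): wrapping requests.get in a small
# retry helper like the hypothetical get_with_retry below would let transient network errors be
# retried instead of the row being skipped in the except branch above.
def get_with_retry(url, headers, retries=3, delay=1.0):
    """Fetch `url`, retrying up to `retries` times with a fixed pause between attempts."""
    for attempt in range(retries):
        try:
            return requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)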