# Personal crawler template backup, for reference only
import json
import os
import random
import time
import requests
import re
from bs4 import BeautifulSoup
from urllib import parse
import urllib3
import httpx
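# Silence urllib3 warnings (e.g. InsecureRequestWarning, if any request is later sent with verify=False)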
urllib3.disable_warnings()
user_agent_list = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]
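# One User-Agent is picked at random from this pool for each detail request (see random.choice below).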
'''
Crawler template.
Example: scraping Bilibili's search result list.
'''
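# Overall flow implemented below:
#   1. Walk the search result pages for `key` and collect the item list.
#   2. For each item, fetch the uploader's follower count over HTTP/2 with httpx.
#   3. Fetch the video page and parse likes / coins / favourites / shares / tags with BeautifulSoup.
#   4. Append one CSV row per video; `stop_cnt` allows resuming an interrupted run.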
# Search keyword
key = 'target keyword'
base_url = 'base url'  # URL entry point of the search page
stop_cnt = 0  # resume point: rows with cnt <= stop_cnt are skipped
cnt = 1  # running row number written to the CSV
# Write the CSV header once if the output file does not exist yet
if not os.path.exists('output_' + key + '.csv'):
    with open('output_' + key + '.csv', "a+") as f:
        f.write('No.,...\n')
print('No.,...')
for num in range(1, 26):
    # Each page (num) returns 20 items, so this is the global index of the first item on the page
    cnt = 1 + 20 * (num - 1)
    # Build the page URL in a separate variable; reusing base_url here would make the query string grow every iteration
    page_url = base_url + parse.quote(key) + "&page=" + str(num)
    headers = {
        'Cookie': 'paste cookie here',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Cache-Control': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Pragma': 'no-cache',
    }
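    # The Cookie above should be copied from a logged-in browser session if the target pages require one.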
    res = requests.get(url=page_url, headers=headers)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.find_all('li', {'class': 'video-item'})
    # Parsing approach 1: grab the target content from <a class="title">...</a> directly
    titles = soup.find_all('a', {'class': 'title'})
    # Parsing approach 2: pull the value out of an inline <script> block with a regular expression
    # (the script index 7 is specific to this page layout)
    scripts = soup.find_all('script')
    aid = re.findall(r'aid":(.*?),', str(scripts[7]), re.M | re.I)[0]
    # Iterate over every result item on this page
    for i in range(len(items)):
        if cnt <= stop_cnt:
            # Already scraped in a previous run: skip it but keep the counter in sync
            cnt += 1
            continue
        # print(cnt, end=", ")
        # print(names[i].text, end=", ")
        # print(names[i]['href'])
        # Fetch the uploader info over HTTP/2
        try:
            user_url = 'second-part url' + 'second-part keyword'
            # Rotate the User-Agent on every request
            headers['user-agent'] = random.choice(user_agent_list)
            fans = ' '
            headers['sec-fetch-mode'] = 'navigate'
            headers['accept-encoding'] = 'gzip, deflate, br'
            headers['accept-language'] = 'zh-CN,zh;q=0.9'
            headers['sec-ch-ua'] = '"Chromium";v="88", "Google Chrome";v="88", ";Not A Brand";v="99"'
            headers['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
            with httpx.Client(headers=headers, http2=True) as client:
                # httpx sets the HTTP/2 pseudo-headers (:scheme, :authority, :path, :method) itself;
                # keep these values in a separate dict so the main `headers` dict is not overwritten.
                pseudo_headers = {'scheme': 'https',
                                  'authority': 'xxx.com',
                                  'path': '/x/space/acc/info?mid=' + 'keyword',
                                  'method': 'GET'}
                user_res = client.get(user_url, headers=pseudo_headers)
            # Read the follower count out of the JSON payload
            fans = str(json.loads(user_res.text)['data']['follower'])
            # print(fans, end=", ")
            # Search results use protocol-relative links, so prepend the scheme
            video_url = 'http:' + titles[i]['href']
            # print(video_url)
            res = requests.get(url=video_url, headers=headers)
            soup = BeautifulSoup(res.text, 'html.parser')
            like = soup.find_all('span', {'class': 'like'})
            coin = soup.find_all('span', {'class': 'coin'})
            collect = soup.find_all('span', {'class': 'collect'})
            share = soup.find_all('span', {'class': 'share'})
            tags = soup.find_all('li', {'class': 'tag'})
            tags = [t.text.strip() for t in tags]
            tag = '|'.join(tags)
            # Quote the title so commas inside it do not break the CSV row
            title = "\"" + titles[i].text.strip() + "\""
            lik = like[0].text.strip()
            coi = coin[0].text.strip()
            if coi == '投币':
                # The bare button label ("coin") means the count is zero
                coi = '0'
            collec = collect[0].text.strip()
            shar = share[0].text.strip()
            if shar == '分享':
                # Likewise, the bare "share" label means zero shares
                shar = '0'
            output = ','.join([str(cnt), fans, title, lik, coi, collec, shar, tag])
            print(output)
            with open('output_' + key + '.csv', "a+") as f:
                f.write(output)
                f.write("\n")
            cnt += 1
        except Exception as e:
            # Note: the original `i -= 1` here was a no-op (the for loop resets i on the next pass),
            # so a failed item is simply reported and skipped rather than retried.
            print('[error ', cnt, ']', e)
        time.sleep(0.5)
    time.sleep(0.5)
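# Optional extension (a sketch, not part of the original template): wrapping requests.get in a small
# retry helper like the hypothetical get_with_retry below would let transient network errors be
# retried instead of the row being skipped in the except branch above.
def get_with_retry(url, headers, retries=3, delay=1.0):
    """Fetch `url`, retrying up to `retries` times with a fixed pause between attempts."""
    for attempt in range(retries):
        try:
            return requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)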