爬取**文章的两种方法
一、方法1
Network-All-刷新网页-第0个请求:posts?page=1
# Method 1: fetch the rendered HTML page and extract article titles with BeautifulSoup.
# (DevTools: Network -> All -> refresh the page -> first request: posts?page=1)
import requests
from bs4 import BeautifulSoup

url = 'https://www.……posts?page=1'  # target URL (placeholder — fill in the real site)
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
res = requests.get(url, headers=headers)  # fetch the page HTML
bs = BeautifulSoup(res.text, 'html.parser')  # parse the HTML
# Each article title sits inside an <h2 class="ContentItem-title"> element.
titles = bs.find_all('h2', class_='ContentItem-title')
for item in titles:
    # Print the link text directly — no need to wrap it in a one-element list.
    print(item.find('a').text)
二、方法2
Network-XHR-锁定请求
# Method 2: call the site's JSON API directly (DevTools: Network -> XHR -> locate
# the request) and page through results via the offset/limit query parameters.
import requests

url1 = 'https://www.……articles'  # API endpoint (placeholder — fill in the real site)
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}  # request headers

offset = 10  # starting offset into the result set
while True:  # loop to fetch multiple pages
    params = {
        'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
        'offset': str(offset),
        'limit': '10',
        'sort_by': 'created'}
    res = requests.get(url1, headers=headers, params=params)
    articles = res.json()['data']  # the payload's 'data' field holds the article list
    for item in articles:
        print(item['title'])
    # Advance by the page size (limit=10). The original advanced by 20, which
    # silently skipped every other page of results.
    offset = offset + 10
    if offset > 30:  # stop after the desired number of pages
        break