【Python Web Crawler (8)】Scraping joke data from xiaohua.zol.com.cn with requests + bs4

The full code and its output are given directly below.
import re
import requests
from bs4 import BeautifulSoup


def get_url(n):
	# Build the list of list-page URLs; page numbers on the site start from 1
	lst = []
	for i in range(1, n + 1):
		ui = f"http://xiaohua.zol.com.cn/lengxiaohua/{i}.html"
		lst.append(ui)
	return lst

def get_data(ui, dic_h, dic_c):
	# Request one list page and parse out the title, source and content of each joke
	ri = requests.get(ui, headers=dic_h, cookies=dic_c)
	soupi = BeautifulSoup(ri.text, 'lxml')
	lis = soupi.find("ul", class_="article-list").find_all("li")
	lst = []
	for li in lis:
		dic = {}
		title = li.find("a").text
		# The source name sits right after the <span> label inside the article-source div
		source = li.find("div", class_="article-source").span.next_sibling.text
		# Strip all whitespace (spaces, newlines, tabs) from the joke text
		content = re.sub(r"\s", "", li.find("div", class_="summary-text").text)
		dic['标题'] = title
		dic['来源'] = source
		dic['内容'] = content
		lst.append(dic)
	return lst


if __name__ == "__main__":
	urllst = get_url(10)
	# Browser-style User-Agent so the requests are not rejected as a script
	dic_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

	# Cookie string copied from a browser session on xiaohua.zol.com.cn
	cookies = 'ip_ck=5ceJ4vn3j7QuMjA4NTMwLjE1NzgxMzM1Nzc%3D; _ga=GA1.3.1225262396.1578133580; bdshare_firstime=1578133579985; z_pro_city=s_provice%3Dhenan%26s_city%3Dxinyang; userProvinceId=22; userCityId=274; userCountyId=0; userLocationId=98988; _gid=GA1.3.714541997.1581779195; questionnaire_close_today=1581724801; questionnaire_close_total=1; lv=1581813976; vn=7; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1581674234,1581677358,1581779195,1581813977; questionnaire_pv=1581811202; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1581813980; 0eaca02be5352ff53a4b3abd16c22bb8=bs283k1v2g1fn2bo278t%7B%7BZ%7D%7D2%7B%7BZ%7D%7Dnull; 25c963336b4e0a1c4aa78f69eb50b24c=bs283k1v2g1fn2bo278t%7B%7BZ%7D%7D2%7B%7BZ%7D%7Dnull; MyZClick_0eaca02be5352ff53a4b3abd16c22bb8=/html/body/div%5B6%5D/div/div%5B2%5D/div/a/; MyZClick_25c963336b4e0a1c4aa78f69eb50b24c=/html/body/div%5B6%5D/div/div%5B2%5D/div/a/'
	# Turn the raw cookie string into a dict for requests; split on the first '='
	# only, so values that themselves contain '=' are not truncated
	dic_cookies = {}
	for i in cookies.split("; "):
		k, v = i.split("=", 1)
		dic_cookies[k] = v

	data_lst = []   # successfully scraped records
	errorlst = []   # URLs whose pages failed to download or parse
	for u in urllst:
		try:
			data_lst.extend(get_data(u, dic_headers, dic_cookies))
			print("已经爬取{}条数据".format(len(data_lst)))
		except Exception:
			errorlst.append(u)
			print("数据采集失败,网址为:", u)

The output is:
(screenshot of the console output)
The data in Excel:
(screenshot of the scraped data in Excel)
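The export step that produces this Excel file is not shown in the script above. A minimal sketch of how it could be done, assuming pandas and openpyxl are installed (the output filename is a placeholder of my own):

import pandas as pd

# Convert the list of dicts into a DataFrame; the dict keys ('标题', '来源', '内容')
# become the column headers seen in the Excel screenshot
df = pd.DataFrame(data_lst)
df.to_excel("lengxiaohua.xlsx", index=False)  # hypothetical filename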
