Here is the full code, followed by the output:
import re
import requests
from bs4 import BeautifulSoup

def get_url(n):
    """Build the list of list-page URLs to crawl."""
    lst = []
    for i in range(n):
        ui = f"http://xiaohua.zol.com.cn/lengxiaohua/{i}.html"
        lst.append(ui)
    return lst

def get_data(ui, dic_h, dic_c):
    """Fetch one list page and parse every item into a dict of title, source and content."""
    ri = requests.get(ui, headers=dic_h, cookies=dic_c)
    soupi = BeautifulSoup(ri.text, 'lxml')
    lis = soupi.find("ul", class_="article-list").find_all("li")
    lst = []
    for li in lis:
        dic = {}
        title = li.find("a").text
        source = li.find("div", class_="article-source").span.next_sibling.text
        # Strip all whitespace from the summary text
        content = re.sub(r"\s", "", li.find("div", class_="summary-text").text)
        dic['标题'] = title      # title
        dic['来源'] = source     # source
        dic['内容'] = content    # content
        lst.append(dic)
    return lst
if __name__ == "__main__":
    urllst = get_url(10)
    dic_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
    cookies = 'ip_ck=5ceJ4vn3j7QuMjA4NTMwLjE1NzgxMzM1Nzc%3D; _ga=GA1.3.1225262396.1578133580; bdshare_firstime=1578133579985; z_pro_city=s_provice%3Dhenan%26s_city%3Dxinyang; userProvinceId=22; userCityId=274; userCountyId=0; userLocationId=98988; _gid=GA1.3.714541997.1581779195; questionnaire_close_today=1581724801; questionnaire_close_total=1; lv=1581813976; vn=7; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1581674234,1581677358,1581779195,1581813977; questionnaire_pv=1581811202; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1581813980; 0eaca02be5352ff53a4b3abd16c22bb8=bs283k1v2g1fn2bo278t%7B%7BZ%7D%7D2%7B%7BZ%7D%7Dnull; 25c963336b4e0a1c4aa78f69eb50b24c=bs283k1v2g1fn2bo278t%7B%7BZ%7D%7D2%7B%7BZ%7D%7Dnull; MyZClick_0eaca02be5352ff53a4b3abd16c22bb8=/html/body/div%5B6%5D/div/div%5B2%5D/div/a/; MyZClick_25c963336b4e0a1c4aa78f69eb50b24c=/html/body/div%5B6%5D/div/div%5B2%5D/div/a/'
    # Turn the raw "k1=v1; k2=v2; ..." cookie string into a dict for requests
    dic_cookies = {}
    for i in cookies.split("; "):
        k, v = i.split("=", 1)   # split on the first '=' only, since values may contain '='
        dic_cookies[k] = v
    data_lst = []    # successfully parsed items
    errorlst = []    # URLs that failed to be scraped
    for u in urllst:
        try:
            data_lst.extend(get_data(u, dic_headers, dic_cookies))
            print("已经爬取{}条数据".format(len(data_lst)))    # "{} items scraped so far"
        except Exception:
            errorlst.append(u)
            print("数据采集失败,网址为:", u)                   # "scraping failed, URL:"
The output is as follows:

The data in Excel:
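The step that actually writes data_lst to an Excel file is not shown above. A minimal sketch using pandas (the output file name lengxiaohua.xlsx and the use of DataFrame.to_excel are assumptions, not part of the original script) could look like this:

    import pandas as pd

    # Each dict in data_lst becomes one row; the Chinese keys
    # ('标题' = title, '来源' = source, '内容' = content) become the column headers.
    df = pd.DataFrame(data_lst)
    # Writing .xlsx files requires an engine such as openpyxl to be installed.
    df.to_excel("lengxiaohua.xlsx", index=False)

This produces a single sheet with three columns and one row per scraped joke.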