import requests
import xlwt
import re
# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}
def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded page text, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are turned into None; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return None
    # Decode using the charset detected from the body content.
    r.encoding = r.apparent_encoding
    return r.text
def get_info(html, mylist):
    """Extract ranking entries from *html* and append them to *mylist*.

    Four regular expressions pull out the rank, title, author and score
    strings. The results are combined positionally, so the number of rows
    added equals the length of the shortest match list.

    Args:
        html: Page source to scan. ``None`` or an empty string adds
            nothing (a failed download yields ``None``).
        mylist: List that receives one ``[rank, name, author, score]``
            list of strings per entry; it is modified in place.
    """
    if not html:
        # Nothing to parse; re.findall would raise TypeError on None.
        return
    # Raw strings keep ``\d`` as a regex escape instead of an (invalid)
    # string escape sequence.
    ranks = re.findall(
        r'<div class="num">(\d*)</div><div class="content">', html, re.S)
    names = re.findall(
        r'target="_blank" class="title">(.*?)</a><!----><div class="detail">',
        html, re.S)
    authors = re.findall(
        r'<i class="b-icon author"></i>(.*?)</span></a>', html, re.S)
    scores = re.findall(
        r'div class="pts"><div>(\d+)</div>综合得分', html, re.S)
    mylist.extend(list(row) for row in zip(ranks, names, authors, scores))
def writeExel(mylist, path='D:/bilibiliTop100.xls'):
    """Write the collected rows to an .xls workbook.

    The first sheet row holds the column titles (rank, title, author,
    score); each element of *mylist* then fills one following row, one
    value per cell.

    Args:
        mylist: Iterable of rows, each an iterable of cell values.
        path: Destination file for the workbook. Defaults to the
            location the script has always used.
    """
    header = ['排名', '标题', '作者', '得分']
    book = xlwt.Workbook(encoding='UTF-8')
    sheet = book.add_sheet('Sheet1')
    # Row 0 is the header; data rows follow in order.
    for row_idx, row in enumerate([header, *mylist]):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx, col_idx, value)
    book.save(path)
if __name__ == '__main__':
    # Fetch the ranking page, parse its entries, and save them to a workbook.
    mylist = []
    url = 'https://www.bilibili.com/ranking?'
    html = getHtml(url)
    if html is None:
        # getHtml returns None when the download fails; stop with a clear
        # message instead of passing None on to the parser.
        raise SystemExit('Failed to download ' + url)
    get_info(html, mylist)
    writeExel(mylist)
# 简单练手:B站前100爬取
# 最新推荐文章于 2024-10-24 00:00:00 发布