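# Douban Books Top 250: https://book.douban.com/top250?icn=index-book250-all
# Concepts and parts of the examples are adapted from the article
# "python数据分析之爬虫三:BeautifulSoup库爬虫实例" (Python data analysis, crawler part 3: BeautifulSoup crawler examples).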
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
def getHTMLText(url):
    """Download ``url`` and return the response body as decoded text.

    The request is sent with a browser-like ``user-agent`` header. The
    response encoding is set from the content-based guess
    (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text on success, or ``None`` when the request fails
        (connection problem, timeout, or a 4xx/5xx status); in that case
        ``'error'`` is printed.
    """
    # The user agent must travel as an HTTP header; passing it via `params`
    # would append it to the URL as a query string instead.
    headers = {'user-agent': 'chrome/10.0'}
    try:
        # A timeout keeps a stalled server from hanging the script forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here; programming errors
        # and KeyboardInterrupt are allowed to propagate.
        print('error')
        return None
    r.encoding = r.apparent_encoding
    return r.text
def parseHTML(bookinfo, html):
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', attrs={'class': 'nbg'})
titles = re.findall(r'title=\".+?\"', html)
for i in range(len(titles) - 25):
if 'title="可试读"'