Installation
pip install beautifulsoup4
It is recommended to also install lxml and use it as Beautiful Soup's parser; it is faster and more lenient than the built-in html.parser.
On Windows, go to http://www.lfd.uci.edu/~gohlke/pythonlibs/, search for a prebuilt lxml wheel, download it, and install it:
pip3 install "lxml-3.6.0-cp35-cp35m-win_amd64.whl"
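A quick smoke test to confirm both packages are usable; BeautifulSoup raises bs4.FeatureNotFound if the requested parser is not installed:

from bs4 import BeautifulSoup

# Raises bs4.FeatureNotFound here if lxml is not importable
soup = BeautifulSoup('<p>hello</p>', 'lxml')
print(soup.p.string)  # hello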
Core Methods
http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
find(name, attrs, recursive, text, **kwargs)
find_all(name, attrs, recursive, text, **kwargs)
name is the tag name, e.g. 'div'.
attrs matches the tag's attributes, e.g. id='myid'. For class, pass the class name directly as the second positional argument; class='xxx' is not valid because class is a reserved word in Python (Beautiful Soup accepts class_='xxx' instead). You can also pass True or False to filter by whether a tag has the attribute at all.
Tags can be traversed level by level with the . operator, e.g. tag.div.a.
A tag's text content is available through .string, and its attributes through get('attr_name').
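A minimal sketch of these lookups against a throwaway HTML snippet (the markup and values below are invented for illustration):

from bs4 import BeautifulSoup

html = '''
<div id="myid" class="post">
  <div><a href="/a/1" class="styled">one</a></div>
  <div><a href="/a/2">2</a></div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

soup.find('div', id='myid')        # match by attribute keyword
soup.find('div', 'post')           # match by class, passed positionally
soup.find_all('a', True)           # <a> tags that HAVE a class attribute
links = soup.find_all('a', False)  # <a> tags with NO class attribute
print(links[0].string)             # '2'   -- text via .string
print(links[0].get('href'))        # '/a/2' -- attribute via get()
print(soup.div.div.a.string)       # 'one' -- dot navigation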
Code
import sys

import requests
from bs4 import BeautifulSoup
user_root_blog = 'http://xuanzhui.iteye.com/'
# Pretend to be a regular browser so the server does not reject us
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'}
page_str = requests.get(user_root_blog, headers=headers).text

# Parse out the total number of pages
soup = BeautifulSoup(page_str, 'lxml')
# Root node of the pagination block (matched by class 'pagination')
page_div = soup.find('div', 'pagination')
total_page = 1
if page_div:
    # False keeps only <a> tags without a class attribute,
    # i.e. the bare page-number links
    page_tags = page_div.find_all('a', False)
    # Guard against anchors with no text: page.string can be None
    page_arr = [int(page.string) for page in page_tags if page.string and page.string.isdigit()]
    if page_arr:
        total_page = max(page_arr)
print('total page:', total_page)
# Parse one page of HTML and return a list of (url, title) tuples
def parse_to_get_blogs(page_str):
    soup = BeautifulSoup(page_str, 'lxml')
    # All of the title nodes in the article list
    title_tags = soup.find_all('div', 'blog_title')
    if not title_tags:
        # Return an empty list so callers can extend() the result safely
        return []
    # Drop the trailing '/' so the site-absolute hrefs concatenate cleanly
    url_pref = user_root_blog[:-1]
    return [(url_pref + tag.h3.a.get('href'), tag.h3.a.string) for tag in title_tags]
blogs = parse_to_get_blogs(page_str)
# Stop early if the first page yielded no articles
if not blogs:
    print("no valid titles")
    sys.exit(0)
# Fetch and parse the remaining pages
for i in range(2, total_page + 1):
    url = user_root_blog + '?page=' + str(i)
    print('parsing ', url)
    page_str = requests.get(url, headers=headers).text
    blogs.extend(parse_to_get_blogs(page_str))
# Write the results out; force UTF-8 so non-ASCII titles survive on Windows
with open('blogs.txt', 'w', encoding='utf-8') as f:
    for tmp in blogs:
        f.write(tmp[0])
        f.write('\n')
        f.write(tmp[1])
        f.write('\n\n')
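Each entry in blogs.txt comes out as the post URL on one line, the title on the next, and a blank line separating entries.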