一、前言
今天介绍将HTML网页抓取下来,然后以PDF保存,废话不多说直接进入教程。
今天的例子以廖雪峰老师的Python 教程网站为例:http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000
二、准备工作
PyPDF2的安装使用(用来合并PDF): PyPDF2版本:1.25.1
https://pypi.python.org/pypi/PyPDF2/1.25.1
或
https://github.com/mstamy2/PyPDF2
安装:
pip install PyPDF2
使用示例:
1
2
3
4
5
6
7
8
9
# Minimal PyPDF2 example: concatenate two PDF files into a third one.
from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()

# The inputs must stay open until write() has run, so the output is written
# inside the same `with` block; every handle is closed on exit, even on error.
with open("hql_1_20.pdf", "rb") as input1, open("hql_21_40.pdf", "rb") as input2:
    merger.append(input1)
    merger.append(input2)

    # Write to an output PDF document
    with open("hql_all.pdf", "wb") as output:
        merger.write(output)

# Drop the merger's own references to the (now closed) streams.
merger.close()
requests、beautifulsoup 是爬虫两大神器,requests 用于网络请求,beautifulsoup 用于操作 html 数据。有了这两把梭子,干起活来利索。scrapy 这样的爬虫框架我们就不用了,这样的小程序派上它有点杀鸡用牛刀的意思。此外,既然是把 html 文件转为 pdf,那么也要有相应的库支持,wkhtmltopdf 就是一个非常好的工具,它适用于多平台,可以完成 html 到 pdf 的转换,pdfkit 是 wkhtmltopdf 的 Python 封装包。首先安装好下面的依赖包
pip install requests
pip install beautifulsoup4
pip install pdfkit
安装 wkhtmltopdf Windows平台直接在 http://wkhtmltopdf.org/downloads.html 下载稳定版的 wkhtmltopdf 进行安装,安装完成之后把该程序的执行路径加入到系统环境 $PATH 变量中,否则 pdfkit 找不到 wkhtmltopdf 就出现错误 “No wkhtmltopdf executable found”。Ubuntu 和 CentOS 可以直接用命令行进行安装
$ sudo apt-get install wkhtmltopdf # ubuntu
$ sudo yum install wkhtmltopdf # centos
三、数据准备
1.获取每篇文章的url
def get_url_list():
    """Return the absolute URL of every entry in the tutorial's menu.

    Fetches the tutorial index page, picks the second element whose class is
    "uk-nav uk-nav-side" (presumably the chapter sidebar -- verify against the
    live page) and prefixes each ``<li>`` link with the site root.

    Returns:
        list: URL strings in the order the ``<li>`` elements appear.
    """
    index_page = requests.get(
        "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
    )
    document = BeautifulSoup(index_page.content, "html.parser")
    chapter_menu = document.find_all(class_="uk-nav uk-nav-side")[1]
    return [
        "http://www.liaoxuefeng.com" + item.a.get('href')
        for item in chapter_menu.find_all("li")
    ]
2.通过文章url用模板保存每篇文章的HTML文件
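html模板:
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""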
进行保存:
def parse_url_to_html(url, name):
    """Download one article, wrap its body in ``html_template`` and save it.

    Args:
        url: address of the article page to fetch.
        name: path of the HTML file to write.

    Returns:
        ``name`` when the file was written; ``None`` when any step failed
        (the failure is logged with its traceback, not raised).
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find_all(class_="x-wiki-content")[0]

        # Insert the article title, centered, into the extracted body.
        title = soup.find('h4').get_text()
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)

        html = str(body)

        # group(1): text up to and including the opening quote of src
        # group(2): the src value itself
        # group(3): the closing quote
        pattern = r'(<img .*?src=")(.*?)(")'

        def func(m):
            # Decide on the src value (group 2). The closing quote (group 3)
            # never starts with "http", so testing it would prefix every
            # image, including those whose src is already absolute.
            if not m.group(2).startswith("http"):
                return m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
            return m.group(1) + m.group(2) + m.group(3)

        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception:
        # Deliberate best-effort: one broken page must not stop the whole run.
        logging.error("解析错误", exc_info=True)
3.把html转换成pdf
def save_pdf(htmls, file_name):
    """Convert HTML input to a PDF file through pdfkit.

    Args:
        htmls: handed unchanged to ``pdfkit.from_file`` as its input argument.
        file_name: path of the PDF to produce.
    """
    # Page geometry: Letter paper with the same margin on all four sides.
    options = {'page-size': 'Letter'}
    options['margin-top'] = '0.75in'
    options['margin-right'] = '0.75in'
    options['margin-bottom'] = '0.75in'
    options['margin-left'] = '0.75in'

    options['encoding'] = "UTF-8"

    # Header / cookie values forwarded with the options.
    options['custom-header'] = [('Accept-Encoding', 'gzip')]
    options['cookie'] = [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ]

    options['outline-depth'] = 10

    pdfkit.from_file(htmls, file_name, options=options)
4.把转换好的单个PDF合并为一个PDF
# Append every per-chapter PDF in `pdfs` to a single merger, in order.
merger = PdfFileMerger()
for i, pdf in enumerate(pdfs):
    # Passing the path (instead of an open file object) lets the merger own
    # the handle, so a later merger.close() releases it; nothing is leaked.
    merger.append(pdf)
    # `i` is this loop's own counter, so the message reports the PDF just merged.
    print(u"合并完成第" + str(i) + u'个pdf' + pdf)
完整源码:
"""Save every chapter of the liaoxuefeng.com Python tutorial as one PDF.

Pipeline: collect chapter URLs -> save each chapter as a local HTML file ->
convert each HTML file to PDF with pdfkit -> merge the PDFs with PyPDF2 ->
delete the temporary files.
"""
import os
import re
import time
import logging

import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger

# NOTE(review): the template literal was missing from this listing (the line
# read `html_template =` with no value). It is reconstructed here as a minimal
# UTF-8 page; the only requirement visible in the code is a `{content}`
# placeholder for `html_template.format(content=...)`. Confirm against the
# author's original.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""


def parse_url_to_html(url, name):
    """Download one article, wrap its body in ``html_template`` and save it.

    Args:
        url: address of the article page to fetch.
        name: path of the HTML file to write.

    Returns:
        ``name`` when the file was written; ``None`` when any step failed
        (the failure is logged with its traceback, not raised).
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find_all(class_="x-wiki-content")[0]

        # Insert the article title, centered, into the extracted body.
        title = soup.find('h4').get_text()
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)

        html = str(body)

        # group(1): text up to and including the opening quote of src
        # group(2): the src value itself
        # group(3): the closing quote
        pattern = r'(<img .*?src=")(.*?)(")'

        def func(m):
            # Decide on the src value (group 2). The closing quote (group 3)
            # never starts with "http", so testing it would prefix every
            # image, including those whose src is already absolute.
            if not m.group(2).startswith("http"):
                return m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
            return m.group(1) + m.group(2) + m.group(3)

        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception:
        # Deliberate best-effort: one broken page must not stop the whole run.
        logging.error("解析错误", exc_info=True)


def get_url_list():
    """Return the absolute URL of every entry in the tutorial's menu.

    Returns:
        list: URL strings in the order the ``<li>`` elements appear.
    """
    response = requests.get(
        "http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"
    )
    soup = BeautifulSoup(response.content, "html.parser")
    # The second element with these classes is used as the chapter menu.
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        url = "http://www.liaoxuefeng.com" + li.a.get('href')
        urls.append(url)
    return urls


def save_pdf(htmls, file_name):
    """Convert HTML input to a PDF file through pdfkit.

    Args:
        htmls: handed unchanged to ``pdfkit.from_file`` as its input argument.
        file_name: path of the PDF to produce.
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)


def main():
    """Run the whole download -> convert -> merge -> clean-up pipeline."""
    start = time.time()
    file_name = u"liaoxuefeng_Python3_tutorial"
    urls = get_url_list()

    # Track only the pages that were really written, instead of assuming a
    # fixed count of 124: the menu can grow or shrink and single pages can fail.
    saved_pages = []
    for index, url in enumerate(urls):
        html = parse_url_to_html(url, str(index) + ".html")
        if html is not None:
            saved_pages.append((index, html))

    htmls = []
    pdfs = []
    for i, html in saved_pages:
        pdf = file_name + str(i) + '.pdf'
        # Register both files before converting so the clean-up loops below
        # cover exactly the files this run created.
        htmls.append(html)
        pdfs.append(pdf)
        save_pdf(html, pdf)
        print(u"转换完成第" + str(i) + u'个html')

    merger = PdfFileMerger()
    try:
        for i, pdf in enumerate(pdfs):
            # Passing the path lets the merger own the handle; close() below
            # releases it, which os.remove() needs on Windows.
            merger.append(pdf)
            print(u"合并完成第" + str(i) + u'个pdf' + pdf)

        with open(u"廖雪峰Python_all.pdf", "wb") as output:
            merger.write(output)
    finally:
        merger.close()
    print(u"输出PDF成功!")

    for html in htmls:
        os.remove(html)
        print(u"删除临时文件" + html)

    for pdf in pdfs:
        os.remove(pdf)
        print(u"删除临时文件" + pdf)

    total_time = time.time() - start
    print(u"总共耗时:%f 秒" % total_time)


if __name__ == '__main__':
    main()

# Reposted from (转自) http://blog.youkuaiyun.com/hubaoquanu/