更新:之前代码是用 python2 写的,有关 python3 的代码可以参考这位博主的:https://blog.youkuaiyun.com/baidu_28479651/article/details/76158051
代码如下:
# coding = UTF-8
# 爬取李东风PDF文档,网址:http://www.math.pku.edu.cn/teachers/lidf/docs/textrick/index.htm
import urllib.request
import re
import os
# open the url and read
def getHtml(url):
page = urllib.request.urlopen(url)
html = page.read()
page.close()
return html
# compile the regular expressions and find
# all stuff we need
def getUrl(html):
reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)'
url_re = re.compile(reg)
url_lst = url_re.findall(html.decode('gb2312'))
return(url_lst)
def getFile(url):
file_name = url.split('/')[-1]
u = urllib.request.urlopen(url)
f