htmllib.HTMLParser
#!/usr/bin/env python
import cStringIO
import formatter
from htmllib import HTMLParser
import urllib
url = "http://blog.youkuaiyun.com/Lyq3413/article/details/76577465"
user_name = "proxy_account"
passwd = "proxy_pwd"
proxy_base = "proxy.example.net:8080"
proxy = "http://%s:%s@%s" % (user_name, passwd, proxy_base)
proxies = {"http": proxy, "https": proxy}
f = urllib.urlopen(url, proxies=proxies)  # fetch through the proxy
#f = urllib.urlopen(url)  # if no proxy is needed, this plain call is enough
data = f.read()
f.close()
parser = HTMLParser(formatter.AbstractFormatter(
    formatter.DumbWriter(cStringIO.StringIO())))
parser.feed(data)
parser.close()
print parser.anchorlist
The most important aspect of how this code works is that the parser class performs no I/O of its own; it only drives a formatter object. Python provides just one useful formatter, formatter.AbstractFormatter, which processes the parsed data and delegates output to a writer object. Likewise, the only generally useful writer is formatter.DumbWriter. The writer accepts an optional file object to receive its output; if none is given, it writes to standard output, which is usually not what we want. To keep the output off standard output, we first instantiate a cStringIO object; the StringIO object simply absorbs the output.
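To see that division of labor in isolation, here is a minimal sketch (Python 2; the example.com anchor is just a placeholder): the parser hands parse events to the formatter, the formatter drives the writer, and the writer renders into whatever file-like object it was given.

import cStringIO
import formatter
from htmllib import HTMLParser

buf = cStringIO.StringIO()                 # file-like sink that absorbs the rendered text
writer = formatter.DumbWriter(buf)         # renders plain text into buf
fmt = formatter.AbstractFormatter(writer)  # turns parse events into writer calls
p = HTMLParser(fmt)
p.feed('<a href="http://example.com/">example</a>')
p.close()
print p.anchorlist    # hrefs collected by htmllib: ['http://example.com/']
print buf.getvalue()  # rendered text plus htmllib's anchor marker: 'example[1]'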
HTMLParser.HTMLParser
from HTMLParser import HTMLParser
from cStringIO import StringIO
from urllib2 import urlopen
import urllib2
from urlparse import urljoin
url = "http://blog.youkuaiyun.com/lyq3413/article/details/76577465"
user_name = "proxy_account"
passwd = "proxy_pwd"
proxy = "http://%s:%s@proxy.example.net:8080" % (user_name, passwd)
proxies = {"http": proxy, "https": proxy}
proxy_handler = urllib2.ProxyHandler(proxies)  # route matching schemes through the proxy
opener = urllib2.build_opener(proxy_handler)   # an opener that uses this handler
urllib2.install_opener(opener)                 # make it the default for urlopen()
f = urlopen(url)
data = StringIO(f.read())
f.close()
class AnchorParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        if not hasattr(self, 'data'):
            self.data = []
        for attr in attrs:
            if attr[0] == 'href':
                self.data.append(attr[1])
parser = AnchorParser()
parser.feed(data.read().decode('utf-8'))
for link in parser.data:
    print urljoin(url, link)
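One caveat: parser.data is only created the first time an <a> tag is seen, so the loop above raises AttributeError on a page with no anchors. A slightly more defensive variant (a sketch against the same Python 2 HTMLParser API) creates the list in the constructor instead:

from HTMLParser import HTMLParser

class AnchorParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)  # old-style class in Python 2, so no super()
        self.data = []             # always present, even if the page has no anchors

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            # attrs is a list of (name, value) tuples
            self.data.extend(value for name, value in attrs if name == 'href')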
BeautifulSoup
BeautifulSoup is not part of the standard library and has to be installed separately, for example with pip (this installs the BeautifulSoup 3 series, which matches the import below; the successor package is beautifulsoup4):
$ pip install BeautifulSoup
from BeautifulSoup import BeautifulSoup, SoupStrainer

# 'data' is the HTML fetched earlier; SoupStrainer limits parsing to <a> tags
links = BeautifulSoup(data, parseOnlyThese=SoupStrainer('a'))
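Putting it together, a minimal sketch (BeautifulSoup 3 on Python 2, reusing url and data from the examples above) that resolves and prints every link it finds:

from BeautifulSoup import BeautifulSoup, SoupStrainer
from urlparse import urljoin

links = BeautifulSoup(data, parseOnlyThese=SoupStrainer('a'))
for link in links:
    if link.has_key('href'):  # skip anchors without an href attribute
        print urljoin(url, link['href'])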