1.识别网站所用技术
pip install builtwith
import builtwith
# Ask builtwith to analyse the site (this section is about identifying the
# technologies a website uses). The return value is neither stored nor
# printed here, so it is only visible in an interactive session.
builtwith.parse('http://www.youkuaiyun.com')
2.寻找网站所有者
pip install python-whois
import whois
# Query WHOIS for the domain and print whatever the lookup returns.
print(whois.whois('www.youkuaiyun.com'))
3.下载网页
import urllib2
def download(url):
    """Fetch *url* and return the response body, or None on failure.

    The URL is printed before the request is made. If urllib2 raises
    URLError, the error reason is printed and None is returned instead
    of letting the exception propagate.
    """
    print('Download: %s' % (url,))
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        # The exception class is URLError; the earlier misspelling
        # (URLDrror) raised AttributeError whenever a download failed.
        print('Download error: %s' % (e.reason,))
        html = None
    return html
# Example call: fetch the site's home page. The returned value is not
# stored or printed here.
download('http://www.youkuaiyun.com')
HTTP 状态码(含错误码)列表见 RFC 7231 第 6 节:https://tools.ietf.org/html/rfc7231#section-6
4.解析robots.txt
import robotparser
# Download and parse the site's robots.txt.
rp = robotparser.RobotFileParser()
rp.set_url('http://www.youkuaiyun.com/robots.txt')
rp.read()
# Check a URL on the same host whose robots.txt was loaded; the previous
# value pointed at a different, apparently misspelled host (www.cdsn.net).
url = 'http://www.youkuaiyun.com'
user_agent = 'BadCrawler'
# The result says whether this user agent may fetch the URL. It is not
# printed, so it is only visible in an interactive session.
rp.can_fetch(user_agent, url)
5.解析网页
(1)正则表达式:
(2)Beautiful Soup
from bs4 import BeautifulSoup
# Malformed sample markup: unquoted attribute value and unclosed <li> tags.
broken_html='<ul class=country><li>Area<li>Propulation</ul>'
soup = BeautifulSoup(broken_html, 'html.parser')
# The method is prettify(); it was misspelled as "prettifu", which is not
# a BeautifulSoup method and made this line fail at runtime.
fixed_html = soup.prettify()
print(fixed_html)