通过网页抓取有用信息
1. 正则表达式抓取:
import urllib2
import re
def scrape(html):
    """Extract the country's area from a downloaded country page using a regex.

    Looks for the table row whose id is ``places_area__row`` and returns the
    contents of the ``w2p_fw`` cell that follows it.

    Args:
        html: The page markup as a string.

    Returns:
        The raw text between the first matching ``<td class="w2p_fw">`` and
        its closing ``</td>``.

    Raises:
        IndexError: If the markup contains no matching area row.
    """
    # Raw string so that ``\s`` reaches the regex engine untouched instead of
    # being interpreted as an (invalid) string escape.
    pattern = r'<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>'
    match = re.search(pattern, html)
    if match is None:
        # Same exception type an empty findall(...)[0] would raise, but with
        # a message that says what was missing.
        raise IndexError('places_area__row not found in html')
    return match.group(1)
if __name__ == '__main__':
    # Download the United Kingdom country page; the path matches the one the
    # other examples in this file use for the same page.
    response = urllib2.urlopen('http://example.webscraping.com/places/default/view/United-Kingdom-239')
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # The parenthesised form prints the same text under the Python 2 print
    # statement.
    print(scrape(html))
2. Beautiful Soup
pip install beautifulsoup4
1)将已下载的HTML内容解析为soup文档。
2)对不完整标签进行补全
3)数据抽取
import urllib2
from bs4 import BeautifulSoup
def scrape(html):
    """Extract the country's area from a country page using Beautiful Soup.

    Parses the markup with the built-in ``html.parser`` backend, locates the
    element whose id is ``places_area__row`` and returns the text of the
    first descendant carrying the ``w2p_fw`` class.

    Args:
        html: The page markup to parse.

    Returns:
        The text content of the area cell.

    Raises:
        AttributeError: If the row or the cell is missing, because ``find``
            returns ``None`` when nothing matches.
    """
    # To inspect the parsed/repaired document while debugging, print
    # soup.prettify() here; it is not printed by default so that the function
    # only returns the extracted value.
    soup = BeautifulSoup(html, "html.parser")
    # Locate the area row by its id attribute.
    tr = soup.find(attrs={'id': 'places_area__row'})
    # 'class' is a reserved word in Python and cannot be passed as a keyword
    # argument, so the class filter is supplied through the attrs dict.
    td = tr.find(attrs={'class': 'w2p_fw'})
    # Extract the text contents of the matched tag.
    return td.text
if __name__ == '__main__':
    # Download the United Kingdom country page and print the scraped area.
    response = urllib2.urlopen('http://example.webscraping.com/places/default/view/United-Kingdom-239')
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # The parenthesised form prints the same text under the Python 2 print
    # statement.
    print(scrape(html))
3. Lxml
1) 安装lxml
sudo pip install lxml
2)安装cssselect
sudo pip install cssselect
将不合法的HTML解析为统一的合法格式
补全缺失符号
查找内容
import urllib2
import lxml.html
def scrape(html):
    """Extract the country's area from a country page using lxml.

    Parses the markup into an element tree and selects, via a CSS selector,
    the ``td.w2p_fw`` cell that is a direct child of the row whose id is
    ``places_area__row``.

    Args:
        html: The page markup to parse.

    Returns:
        The text content of the first matching area cell.

    Raises:
        IndexError: If the selector matches nothing.
    """
    tree = lxml.html.fromstring(html)
    # Select the area row: the value is returned as the area, so the selector
    # targets places_area__row like the other scrape examples in this file.
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    return td.text_content()
if __name__ == '__main__':
    # Download the United Kingdom country page and print the scraped value.
    response = urllib2.urlopen('http://example.webscraping.com/places/default/view/United-Kingdom-239')
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # The parenthesised form prints the same text under the Python 2 print
    # statement.
    print(scrape(html))