用 BeautifulSoup 解析含有汉字的网页时，要注意编码与解码问题。下面的示例获取大街网某个网页中 class='jobInfo' 的 div 标签内的数据内容：
from bs4 import BeautifulSoup
import urllib2
# Fetch one dajie.com job page and print the children of its 'jobInfo' div.
#
# NOTE(review): the original paste had lost its indentation. The nesting
# below -- in particular the `break` that stops the search after the first
# matching div -- is the most plausible reading; confirm against the intent.
c = urllib2.urlopen('http://job.dajie.com/7262fae6-a1aa-4674-9efa-3baf697faa46.html')
try:
    # Name the parser explicitly so the parse result does not depend on
    # which optional parsers happen to be installed.
    soup = BeautifulSoup(c.read(), 'html.parser')
finally:
    # Release the HTTP response even if reading or parsing fails.
    c.close()

for div in soup.find_all('div'):
    # Only a div whose class attribute is exactly 'jobInfo' is of interest.
    if div.get('class') == ['jobInfo']:
        print('find it')
        s = div.contents
        for x in s:
            # Encode as GB2312 here, not UTF-8 (presumably to match the
            # encoding of the console the output goes to -- confirm).
            # Encode once and reuse the result for the test and the output.
            encoded = x.encode('GB2312')
            # Skip line-break tags and bare newlines between the text nodes.
            if encoded not in ('<br/>', '\n'):
                print(encoded)
        # The first matching div has been handled; stop scanning.
        break