#EE论坛爬虫#by JerryFang
#2013.11.13
import cookielib
import re
import urllib
import urllib2
import urlparse
def visitpage(url):
    """Fetch one forum page and print its HTML and the bodies of its tables.

    The page is downloaded with ``urllib2.urlopen``, decoded as GBK and
    printed in full.  The inner text of every ``<table ...>...</table>``
    element is then collected into a list, which is printed as well.
    Finally the function waits on ``raw_input`` so the output can be read
    before the caller moves on to the next page.

    Args:
        url: URL of the page to fetch.

    Returns:
        None.  All results are written to standard output.

    Raises:
        Nothing is caught here: errors from ``urllib2.urlopen`` and from
        the GBK decode propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        page = response.read().decode('gbk')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
    print(page)

    # The pattern must not require a newline after </table>: with DOTALL a
    # lazy group would otherwise run on past any closing tag that is not
    # followed by a line break and swallow the following tables.
    table_re = re.compile(u'<table.*?>(.*?)</table>', re.DOTALL)

    # NOTE: author / post-time extraction is currently disabled.  The
    # patterns intended for it were
    #   <a href=.*?class="xi2">(.*?)</a>   (author)
    #   <em id=".*?">(.*?)</em>            (time)
    tables = table_re.findall(page)
    print(tables)

    raw_input('press any key')
# Log in to the forum, download one board's thread list and visit every
# thread linked from it.
loginurl = 'http://www.eeban.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1'
forumurl = 'http://www.eeban.com/forum.php?mod=forumdisplay&fid=137'

# Cookie-aware opener.  It is installed globally so that the plain
# urllib2.urlopen() call inside visitpage() also sends the login cookies.
cj = cookielib.LWPCookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)

# Form fields for the login request, taken from a captured login POST.
# SECURITY: the credentials are hard-coded here; they should be moved out
# of the source file (environment variable, config file, prompt, ...).
user_data = {
    'username': 'nku-ada',
    'password': '1986a7da84fc50b4c0140b1124b968d5',
    'quickforward': 'yes',
    'handlekey': 'ls',
}
url_data = urllib.urlencode(user_data)
req = urllib2.Request(url=loginurl, data=url_data)

# Perform the login.  Only the cookies stored in `cj` are needed afterwards,
# so the response is closed without being read.
res = opener.open(req)
res.close()
print('Login Done!')

# Download the thread list of the board.
res = opener.open(forumurl)
print('Prepare load content')
try:
    cont = res.read().decode('gbk')
finally:
    res.close()
print('Load content')

# Find the thread titles: `b` isolates each <th> cell, `c` pulls the
# (href, link text) pair of the anchor that follows an <em>...</em> inside it.
b = re.compile(u'<th.*?>(.+?)</th>', re.DOTALL)
c = re.compile(u'<em>.*?</em>.*?<a href="(.*?)".*?>(.*?)</a>', re.DOTALL)
for cell in b.findall(cont):
    for href, title in c.findall(cell):
        print(title)
        # The href is copied verbatim from the HTML source, so '&' is still
        # written as '&amp;' and the link may be relative.  Undo the escaping
        # and resolve it against the listing URL before requesting it;
        # otherwise the server receives broken query parameters (or urlopen
        # rejects the URL outright).
        link = urlparse.urljoin(forumurl, href.replace('&amp;', '&'))
        visitpage(link)
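# NOTE: open problem - when crawling the page behind a thread-title link,
# the forum does not show the thread content to this script.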