# -*- coding: utf-8 -*-
"""Scrape the thread list of the CSDN forum "FreeZone" (chit-chat) board.

Written out of boredom: a small program that extracts the posts of the CSDN
FreeZone board.  It relies mainly on two libraries, ``urllib`` and ``re``, and
for every post it extracts the title, the link, the score and the number of
replies.

The script runs unchanged on Python 2 and Python 3.
"""
import re
import urllib

try:
    _urlopen = urllib.urlopen  # Python 2
except AttributeError:
    from urllib.request import urlopen as _urlopen  # Python 3

# NOTE(review): the listing this script was recovered from showed the host as
# "forum.youkuaiyun.com".  That looks like an automated rewrite of "csdn.net"
# applied by the site that republished the post -- confirm before relying on it.
FORUM_URL = "http://forum.csdn.net/SList/FreeZone"

# The recovered listing had "/S" and "/d" in these patterns; the backslashes of
# the regex escapes "\S" and "\d" had been turned into forward slashes.
_TITLE_RE = re.compile(r'''title="(.+?)"''')
_HREF_RE = re.compile(r'''title=(.+?)href="(\S+?)"''')
_NUM_CELL_RE = re.compile(r'''<td>[^.]+?\d+''')
_DIGITS_RE = re.compile(r'''\d+''')
_ROW_START_RE = re.compile(r'''<tr class="(dark|light)">''')
_ROW_END_RE = re.compile(r'''</tr>''')

# How many leading matches are discarded before row starts and row ends are
# paired up.  Presumably these are header/layout rows of the page -- the values
# come from the original script and should be verified against the live markup.
_SKIPPED_ROW_STARTS = 1
_SKIPPED_ROW_ENDS = 4


def parsepost(msg):
    """Print and return the fields of one thread row.

    Args:
        msg: HTML text of a single ``<tr class="dark|light">...</tr>`` row.

    Returns:
        A ``(title, href, score, replies)`` tuple; ``title`` and ``href`` are
        strings, ``score`` and ``replies`` are ints.  The same four values are
        printed, one per line, followed by a separator line.

    Raises:
        ValueError: if the row has no ``title="..."``/``href="..."`` pair or
            does not contain exactly two numeric ``<td>`` cells.
    """
    title_match = _TITLE_RE.search(msg)
    if title_match is None:
        raise ValueError("no title attribute found in row: %r" % (msg,))
    title = title_match.group(1)
    print(title)

    # The href is only accepted when it follows a title attribute on the same
    # line, exactly as the pattern demands.
    href_match = _HREF_RE.search(msg)
    if href_match is None:
        raise ValueError("no href attribute found in row: %r" % (msg,))
    href = href_match.group(2)
    print(href)

    # A numeric cell is a <td> whose text reaches a digit run before any "."
    # character; the first such cell is the score, the second the reply count.
    nums = [cell.group() for cell in _NUM_CELL_RE.finditer(msg)]
    if len(nums) != 2:
        raise ValueError(
            "expected 2 numeric cells, found %d in row: %r" % (len(nums), msg)
        )
    score = int(_DIGITS_RE.search(nums[0]).group())
    replies = int(_DIGITS_RE.search(nums[1]).group())
    print(score)
    print(replies)
    print("============================================")
    return title, href, score, replies


def _fetch_page(url):
    """Download *url* and return its body as text, always closing the socket."""
    sock = _urlopen(url)
    try:
        raw = sock.read()
        if not isinstance(raw, str):
            # Python 3 hands back bytes.  Fall back to UTF-8 when the response
            # declares no charset -- an assumption, not something the page
            # guarantees.
            charset = sock.headers.get_content_charset() or "utf-8"
            raw = raw.decode(charset, "replace")
    finally:
        sock.close()
    return raw


def _extract_rows(source):
    """Return the HTML of every thread row found in the page *source*.

    Raises:
        ValueError: if row starts and row ends cannot be paired one-to-one.
    """
    starts = [m.start() for m in _ROW_START_RE.finditer(source)]
    ends = [m.end() for m in _ROW_END_RE.finditer(source)]
    starts = starts[_SKIPPED_ROW_STARTS:]
    ends = ends[_SKIPPED_ROW_ENDS:]
    if len(starts) != len(ends):
        raise ValueError(
            "unexpected page layout: %d row starts but %d row ends"
            % (len(starts), len(ends))
        )
    return [source[begin:end] for begin, end in zip(starts, ends)]


def main():
    """Fetch the board page, dump it, then parse and print every thread row."""
    source = _fetch_page(FORUM_URL)
    print(source)
    for row in _extract_rows(source):
        parsepost(row)


if __name__ == "__main__":
    main()