# Scrape the saved page 'wuqu.html': find every <tr>...</tr> row, and for each
# row containing exactly four <a>...</a> links extract the name, the
# classification and the relative URL, fetch that URL and store the record.
with open('wuqu.html', 'r') as f:
    content = f.read()

# Raw strings keep the regex escapes (\w, \W) literal; the patterns themselves
# are unchanged.
rows = re.findall(r'<tr>[\w|\W]*?</tr>', content)
print(len(rows))

# re.findall always returns a list (possibly empty), never None, so no None
# checks are required. The first row is skipped, as in the original loop
# starting at index 1 -- presumably a table header; confirm.
for row in rows[1:]:
    links = re.findall(r'<a[\w|\W]*?</a>', row)
    if len(links) != 4:
        continue

    name_match = re.search(">(.*?)<", links[0])      # name
    classify_match = re.search(">(.*?)<", links[1])  # classify
    url_match = re.search('"/(.*?)"', links[3])      # url
    if name_match is None or classify_match is None or url_match is None:
        # Malformed anchor: skip the row instead of failing with
        # AttributeError on `None.group`.
        continue

    name = name_match.group(1)
    classify = classify_match.group(1)
    url = url_match.group(1)

    # The text after the last '=' in the URL is printed for inspection only;
    # the value that gets stored is the one returned by curl_fetch.
    url_type = url.rsplit("=", 1)[-1]
    print(url_type)

    # NOTE(review): the collapsed source does not show whether the handle was
    # created per row or once; per-row is assumed because the fetch uses the
    # row's url -- confirm.
    handle = get_curl()
    try:
        _rv, mid, fetched_type = curl_fetch(handle, url)
        store(mid, name, url, classify, fetched_type)
    finally:
        # Release the handle even when the fetch or the store raises.
        handle.close()