import urllib2 import urllib import re print 'begin--' url = 'http://hr.tencent.com/position.php?lid=2175&tid=87' user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64)' headers = { 'User-Agent' : user_agent } param = {} param['lid']=2175 param['tid']=87 paramsData = urllib.urlencode(param) print url req = urllib2.Request(url,None,headers) res = urllib2.urlopen(url) page = res.read() print len(page) #print res.info() f = file('1.html','wb') f.write(page) f.close() patten_table = r'<table.*?>(.*?)</table>' tableStr = re.findall(patten_table,page,re.S|re.M) #print tableStr f_table = file('td.html','wb') print('table len=%d'%len(tableStr)) for one_table_str in tableStr: patten_table_tr = r'<tr.*?>(.*?)</tr>' all_tr_str = re.findall(patten_table_tr, one_table_str, re.S | re.M) print('tr len=%d' % len(all_tr_str)) for one_tr_td in all_tr_str: patten_table_tr_td = r'<td.*?>(.*?)</td>' all_tds = re.findall(patten_table_tr_td, one_tr_td, re.S | re.M) print('td len=%d' % len(all_tds)) f_table.write(all_tds[0]) f_table.write('aaaa\r\n') f_table.close() #print page
Python 抓取招聘信息的简单代码
最新推荐文章于 2025-04-14 22:06:21 发布