import urllib,urllib2
import re
import string
def get_htmls(url):
    """Fetch *url* and return the raw response body.

    Uses the Python 2 ``urllib.urlopen`` API. Errors raised while opening
    or reading the URL propagate to the caller.

    Args:
        url: Address to request.

    Returns:
        The response body exactly as returned by ``response.read()``.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails part-way through.
        response.close()
def get_match(pattern, string):
    """Return the first part of *string* matched by *pattern*.

    The outcome is also echoed to stdout: the matched text on success, or
    the literal ``none`` when nothing matched.

    Args:
        pattern: Compiled regular expression (any object with ``search``).
        string: Text to search. ``None`` is treated as "no match" so that
            the result of one call can be fed straight into the next.

    Returns:
        The matched text, or ``None`` when there is no match.
    """
    # Guard against None input: a failed earlier lookup yields None, and
    # pattern.search(None) would raise TypeError.
    match = pattern.search(string) if string is not None else None
    if match is None:
        print("none")
        return None
    found = match.group()
    print(found)
    return found
def update_dict(list_oid, oid):
    """Record one more sighting of *oid* in the tally *list_oid*.

    The first sighting stores 0; every later sighting adds 1, so the value
    is the number of repeat occurrences. Progress is echoed to stdout:
    whether the key was already present, then ``have`` when it was.

    Args:
        list_oid: Dict mapping oid -> count; modified in place.
        oid: Key to record.
    """
    seen = oid in list_oid  # direct dict membership, no key list needed
    print(seen)
    if seen:
        print("have")
        # int() keeps tallies that still hold string counts working.
        list_oid[oid] = int(list_oid[oid]) + 1
    else:
        # Store an int (not '0') so every value in the tally has one type.
        list_oid[oid] = 0
if __name__ == "__main__":
    # Request the same URL repeatedly and tally which oid each response
    # carries in its stats link.
    url = 'http://api.sfefefefe'

    # Loop-invariant patterns, compiled once. Each narrows the previous
    # match: stats link -> "oid=<digits>" -> "<digits>".
    # NOTE(review): the dots in the link pattern are unescaped and match
    # any character; left as-is to keep matching behavior unchanged.
    link_pattern = re.compile(r'http://xxx.com/stats_imp.php.*?vendor_id=')
    oid_pair_pattern = re.compile(r'oid=\d+')
    digits_pattern = re.compile(r'\d+')

    list_oid = {}
    for _ in range(1, 50):
        html_temp = get_htmls(url)

        link_temp = get_match(link_pattern, html_temp)
        if link_temp is None:
            # No stats link in this response; nothing to count.
            continue

        oid_temp = get_match(oid_pair_pattern, link_temp)
        if oid_temp is None:
            # Link present but without an oid parameter.
            continue

        oid_string = get_match(digits_pattern, oid_temp)
        update_dict(list_oid, oid_string)

    # NOTE(review): the source's indentation was lost; this summary is
    # assumed to run once after the loop rather than on every iteration.
    print(list_oid.items())
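# Notes: this script mainly exercises regular expressions, loops and dictionaries.
# Background reading on url/http handling:
#   http://www.cnblogs.com/morya/archive/2011/05/12/2044904.html
# On request timeout problems:
#   http://www.cnblogs.com/wxw0813/archive/2012/09/18/2690694.html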