# ===================== Python 3 version =====================
# -*- coding: utf-8 -*-
# author: lvfengwen
# date: 2017/10/12
# Scrapes environment-configuration info (service IPs and ports) from the
# internal docker.ql.corp management console.
#
# Fixes vs. original:
#   * get_environment_result used the module-level global ``es`` instead of
#     ``self`` — broke for any instance not named ``es``.
#   * ``urllib.request`` is now imported explicitly (``import urllib`` alone
#     does not guarantee the ``request`` submodule is loaded).
#   * Unused third-party ``import requests`` removed.
#   * Module-level driver guarded by ``__main__`` so importing has no side
#     effects; leftover numbered debug prints removed.
import gzip  # kept from original file (unused in this section)
import http.cookiejar
import re
import urllib.request


class EnvironmentSpider:
    """Logs in to the docker.ql.corp console and scrapes, for one DB host,
    the host/port of each deployed service into ``self.result_dic``.
    """

    # (result-key prefix, URL fragment identifying the service on the detail
    # page). The page renders ports as ":4<digits>-->", so the matched digits
    # are re-prefixed with "4" below.
    _SERVICES = [
        ("micro", "micro_site.tomcat"),            # mobile-loan micro site
        ("business_xs", "business_xs.tomcat"),     # credit-review API
        ("financial_web", "financial-web.netty"),  # finance API
        ("yhfq_micro", "yh-openapi.netty"),        # yinghua-installment API
        ("kdc_micro", "kdc-api.netty"),            # card-compensation API
    ]

    def __init__(self, db_ip):
        """:param db_ip: IP (suffix) of the environment to inspect."""
        # NOTE(review): credentials are hard-coded in the login URL —
        # should be moved to config/env vars.
        self.base_url = ("http://docker.ql.corp/LoginServlet?"
                         "username=lvfengwen%40mobanker.com&password=123456&action=login")
        self.db_ip = db_ip
        self.result_dic = {}  # '<service>_port' / '<service>_ip' -> str
        self.opener = None    # cookie-aware opener, built by make_fake_request

    def make_fake_request(self):
        """Log in (keeping the session cookie in memory) and return the
        environment-search page for ``self.db_ip`` as a decoded string."""
        cookie = http.cookiejar.CookieJar()
        handler = urllib.request.HTTPCookieProcessor(cookie)
        self.opener = urllib.request.build_opener(handler)
        # Login request: the session cookie is captured by the jar.
        self.opener.open(self.base_url)
        search_url = ('http://docker.ql.corp/DockerMachineServlet?ipaddress='
                      + self.db_ip + '&action=environmentSearch')
        result = self.opener.open(search_url)
        return result.read().decode('UTF-8')

    def get_detail_page(self, url_para):
        """Fetch one environment-detail page.

        :param url_para: raw query fragment scraped from the list page.
        :return: decoded page HTML.
        """
        base_url = "http://docker.ql.corp/DockerMachineServlet?action=environmentdetail&"
        detail_page = self.opener.open(base_url + url_para)
        return detail_page.read().decode('UTF-8')

    def get_server_port(self, html_str, ip):
        """Scan a detail page for every known service and record
        '<key>_port' / '<key>_ip' entries in ``self.result_dic``.

        Services absent from the page are simply skipped.
        """
        for key, marker in self._SERVICES:
            # Regex text is identical to the original per-service patterns.
            pattern = re.compile(r"<a href=.*?/" + marker + r".*?:4(\d*?)-->", re.S)
            match = pattern.search(html_str)
            if match:
                self.result_dic[key + '_port'] = "4" + match.group(1).strip()
                self.result_dic[key + '_ip'] = ip

    def get_environment_list(self, html_str):
        """Walk the environment-list page, follow each detail link and
        harvest service ports into ``self.result_dic``.

        Returns None (results are accumulated on the instance).
        """
        ip_list = re.findall(
            re.compile(r"<a href='./DockerMachineServlet.*?/>(.*?)</td>", re.S),
            html_str)
        url_para_list = re.findall(
            re.compile(r"<a href='./DockerMachineServlet\?action=environmentdetail&(.*?)/>", re.S),
            html_str)
        if not url_para_list:
            return None
        # The first row of the table is a header — skip it.
        for url_para, ip in zip(url_para_list[1:], ip_list[1:]):
            if url_para.strip() == "":
                continue
            detail_page = self.get_detail_page(url_para)
            self.get_server_port(detail_page, ip)

    def get_environment_result(self):
        """Run the full scrape: login, list page, then each detail page."""
        # BUG FIX: original called the module-level global ``es`` here.
        html_str = self.make_fake_request()
        self.get_environment_list(html_str)


if __name__ == "__main__":
    es = EnvironmentSpider("33.71")
    es.get_environment_result()
# ===================== Python 2 version =====================
#-*- coding:utf-8 -*- #author:lvfengwen #date:2017/10/12 #descript:爬取环境配置信息 import urllib import urllib2 import cookielib import re import pprint #百度贴吧爬虫类 class EnvironmentSpider: # 初始化,传入基地址,是否只看楼主的参数 def __init__(self): self.base_url = "http://docker.ql.corp/LoginServlet?username=lvfengwen%40mobanker.com&password=123456&action=login" self.filename = 'cookie.txt' # 设置保存cookie的文件,同级目录下的cookie.txt self.result_dic = {} self.opener = None def make_fake_request(self): # 声明一个CookieJar对象实例来保存cookie cookie = cookielib.CookieJar() # 利用urllib2库的HTTPCookieProcessor对象来创建cookie处理器 handler = urllib2.HTTPCookieProcessor(cookie) # 通过handler来构建opener self.opener = urllib2.build_opener(handler) self.opener.open(self.base_url) # 写入文件的一种cookie实现方式,最好不要 ----start----- # # 声明一个MozillaCookieJar对象实例来保存cookie,之后写入文件 # cookie = cookielib.MozillaCookieJar(self.filename) # self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) # # 模拟登录,并把cookie保存到变量 # result = self.opener.open(self.base_url) # # 保存cookie到cookie.txt中 # cookie.save(ignore_discard=True, ignore_expires=True) # 写入文件的一种cookie实现方式,最好不要 ----end----- # 利用cookie请求访问另一个网址,此网址是成绩查询网址 gradeUrl = 'http://docker.ql.corp/DockerMachineServlet?ipaddress=33.71&action=environmentSearch' # 请求访问成绩查询网址 result = self.opener.open(gradeUrl) # print result.read() return result.read() #获取配置信息详情页 def get_detail_page(self,url_para): base_url = "http://docker.ql.corp/DockerMachineServlet?action=environmentdetail&" target_url = base_url + url_para detail_page = self.opener.open(target_url) return detail_page.read() #在详情页面获取 配置端口信息 def get_server_port(self,html_str,ip): #找到所有服务 # #找手机贷微站 pattern = re.compile("<a href=.*?/micro_site.tomcat.*?:4(\d*?)-->", re.S) print("----") micro_data = re.search(pattern, html_str) if micro_data: micro_port = micro_data.group(1).strip() #找到手机贷微站就返回 self.result_dic['micro_port'] = "4" + str(micro_port) self.result_dic['micro_ip'] = ip #找信审接口 pattern = re.compile("<a 
href=.*?/business_xs.tomcat.*?:4(\d*?)-->", re.S) print("----") business_xs_data = re.search(pattern, html_str) print business_xs_data if business_xs_data: business_xs_port = business_xs_data.group(1).strip() #找到手机贷微站就返回 self.result_dic['business_xs_port'] = "4" + str(business_xs_port) self.result_dic['business_xs_ip'] = ip #找财务接口 pattern = re.compile("<a href=.*?/financial-web.netty.*?:4(\d*?)-->", re.S) print("----") financial_web_data = re.search(pattern, html_str) print financial_web_data if financial_web_data: financial_web_port = financial_web_data.group(1).strip() # 找到手机贷微站就返回 self.result_dic['financial_web_port'] = "4" + str(financial_web_port) self.result_dic['financial_web_ip'] = ip # 找应花分期资料接口 pattern = re.compile("<a href=.*?/yh-openapi.netty.*?:4(\d*?)-->", re.S) print("----") yhfq_micro_data = re.search(pattern, html_str) print yhfq_micro_data if yhfq_micro_data: yhfq_micro_port = yhfq_micro_data.group(1).strip() # 找到手机贷微站就返回 self.result_dic['yhfq_micro_port'] = "4" + str(yhfq_micro_port) self.result_dic['yhfq_micro_ip'] = ip # 找卡代偿资料接口 pattern = re.compile("<a href=.*?/kdc-api.netty.*?:4(\d*?)-->", re.S) print("----") kdc_micro_data = re.search(pattern, html_str) print kdc_micro_data if kdc_micro_data: kdc_micro_port = kdc_micro_data.group(1).strip() # 找到手机贷微站就返回 self.result_dic['kdc_micro_port'] = "4" + str(kdc_micro_port) self.result_dic['kdc_micro_ip'] = ip #获取环境配置跳转列表 def get_environment_list(self,html_str): pattern = re.compile("<a href='./DockerMachineServlet.*?/>(.*?)</td>", re.S) ip_list = re.findall(pattern,html_str) pattern = re.compile("<a href='./DockerMachineServlet\?action=environmentdetail&(.*?)/>", re.S) url_para_list = re.findall(pattern, html_str) print url_para_list print(1) skip_first= False print(2) if url_para_list: print(3) for url_para,ip in zip(url_para_list,ip_list): print(4) if skip_first == False: print(5) skip_first = True continue if url_para.strip() == "" : print(6) continue print 
"-----------------start---------------------" print(url_para) #深入链接去爬取 print("深入链接去爬取") detail_page = self.get_detail_page(url_para) self.get_server_port(detail_page,ip) #处理环境详细页 print "-----------------end---------------------" # pprint.pprint(self.result_dic) print(self.result_dic) else: return None def get_sjd_config(self,some_ip): pass def get_yhfq_config(self,some_ip): pass def get_kdc_config(self,some_ip): pass es = EnvironmentSpider() html_str = es.make_fake_request() print html_str es.get_environment_list(html_str)