from pyquery import PyQuery
from requests_html import HTMLSession
import json
import logging
from concurrent.futures import ProcessPoolExecutor
# Log at INFO level.  The empty ``filename`` is falsy, so basicConfig is
# expected to fall back to a stream handler rather than opening a log file.
logging.basicConfig(level=logging.INFO, filename="")

# Browser-like request headers sent with every page download.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
}
class Navigation_test(object):
    """Check that stored navigation selectors still match links on live pages.

    Site records are read from JSON files in ``data_dir``.  For each
    (URL, selector) pair the page is downloaded, the ``<a>`` elements found
    under the selector are counted and logged, and pages with zero matches
    are appended to ``failure_log``.
    """

    # Default locations; override per instance through the constructor.
    data_dir = "/home/ppwang/PycharmProjects/gitproject/analysis/data"
    failure_log = "/home/ppwang/PycharmProjects/false12512_nav.txt"

    # Sentinel meaning "use the module-level ``header`` dict".  It is resolved
    # at call time, so an explicit ``headers=None`` is still passed through
    # to the session unchanged.
    _MODULE_HEADER = object()

    def __init__(self, data_dir=None, failure_log=None):
        """Create an empty checker.

        Args:
            data_dir: directory containing the ``*.json`` input files;
                ``None`` keeps the class default.
            failure_log: file that zero-match pages are appended to;
                ``None`` keeps the class default.
        """
        # Work items: [url, nav_template, web_site, leading url-entry parts].
        self.mylist_all = []
        # URLs taken from the JS-rendered site lists.
        self.mylist_js = []
        if data_dir is not None:
            self.data_dir = data_dir
        if failure_log is not None:
            self.failure_log = failure_log

    def _load_records(self, name):
        """Return the parsed content of ``<data_dir>/<name>.json``."""
        path = "{}/{}.json".format(self.data_dir, name)
        # Read as UTF-8 explicitly so parsing does not depend on the locale.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def get_html(self, url, headers=_MODULE_HEADER):
        """Download ``url`` and return the page's HTML text.

        Args:
            url: address to fetch.
            headers: request headers; when omitted, the module-level
                ``header`` dict is used.

        The request uses a 15-second timeout.  Exceptions raised by the
        session are not caught here.
        """
        if headers is Navigation_test._MODULE_HEADER:
            headers = header
        with HTMLSession() as session:
            res = session.get(url=url, headers=headers, timeout=15)
            # JavaScript rendering (res.html.render()) for URLs listed in
            # self.mylist_js is currently disabled.
            return res.html.html

    def get_url_navtemplate(self):
        """Load ``specify_nav.json`` and append its work items to ``mylist_all``.

        Each record must provide ``info:url``, ``info:nav_template`` and
        ``info:web_site``.  URL entries and templates are paired positionally
        (extra entries on the longer side are ignored).  The last element of
        a URL entry is the address to fetch; the leading elements are kept
        for reporting (presumably category labels -- confirm against the data).

        Returns:
            ``self.mylist_all``, a list of
            ``[url, nav_template, web_site, leading_parts]`` lists.
        """
        for record in self._load_records("specify_nav"):
            pairs = zip(record["info:url"], record["info:nav_template"])
            for url_entry, nav_template in pairs:
                self.mylist_all.append(
                    [url_entry[-1], nav_template, record["info:web_site"], url_entry[:-1]]
                )
        return self.mylist_all

    def get_js_url_navtemplate(self):
        """Collect the URLs of JS-rendered pages into ``mylist_js``.

        Reads ``info_mylist_js.json`` and ``jsg_js_json.json`` and appends
        the last element of every URL entry that has a matching template.

        Returns:
            ``self.mylist_js``.
        """
        for file_name in ("info_mylist_js", "jsg_js_json"):
            for record in self._load_records(file_name):
                # zip keeps the original pairing rule: only URL entries that
                # have a corresponding template are collected.
                pairs = zip(record["info:url"], record["info:nav_template"])
                for url_entry, _nav_template in pairs:
                    self.mylist_js.append(url_entry[-1])
        return self.mylist_js

    def add_js_render_func(self):
        """Append an extra (currently empty) URL to ``mylist_js`` and return it."""
        self.mylist_js.append("")
        return self.mylist_js

    def judge_dom_content(self, html, nav_tem):
        """Return the number of ``<a>`` elements under selector ``nav_tem`` in ``html``."""
        dom = PyQuery(html)
        return dom(nav_tem)("a").length

    def get_start(self, b):
        """Check one work item and record the result.

        Args:
            b: ``[url, nav_template, web_site, leading_parts]`` as produced
                by ``get_url_navtemplate``.

        The link count is logged at INFO level; when it is zero the same
        message is appended to ``failure_log``.
        """
        html = self.get_html(b[0])
        the_num = self.judge_dom_content(html=html, nav_tem=b[1])
        message = "{},--{}--,a标签的个数为{}".format(b[2], b[3], the_num)
        logging.info(message)
        if the_num == 0:
            # The message contains non-ASCII text; write it as UTF-8 so the
            # append cannot fail on a non-UTF-8 locale.
            with open(self.failure_log, "a+", encoding="utf-8") as file:
                file.write(message + "\r\n")
if __name__ == '__main__':
    # Script entry point: load the site lists, then check every page in a
    # pool of 10 worker processes.
    n = Navigation_test()
    n.get_js_url_navtemplate()
    n.add_js_render_func()
    mylist = n.get_url_navtemplate()
    # The ``with`` block waits for all submitted work before exiting.
    with ProcessPoolExecutor(10) as ex:
        submitted = [(ex.submit(n.get_start, item), item) for item in mylist]
        for future, item in submitted:
            # exception() blocks until the task has finished.  Without this
            # check a failed download or bad record would vanish silently
            # with the discarded future.
            error = future.exception()
            if error is not None:
                logging.error("check failed for %s (%s): %r", item[2], item[0], error)
# Reposted from: https://my.oschina.net/mypeng/blog/2252430