from tkinter import * # 导入窗口控件
import requests
from lxml import etree
from tkinter import ttk
from bs4 import BeautifulSoup
import webbrowser # 调用浏览器打开网页
from tkinter import messagebox # 弹出提示框
from openpyxl import Workbook
import time #延时
treedata1=[] # Module-level global: accumulates the detailed enterprise records fetched by the crawler (consumed elsewhere in the file, e.g. by the Treeview display / Excel export)
def download_song():
# 打开查询页面,得到页面总数....................................................
datas = {
"page.pageNo": "1",
"page.orderBy": "",
"page.order": "",
"province": "210000000000",
"city": "211300000000",
"registerentername": "",
"xkznum": "",
"treadname": "",
"treadcode": "",
"publishtime": ""}
url = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
r = requests.post("http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action", data=datas)
r = requests.post(url, data=datas)
html = etree.HTML(r.text)
urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5] # 找到HTML中总页数
urlpageidstr = urlpage[21:23] # 截取字符串,得到总页数
# 开始爬取所有页面数据
start_page = 1
#urlpageidstr=(int(urlpageidstr))
urlpageidstr = 2 # 调试数据暂定为2页,提高效率
messagebox.showinfo("提示", "数据正在读取请稍候。。。。")
for page in range(start_page, urlpageidstr):
content = page
# 打开查询页面,得到每个具体企业信息的链接....................................................
urlpage = content
datas = {
"page.pageNo": urlpage,
"page.orderBy": "",
"page.order": "",
"province": "210000000000",
"city": "211300000000",
"registerentername": "",
"xkznum": "",
"treadname": "",
"treadcode": "",
"publishtime": ""}
url = "http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action"
r = requests.post("http://permit.mee.gov.cn/permitExt/syssb/xkgg/xkgg!licenseInformation.action", data=datas)
r = requests.post(url, data=datas)
html = etree.HTML(r.text)
href_url = html.xpath('//table[@class="tabtd"]/tr/td/a/@href')
href_name = html.xpath('//table[@class="tabtd"]/tr')[1:]
urlpage = html.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')[5] # 找到HTML中总页数
#urlpageidstr = urlpage[21:23] # 截取字符串,得到总页数
#print(urlpageidstr)
i = 0
datasum=0 #记录爬取数据个数
# 打开查询页面,得到第N个企业的详细 信息....................................................
for href_url, roos in zip(href_url, href_name):
addurl = href_url
name = roos.xpath('./td[4]/text()')[0]
i = i + 1
addurl = addurl[39:93]
datas = {
"xkgk": "getxxgkContent",
"dataid": addurl}
url = "http://permit.mee.gov.cn/permitExt/xkgkAction!xkgk.action?xkgk=" + addurl
html = requests.get(url, headers=datas)
soup = BeautifulSoup(html.text, 'lxml')
name_id = soup.find_all('p', style="font-size:36px;")[0].text # 得到企业名称
name_add = soup.find_all('p', style="font-weight: bold;color: green;font-size: 14px;")[0].text # 得到企业地址等信息 ..strip() 属性删除空格
content = name_add
content = content.strip() # 删除字符串左边空格
content = content.split() # 拆分字符串,通过指定分隔符对字符串进行分割,默认是空格。rstrip("\xa0\xa0\xa0\xa0\r\n\t\t\t")
# content=content.partition(":")
str2 = ''.join(content)
u1, u2, u3, u4, u5 = str2.split(':', 4)
f1 = u2.find('行业类别')
f2 = u2[0:f1]
g1 = u3.find('所在地区')
g2 = u3[0:g1]
h1 = u4.find('发证机关')
h2 = u4[0:h1]
#ii = str(i)
paiwuxukebianhao=soup.find_all('table', class_="tab0")[0].text.strip
# --- Source-article metadata (non-code residue from the original blog post, kept as a comment) ---
# Title: Crawl all detailed information of pollutant discharge permits, 2020-05-28
# (最新推荐文章于 2024-07-09 21:05:07 发布 — "latest recommended article published 2024-07-09 21:05:07")