# Summary of the second-level (subsidiary) unit information for elec.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# 导入库
import requests
import re
import sys
from urllib.parse import urljoin

# Target data: the site and page to scrape.
# CETC website-group index page. Original author's note: reached by viewing the
# source of the original page and clicking the link again; "add x at the end".
INDEX_URL = 'http://49.cetc.com.cn/zgdzkj/wzq/inde.html'
# Text file that receives one record per second-level unit.
OUTPUT_PATH = 'C:/untitled/python-attempt path/electricity.txt'
# Seconds to wait on a single HTTP request; without a timeout requests.get()
# can block forever on an unresponsive site.
REQUEST_TIMEOUT = 30

# One index entry. URL and name are captured by a single pattern so the two
# values always come from the same <li> element and cannot drift out of step.
UNIT_LINK = re.compile(r'<li><a href="(.*?)" target="_blank">(.*?)</a></li>')

# Link to a unit's "company profile" page as most unit sites mark it up.
DEFAULT_PROFILE_LINK = re.compile(r'<a href="(.*?)" >企业概况</a>')
# Units whose markup differs, keyed by their position in the index list.
PROFILE_LINK_OVERRIDES = {
    2: re.compile(r'<a href="(.*?)" target="_parent" >企业概况</a>'),
    5: re.compile(r'<a href="(.*?)" >关于我们</a>'),
    7: re.compile(r'<a href="(.*?)" target="_parent">企业概况</a>'),
}

ADDRESS = re.compile(r'<p>地址:(.*?)</p>')
INTRO_BLOCK = re.compile(
    r'<ul class="intro_second" opentype="page"> (.*?)<script type="text/javascript">',
    re.S,
)
# Any run of characters that is not a digit, a CJK ideograph, or one of the
# listed full-width punctuation marks.
NON_TEXT_RUN = re.compile(
    "[^0-9\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+"
)


def fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Force the encoding instead of trusting the server's declared charset.
    response.encoding = 'utf-8'
    return response.text


def parse_unit_links(index_html):
    """Return ``[(url, name), ...]`` for every unit entry in *index_html*."""
    return UNIT_LINK.findall(index_html)


def find_profile_href(position, unit_html):
    """Return the href of the unit's profile link, or ``None`` if absent.

    *position* is the unit's index in the list from the index page; it selects
    a site-specific pattern for the units whose markup differs.
    """
    pattern = PROFILE_LINK_OVERRIDES.get(position, DEFAULT_PROFILE_LINK)
    found = pattern.search(unit_html)
    return found.group(1) if found else None


def clean_intro(raw_intro):
    """Reduce an HTML intro fragment to digits, Chinese text and punctuation."""
    # The font name is made of Chinese characters and would survive the
    # filter below, so it has to be removed explicitly first.
    without_font_name = raw_intro.replace('微软雅黑', '')
    # sub() returns a new string; the result must be used, not discarded.
    return NON_TEXT_RUN.sub('', without_font_name)


def extract_profile(profile_html):
    """Return ``(address, intro)`` from a profile page; each may be ``None``."""
    address_found = ADDRESS.search(profile_html)
    intro_found = INTRO_BLOCK.search(profile_html)
    address = address_found.group(1) if address_found else None
    intro = clean_intro(intro_found.group(1)) if intro_found else None
    return address, intro


def format_record(name, address, intro):
    """Build the text written for one unit.

    The record is the name, then the address and intro when available (one
    per line), always terminated by a blank line so records stay separated
    even when a field is missing.
    """
    lines = [name]
    if address is not None:
        lines.append(address)
    if intro is not None:
        lines.append(intro)
    return '\n'.join(lines) + '\n\n'


def main():
    """Scrape every unit listed on the index page into ``OUTPUT_PATH``.

    A failure to fetch the index page propagates. A unit that cannot be
    fetched, or that has no profile link, is reported on stderr and skipped.
    """
    units = parse_unit_links(fetch_html(INDEX_URL))
    # The context manager closes the file even if a later step raises; the
    # explicit encoding keeps the output independent of the platform locale.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
        for position, (unit_url, unit_name) in enumerate(units):
            try:
                href = find_profile_href(position, fetch_html(unit_url))
                if href is None:
                    print('skipped {}: no profile link found'.format(unit_name),
                          file=sys.stderr)
                    continue
                # urljoin resolves the href the way a browser would, so
                # absolute and root-relative links both work.
                profile_html = fetch_html(urljoin(unit_url, href))
            except requests.RequestException as exc:
                print('skipped {}: {}'.format(unit_name, exc), file=sys.stderr)
                continue
            address, intro = extract_profile(profile_html)
            out_file.write(format_record(unit_name, address, intro))
            if intro is not None:
                print(intro)


if __name__ == '__main__':
    main()