elec信息汇总，练练手

最新推荐文章于 2024-05-02 07:39:55 发布

原创 · 最新推荐文章于 2024-05-02 07:39:55 发布 · 283 阅读

CC 4.0 BY-SA版权

汇总一下 elec 各二级单位的信息：抓取各单位网站上的名称、地址和企业简介，并写入文本文件。

#!/usr/bin/python
# -*- coding: utf-8 -*-

# 导入库
import requests
import re
# Scrape the site-group index page, then visit every second-level unit site
# listed on it, follow its "about" link, and write the unit's name, address
# and cleaned-up introduction text to a text file.
#
# Output format per unit (when everything is found):
#     <name>\n<address>\n<introduction>\n\n
# A unit whose "about" link cannot be found is skipped entirely.  A unit whose
# "about" link is found but whose address or introduction cannot be extracted
# contributes only its name line.

# Seconds to wait for any single HTTP request before giving up; without a
# timeout, requests.get() can block forever on an unresponsive site.
REQUEST_TIMEOUT = 10

# Target: the site-group index page.
# NOTE(review): the original author's note says the link was taken from the
# page source and that an "x" must be appended at the end -- presumably
# 'inde.html' is meant to read 'index.html'.  Confirm before running.
url = 'http://49.cetc.com.cn/zgdzkj/wzq/inde.html'

# Fetch the index page like a browser would and decode it as UTF-8.
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.encoding = 'utf-8'
html = response.text

# Link and display name of every second-level unit site on the index page.
# The two lists come from two independent patterns, so they are not
# guaranteed to have the same length; they are paired positionally below.
url_electricity = re.findall(r'<li><a href="(.*?)" target="_blank">', html)
name_electricity = re.findall(r'target="_blank">(.*?)</a></li>', html)

# Pattern locating the "about" link on a unit's home page.  Most sites share
# one markup; the sites at list positions 2, 5 and 7 need their own pattern.
default_about_pattern = r'<a href="(.*?)" >企业概况</a>'
about_patterns = {
    2: r'<a href="(.*?)"  target="_parent" >企业概况</a>',
    5: r'<a href="(.*?)" >关于我们</a>',
    7: r'<a href="(.*?)" target="_parent">企业概况</a>',
}

# Matches every run of characters that is NOT a digit, a CJK ideograph
# (U+4E00-U+9FA5) or common Chinese punctuation.  Compiled once, outside the
# loop, and used to strip markup and Latin text from the introduction.
non_text = re.compile(u"[^0-9\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]+")

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding; `with` guarantees the file is closed even if an error occurs.
with open('C:/untitled/python-attempt path/electricity.txt', 'w',
          encoding='utf-8') as my_text:
    # zip() stops at the shorter list, so a count mismatch between links and
    # names can no longer raise IndexError.
    for i, (site_url, site_name) in enumerate(
            zip(url_electricity, name_electricity)):
        # Download the unit's home page; one unreachable site must not
        # abort the whole run.
        try:
            response_2 = requests.get(site_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print('skipped %s: home page request failed (%s)'
                  % (site_name, err))
            continue
        response_2.encoding = 'utf-8'
        html_2 = response_2.text

        # Find the "about" link; no match means there is nothing to follow.
        url_3 = re.findall(about_patterns.get(i, default_about_pattern),
                           html_2)
        if not url_3:
            print('skipped %s: no "about" link found' % site_name)
            continue

        # The href is appended to the unit's base URL as-is.
        url_4 = site_url + url_3[0]
        my_text.write(site_name + '\n')

        try:
            response_4 = requests.get(url_4, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print('skipped details of %s: about page request failed (%s)'
                  % (site_name, err))
            continue
        response_4.encoding = 'utf-8'
        html_4 = response_4.text

        enterprise_address = re.findall(r'<p>地址：(.*?)</p>', html_4)
        enterprise_info = re.findall(r'<ul class="intro_second" opentype="page"> (.*?)<script type="text/javascript">', html_4, re.S)
        # Address and introduction are only written together.
        if not enterprise_address or not enterprise_info:
            print('skipped details of %s: address or introduction not found'
                  % site_name)
            continue

        # '微软雅黑' (a font name, presumably from inline styles) consists of
        # CJK characters the filter below would keep, so remove it first.
        intro = enterprise_info[0].replace('微软雅黑', '')
        # sub() returns a new string; the result has to be assigned back.
        intro = non_text.sub('', intro)

        my_text.write(enterprise_address[0] + '\n')
        my_text.write(intro + '\n' + '\n')

        print(intro)