批量网页截图自动化-CSDN博客

本文链接：https://blog.csdn.net/weixin_43776659/article/details/116127030

from selenium import webdriver
import os
import pandas as pd
import datetime

# Directory the script was launched from; the Excel input is read relative to it
# and later code builds other paths from it.
base_place = os.getcwd()

# Timestamp for this run's output folder; the time fields are separated by '-'.
the_time = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')

# Root folder for this run's screenshots: "<launch dir>/截图 <timestamp>".
# os.path.join avoids hard-coding the Windows separator.
all_files_place = os.path.join(base_place, '截图' + " " + the_time)
os.makedirs(all_files_place, exist_ok=True)

# Company names come from the "公司名" column of the workbook.
# Blank cells are read as NaN, which would make os.makedirs() raise TypeError,
# so drop them and coerce everything else to a stripped string.
companies = pd.read_excel("待查公司.xlsx")["公司名"].dropna().astype(str).str.strip()
companies = companies[companies != ""]
print(companies)

# Create one sub-folder per company inside the run folder.
# exist_ok=True keeps a duplicated company name from aborting the whole run.
os.chdir(all_files_place)
for company in companies:
    os.makedirs(company, exist_ok=True)
# Put the ChromeDriver executable in the launch directory.
# Download page: https://sites.google.com/chromium.org/driver/
chromedriver = "\\".join([base_place, "chromedriver.exe"])
# Record the driver location under the "webdriver.chrome.driver" environment key.
os.environ.update({"webdriver.chrome.driver": chromedriver})
# NOTE(review): the driver path is passed positionally, which presumably relies on
# the Selenium 3 signature; newer Selenium releases expect a Service object —
# confirm against the installed version.
browser = webdriver.Chrome(chromedriver)

# Search-URL prefixes for the sites that follow the regular pattern
# "prefix + company name (+ optional suffix)".
# Each key is "<index>.<TAB><site name>"; the dict order is the order in which
# the sites are visited.
normal_name_url = {
    "03.\t中华人民共和国国家发展和改革委员会网站":
        "https://so.ndrc.gov.cn/s?siteCode=bm04000007&ssl=1&token=&qt=",
    "05.\t国家税务总局重大税收违法案件信息公布栏网站":
        "http://www.chinatax.gov.cn/s?siteCode=bm29000002&qt=",
    "07.\t中华人民共和国生态环境部网站":
        "http://www.mee.gov.cn/qwjs2019/?searchword=",
    # Needs the suffix "&jsflIndexSeleted=" after the keyword.
    "08.\t中华人民共和国工业和信息化部网站":
        "https://www.miit.gov.cn/search/index.html?websiteid=110000000000000&pg=&p=&tpl=&category=&q=",
    # Needs the suffix "&siteid=safe&order=releasetime" after the keyword.
    "12.\t国家外汇管理局网站":
        "http://www.safe.gov.cn/safe/search/index.html?q=",
    "14.\t中国银行保险监督管理委员会网站":
        "http://www.cbirc.gov.cn/cn/view/pages/index/jiansuo.html?keyWords=",
    "17.\t中国盐业协会网站":
        "http://www.cnsalt.cn/owsc/search.htm?searchCondition=",
    "18.\t中华人民共和国国家统计局网站":
        "http://www.stats.gov.cn/was5/web/search?channelid=288041&andsen=",
    "19.\t中国电力企业联合会网站":
        "https://cec.org.cn/search/index.html?search=",
    "20.\t电力建设企业能力及信用信息查询系统网站":
        "http://credit.cepca.org.cn/Corp_list.aspx?key=",
    "21.\t国家能源局网站":
        "http://so.news.cn/was5/web/search?channelid=229767&searchword=",
    # Needs the suffix "&x=0&y=0&token=849&siteCode=bm30000012" after the keyword.
    "22.\t国家市场监督管理总局":
        "http://www.samr.gov.cn/so/s?qt=",
    # Needs the suffix "&channelid=233424&orderby=-DOCRELTIME" after the keyword.
    "24.\t中华人民共和国农业农村部网站":
        "http://www.moa.gov.cn/was5/web/search?searchword=",
    # Needs the suffix "&ext=siteId:300632&sr=score%20desc" after the keyword.
    "25.\t中华人民共和国海关总署网站":
        "http://search.customs.gov.cn/search/pcRender?pageId=f5261418ddc74f03b27e3590c531102b&q=",
    # Needs the suffix "&ty=a&ukl=&uka=&ukf=&ukt=&sl=&ts=&te=&upg=0" after the keyword.
    "26.\t中华人民共和国住房和城乡建设部网站":
        "http://search.mohurd.gov.cn/?tn=mohurd&lastq=%24wstquerystring%24&sort=last-modified+desc&rn=10&auth_info=&table_id=%24wsttableid%24&pn=0&query=",
    "28.\t全国资源公共交易平台":
        "http://www.ggzy.gov.cn/information/info/news/news.shtml?qt=",
}

# Work through one site at a time: search every company on it and take a screenshot.
# These are the regular sites.
normal_names = normal_name_url.keys()
# Query-string suffixes that must follow the company name for some sites.
# Keys must match the keys of normal_name_url exactly (including the tab),
# because the suffix is looked up by site name.
#
# Fix: the original was missing the comma after the entry for site 22 and the
# colon inside the entry for site 24, so Python silently concatenated three
# adjacent string literals into one value for site 22 and site 24 had no entry.
#
# NOTE(review): the comment next to site 26 in normal_name_url also lists a
# suffix ("&ty=a&ukl=&uka=&ukf=&ukt=&sl=&ts=&te=&upg=0") that has no entry
# here — confirm whether it was left out on purpose.
add_dict = {
    "08.\t中华人民共和国工业和信息化部网站": "&jsflIndexSeleted=",
    "12.\t国家外汇管理局网站": "&siteid=safe&order=releasetime",
    "22.\t国家市场监督管理总局": "&x=0&y=0&token=849&siteCode=bm30000012",
    "24.\t中华人民共和国农业农村部网站": "&channelid=233424&orderby=-DOCRELTIME",
    "25.\t中华人民共和国海关总署网站": "&ext=siteId:300632&sr=score%20desc",
}
# Visit every regular site for every company and save one screenshot per
# (company, site) pair as "<company>/<company>-<site name>.png".
from urllib.parse import quote  # imported here so this block is self-contained

try:
    for normal_name, base_url in normal_name_url.items():  # each regular site
        # Suffix that must follow the keyword for this site, if it has one.
        url_suffix = add_dict.get(normal_name, "")
        # Strip the leading "NN." index by splitting on the first whitespace run.
        # The original normal_name[5:] cut one character too many for the
        # 4-character "NN.<TAB>" prefix and lost the first character of the name.
        site_label = normal_name.split(None, 1)[-1]
        for company in companies:  # each company
            company = str(company)
            # Percent-encode the keyword so characters such as '&', '#' or spaces
            # in a company name cannot break the query string.
            true_url = base_url + quote(company, safe="") + url_suffix
            browser.get(true_url)  # run the search
            # Locate the company's folder, creating it if it does not exist yet.
            target_dir = os.path.join(all_files_place, company)
            os.makedirs(target_dir, exist_ok=True)
            picture_name = company + '-' + site_label + '.png'
            # save_screenshot reports an I/O failure through its return value
            # rather than raising, so surface it instead of losing it silently.
            if not browser.save_screenshot(os.path.join(target_dir, picture_name)):
                print("Failed to save screenshot:", picture_name)
finally:
    # Always shut the browser and its driver process down, even after an error.
    browser.quit()
py截图网页