Scraping Web Pages with Beautiful Soup

This article walks through a web-scraping example built with Python's Beautiful Soup: a crawler that fetches CVE vulnerability pages and parses out key fields such as the vulnerability name and severity.

Python's Beautiful Soup is a powerful HTML parsing library and a real workhorse for web scraping.
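
As a minimal sketch of the API (the HTML fragment below is made up for illustration), parsing a document and pulling text out of tags looks like this:

```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# A made-up HTML fragment, just to demonstrate the parsing API.
html = u'<table id="cvss"><tr><td>CVSS</td><td>HIGH</td><td>9.3</td></tr></table>'

soup = BeautifulSoup(html, "lxml")   # parse with the lxml backend
for td in soup.find_all("td"):       # find_all returns every matching tag
    print(td.string)                 # .string is the tag's text content
```

`find_all` and `.string` are the two calls the scraper below leans on: one to locate tags, the other to read their text.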

The code below takes the CVE IDs listed in a cvelist.csv file and scrapes each CVE's detail page in turn; it is kept here for the record.


```python
# -*- coding: utf-8 -*-
# Python 2 script: fetch CVE detail pages and parse out name, score,
# severity and platform for each ID listed in cvelist.csv.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround for mixed str/unicode

from gevent import monkey
monkey.patch_all()  # patch sockets so urllib2 calls yield between greenlets

import os
import json
import codecs
import logging
from urllib2 import Request, urlopen, HTTPError
from bs4 import BeautifulSoup
import gevent

logging.basicConfig(level=logging.INFO)

URL = "http://cve.scap.org.cn/%s.html"

def fetchCVE(sid):
    """Download the detail page for one CVE ID and return it as unicode."""
    sid = "CVE-" + str(sid).strip()
    request_url = URL % (sid,)
    request_settings = {
        'content-type': 'text/plain',
        'Accept-Encoding': 'deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/44.0.2403.157 Safari/537.36',
    }
    req = Request(request_url, headers=request_settings)
    content = ""
    try:
        response = urlopen(req)
        content = response.read().decode('utf8')
    except HTTPError as e:
        logging.warning("HTTP error fetching %s: %s", sid, e)
    except Exception as e:
        logging.warning("failed to fetch %s: %s", sid, e)
    return content

def fetchCVEByList(sidList, sidContentPair):
    """Fetch every ID in sidList; cache each page in ./file/ and in sidContentPair."""
    length = len(sidList)
    count = 0
    for sid in sidList:
        count += 1
        logging.info("Process %d of %d", count, length)
        content = fetchCVE(sid)
        sidContentPair[sid.strip()] = content
        fp = open("./file/" + sid.strip(), "w")
        fp.write(content.encode("utf-8"))
        fp.close()

def parseTD(table):
    soup = BeautifulSoup(table, "lxml")
    return soup.find_all("td")

def getScoreAndSeverity(table):
    # The CVSS table carries severity and score in its 2nd and 3rd cells.
    tds = parseTD(table)
    if len(tds) > 2:
        return (tds[2].string, tds[1].string)
    return ("", "")

def getPlatform(table):
    # Join every non-empty cell of the CPE table, one entry per line.
    tds = parseTD(table)
    content = ""
    for td in tds:
        if td.string is not None:
            content = content + "\n" + td.string
    return content

def getSummary(summary):
    # The vulnerability name sits in the first <strong> of the summary div.
    soup = BeautifulSoup(summary, "lxml")
    strongs = soup.find_all("strong")
    return strongs[0].string

def writeCVEList(sidContentPair):
    logging.info("begin")
    length = len(sidContentPair)
    counter = 0
    sidInfoDic = {}
    for sid in sidContentPair.keys():
        counter += 1
        content = sidContentPair.get(sid)
        if content == "":
            logging.error("sid:" + sid + " content is none")
            continue
        try:
            soup = BeautifulSoup(content, "lxml")
            summary = soup.find_all("div", {'class': 'summary'})
            cvsstable = soup.find_all(id="cvss")
            cpetable = soup.find_all(id="cpe")
            (severity, score) = getScoreAndSeverity(cvsstable[0].encode("utf-8"))
            if severity == "" or score == "":
                logging.error("sid %s no score", sid)
            name = ""
            for child in summary[0].contents:
                if child.encode("utf-8").find("strong") != -1:
                    name = getSummary(child.encode("utf-8"))
            platform = getPlatform(cpetable[0].encode("utf-8"))
            sidInfoDic[sid] = [name, score.strip(), severity, platform]
            logging.info("process %d of total %d rule: SUCCEED", counter, length)
        except Exception as e:
            logging.exception(e)
            logging.info("process %d of total %d rule: FAIL, sid=%s",
                         counter, length, sid)
    # codecs.open returns a writer that accepts unicode, so ensure_ascii=False
    # is safe here (a plain open() could raise on non-ASCII in Python 2).
    fp = codecs.open("result.json", "w", "utf-8")
    json.dump(sidInfoDic, fp, ensure_ascii=False, indent=4)
    fp.close()

def dumpResult():
    """Fetch all pages concurrently with gevent, then parse them."""
    sidContentPair = {}
    fp = open("cvelist.csv", 'r')
    lines = fp.readlines()
    fp.close()
    if not os.path.isdir("./file"):
        os.makedirs("./file")  # page cache written by fetchCVEByList
    length = len(lines)
    taskPerThread = 500  # IDs handled by each greenlet
    threadNumber = (length + taskPerThread - 1) / taskPerThread  # ceil division
    threadList = []
    for i in xrange(threadNumber):
        taskBegin = i * taskPerThread
        taskEnd = min((i + 1) * taskPerThread, length)
        t = gevent.spawn(fetchCVEByList, lines[taskBegin:taskEnd], sidContentPair)
        threadList.append(t)
    gevent.joinall(threadList)
    writeCVEList(sidContentPair)

def dumpResultByFile():
    """Re-parse pages already cached under ./file/ by an earlier dumpResult() run."""
    sidContentPair = {}
    # cvelist.csv: one CVE ID per line
    fp = open("cvelist.csv", 'r')
    lines = fp.readlines()
    fp.close()
    for line in lines:
        fp = open("./file/" + line.strip(), "r")
        sidContentPair[line.strip()] = fp.read()
        fp.close()
    writeCVEList(sidContentPair)

if __name__ == '__main__':
    #dumpResult()
    dumpResultByFile()
```
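
For the record, the expected file layout: cvelist.csv holds one ID per line without the `CVE-` prefix, since `fetchCVE` prepends it, something like:

```
2017-0144
2016-5195
```

After a run, result.json maps each ID to a `[name, score, severity, platform]` list, and ./file/ holds the raw cached pages so `dumpResultByFile()` can re-parse them without re-fetching.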


In Python, BeautifulSoup is a powerful library for extracting data from HTML and XML documents, but it does not fetch anything by itself. To scrape the images from a page, you typically pair it with a library such as requests to download the page content, then use BeautifulSoup to parse the page structure. The basic steps:

1. **Install the dependencies**: make sure `requests` and `beautifulsoup4` are installed; if not, install them with pip:
   ```
   pip install requests beautifulsoup4
   ```
2. **Send an HTTP request for the page**: use requests to GET the page source:
   ```python
   import requests
   url = "http://example.com"  # replace with the page you want to scrape
   response = requests.get(url)
   html_content = response.text
   ```
3. **Parse the HTML**: parse the response body and find all the image tags `<img>`:
   ```python
   from bs4 import BeautifulSoup
   soup = BeautifulSoup(html_content, 'html.parser')
   img_tags = soup.find_all('img')  # every img element
   ```
4. **Extract the image links**: pull the `src` attribute (the image URL) out of each tag:
   ```python
   image_links = [img['src'] for img in img_tags]
   ```
5. **Download the images**: create a target directory with the `os` module and save each file with requests, handling whatever network errors come up:
   ```python
   import os

   if not os.path.exists("images"):
       os.makedirs("images")

   for link in image_links:
       try:
           response = requests.get(link)
           filename = os.path.join("images", os.path.basename(link))
           with open(filename, 'wb') as f:
               f.write(response.content)
       except Exception as e:
           print(f"Error downloading {link}: {e}")
   ```
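
One caveat the steps above gloss over: `src` attributes are often relative paths (e.g. `/static/logo.png`), so they usually need to be resolved against the page URL before downloading. A minimal fix with the standard library, reusing the `url` and `img_tags` names from the steps above:

```python
from urllib.parse import urljoin

# Resolve each (possibly relative) src against the page URL,
# skipping <img> tags that have no src attribute at all.
image_links = [urljoin(url, img['src']) for img in img_tags if img.get('src')]
```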