Scraping Huaxia security news and storing it in MySQL

This post walks through a Python crawler that scrapes network-security news from the Huaxia site (hx95.com). The program fetches the paginated news list, extracts each article's title, body, and publication date, and stores the results in a MySQL database, tracking the newest article id so that repeated runs only pick up new items.

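The script expects an existing hxnews table whose five columns line up with the positional INSERT it issues (numeric article id, URL, title, body, publish date). The original post does not show the schema; only the urlid column name is confirmed by the code's SELECT MAX(urlid) query. A minimal sketch that would satisfy the script, with the remaining column names and all types assumed, might look like this:

# Hypothetical DDL: except for urlid, the column names (url, title, body,
# pubdate) and all types are assumptions inferred from the crawler's
# "INSERT into hxnews values(%s,%s,%s,%s,%s)" statement, not taken from
# the original post.
import MySQLdb

cxn = MySQLdb.connect(host='localhost', user='yp', passwd='333',
                      db='bugs', charset='utf8')
cur = cxn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS hxnews (
        urlid   INT PRIMARY KEY,        -- trailing number of the article URL
        url     VARCHAR(255) NOT NULL,
        title   VARCHAR(255) NOT NULL,
        body    MEDIUMTEXT,
        pubdate VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
cxn.commit()
cxn.close()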

# -*- coding: utf-8 -*-

import random
from time import sleep

import MySQLdb
import requests
from bs4 import BeautifulSoup

class hxnews(object):

    def __init__(self):
        self.user_agent = u'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {u'User-Agent': self.user_agent}
        self.cur_url = ''    # URL of the article currently being processed
        self.title = ''      # article title
        self.body = ''       # article body text
        self.cur_id = 0      # numeric id taken from the article URL
        self.date = ''       # publish date shown on the list page
        self.last_news = ''  # largest article id already stored in MySQL

    def init_data(self):
        # Resume from the largest article id already stored, so each run
        # only fetches articles newer than the last one saved.
        cur.execute(u"SELECT MAX(urlid) FROM hxnews")
        row = cur.fetchone()
        if row is not None and row[0] is not None:
            self.last_news = row[0]
        else:
            # Empty table: seed with the trailing number of any article URL
            # taken from any list page.
            self.last_news = 79335

    def get_list(self):
        self.init_data()
        cur_page = 1
        baseurl = u'http://www.hx95.com/News/Hack/index_%d.html'
        flag = True

        while flag:
            # The first list page has no page number in its URL.
            if cur_page == 1:
                cur_url = u'http://www.hx95.com/News/Hack/index.html'
            else:
                cur_url = baseurl % cur_page
            cur_page += 1
            response = requests.get(cur_url, headers=self.headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            ul_part = soup.find('ul', class_="global_tx_list4")
            if ul_part is None:
                break  # ran past the last list page
            for tag in ul_part.find_all('li'):  # one <li> per article
                date = tag.find('span', class_="box_r")
                link = tag.find('a', class_='title')
                cur_id = self.get_id(unicode(link.get(u'href')))
                if cur_id > self.last_news:
                    # id, link, title, publish date
                    self.get_content(cur_id, unicode(link.get(u'href')),
                                     unicode(link.string).strip(),
                                     unicode(date.string).strip())
                else:
                    # Reached an article that is already stored; stop paging.
                    flag = False
                    break

    def get_id(self, href):
        # An article URL ends in '<id>.html'; return the numeric id.
        return int(href.split(u'/')[-1].split(u'.')[0])

    def get_content(self, cur_id, href, title, date):
        self.cur_url = u'http://www.hx95.com' + href
        self.title = title.encode('utf8')
        self.cur_id = cur_id
        self.date = date
        response = requests.get(self.cur_url, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        div_part = soup.find(u'div', id=u'article_body')
        if div_part is not None:
            self.body = unicode(div_part.get_text()).encode('utf8')
        else:
            self.body = ''  # article page without the expected body <div>
        self.cur_url = self.cur_url.encode('utf8')
        self.insert_database()

    def insert_database(self):
        sql = "INSERT INTO hxnews VALUES (%s, %s, %s, %s, %s)"
        value = [self.cur_id, self.cur_url, self.title, self.body, self.date]
        cur.execute(sql, value)
        print self.cur_url.decode('utf8')
        sleep(random.random() * 2)  # polite random delay between requests
        
if __name__ == "__main__":
    huaxia = hxnews()

    while True:
        # Connect to the database. `cur` is a module-level global that the
        # class methods above rely on.
        cxn = MySQLdb.connect(host='localhost', user='yp', passwd='333',
                              db='bugs', charset='utf8')
        cur = cxn.cursor()
        cur.execute(u'SET NAMES utf8')

        huaxia.get_list()
        print u'complete this time!'

        cxn.commit()  # persist this round of inserts
        cur.close()   # close the database connection
        cxn.close()

        sleep(3600)   # run once an hour
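
After a run finishes, it is easy to sanity-check what landed in the table. A quick query sketch, using the column names assumed in the hypothetical schema above:

# Print the five most recently stored articles. Column names follow the
# assumed hxnews schema sketched at the top of this post.
import MySQLdb

cxn = MySQLdb.connect(host='localhost', user='yp', passwd='333',
                      db='bugs', charset='utf8')
cur = cxn.cursor()
cur.execute("SELECT urlid, pubdate, title FROM hxnews ORDER BY urlid DESC LIMIT 5")
for urlid, pubdate, title in cur.fetchall():
    print urlid, pubdate, title
cur.close()
cxn.close()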