# -*- coding: utf-8 -*-
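"""Incremental scraper for the hx95.com 'Hack' news list.

Walks the paginated list pages under http://www.hx95.com/News/Hack/,
fetches every article whose trailing URL id is newer than the largest id
already stored, and inserts one (id, url, title, body, date) row per
article into the MySQL table `hxnews`. Repeats hourly. Python 2 code
(MySQLdb, print statements, unicode literals).
"""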
import random
import requests
import MySQLdb
from bs4 import BeautifulSoup
from time import sleep
class hxnews(object):
    def __init__(self):
        self.CODEC = u'utf-8'
        self.user_agent = u'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {u'User-Agent': self.user_agent}
        self.cur_url = ''
        self.title = ''
        self.body = ''
        self.cur_id = 0
        self.date = ''
        self.last_news = 0
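    # Determine where the previous run stopped: the largest urlid already stored in hxnews.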
    def init_data(self):
        cur.execute(u"SELECT MAX(urlid) FROM hxnews")
        row = cur.fetchone()
        if row is not None and row[0] is not None:
            self.last_news = row[0]
        else:
            # Empty table: seed with the trailing number of any news URL from any list page.
            self.last_news = 79335
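    # Walk the list pages in order and hand every article newer than last_news to get_content().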
    def get_list(self):
        self.init_data()
        cur_page = 1
        baseurl = u'http://www.hx95.com/News/Hack/index_%d.html'
        flag = True
        while True:
            if cur_page == 1:
                cur_url = u'http://www.hx95.com/News/Hack/index.html'
            else:
                cur_url = baseurl % cur_page
            cur_page += 1
            response = requests.get(cur_url, headers=self.headers)
            soup = BeautifulSoup(response.content)
            ul_part = soup.find('ul', class_="global_tx_list4")
            temp = ul_part.find_all('li')  # all <li> entries on this list page
            for tag in temp:
                date = tag.find('span', class_="box_r")
                link = tag.find('a', class_='title')
                cur_id = self.get_id(unicode(link.get(u'href')))
                if cur_id > self.last_news:
                    # id, link, title, publish date
                    self.get_content(cur_id, unicode(link.get(u'href')), unicode(link.string).strip(), unicode(date.string).strip())
                else:
                    flag = False
                    break
            if not flag:
                break
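    # Extract the numeric article id from an href that ends in '<id>.html'.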
    def get_id(self, href):
        filename = href.split(u'/')[-1]
        return int(filename.split(u'.')[0])
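    # Fetch one article page, pull the text out of div#article_body, then store the row.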
    def get_content(self, cur_id, href, title, date):
        self.cur_url = u'http://www.hx95.com' + href
        self.title = title.encode('utf8')
        self.cur_id = cur_id
        self.date = date
        response = requests.get(self.cur_url, headers=self.headers)
        soup = BeautifulSoup(response.content)
        div_part = soup.find(u'div', id=u'article_body')
        if div_part is not None:
            self.body = unicode(div_part.get_text()).encode('utf8')
        else:
            self.body = ''
        self.cur_url = self.cur_url.encode('utf8')
        self.insert_database()
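    # Insert the current article as one row (urlid, url, title, body, date), then pause briefly.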
    def insert_database(self):
        ptr = "INSERT INTO hxnews VALUES (%s, %s, %s, %s, %s)"
        value = [self.cur_id, self.cur_url, self.title, self.body, self.date]
        cur.execute(ptr, value)
        print self.cur_url.decode('utf8')
        sleep(random.random() * 2)  # short random pause between article requests
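# The connection and cursor are created at module level inside the __main__
# block below; the hxnews methods above use the resulting global `cur`.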
if __name__ == "__main__":
    huaxia = hxnews()
    while True:
        cxn = MySQLdb.connect(host='localhost', user='yp', passwd='333', db='bugs', charset="utf8")  # connect to the database
        cur = cxn.cursor()
        cur.execute(u'SET NAMES utf8')
        huaxia.get_list()
        print u'complete this time!'
        cxn.commit()
        cur.close()
        cxn.close()  # close the database connection
        sleep(3600)