CSDN个人博客阅读评论信息的爬取
用来爬取CSDN上个人博客的信息,包括阅读次数、评论数等。
因为CSDN更换了界面,原来的爬虫已经失效,所以我重新写了一个爬虫,可以精准地爬取到每篇文章阅读次数和评论次数的变化,并将总结出来的信息发送到邮箱里。
项目中有两个文件:csdn_old.py是旧版CSDN界面的爬虫,使用了BeautifulSoup来爬取信息,它的功能也是将每日博客信息的变化值总结下来发送到邮箱里;csdn_new.py是新版的爬虫,全部使用re来提取信息,并添加了评论次数的检测。
代码说明:
1.需要配置对应的数据库文件,因为里面保存了博客前一日的全部信息
2.需要发送邮件,必须有对应的邮箱账号和密码才能使用
3.我的代码是放到服务器上每天定时运行的,没有服务器的同学可以联系我,也可以放到我的服务器上。
实现效果

csdn_new.py
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib2
import re
import MySQLdb
import datetime
import time
import smtplib
from email.mime.text import MIMEText
from email.header import Header
def email(text, toemail):
    """Send the daily report as a plain-text UTF-8 e-mail.

    Args:
        text: Body of the message.
        toemail: Recipient address.

    Raises:
        smtplib.SMTPException: If login or delivery fails. The connection
            is closed before the exception propagates.

    The sender account and password are blank placeholders here and have
    to be filled in before the login can succeed.
    """
    sender = ""
    password = ""
    receiver = toemail
    smtpserver = 'smtp.exmail.qq.com'
    data_time = time.strftime("%Y-%m-%d")
    subject = data_time + "日的优快云博客报告"

    msg = MIMEText(text, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['from'] = sender
    msg['to'] = receiver

    # SMTP_SSL connects in its constructor when it is given a host, so no
    # explicit connect() call is needed afterwards.
    smtp = smtplib.SMTP_SSL(smtpserver, 465)
    try:
        smtp.login(sender, password)
        smtp.sendmail(sender, receiver, msg.as_string())
        smtp.quit()
    finally:
        # close() is a no-op after a successful quit(); on failure it
        # releases the socket without sending further commands.
        smtp.close()
    print(data_time + "的邮件发送成功!")
def DownLoad_Html(url):
    """Fetch *url* and return the response body.

    Args:
        url: Address to request.

    Returns:
        The body read from the response, or None when the request fails
        with urllib2.URLError (the failure is printed, not raised).
    """
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            response.close()
    except urllib2.URLError as e:
        print("error")
        # Only HTTPError carries a status code; a plain URLError (DNS
        # failure, refused connection, ...) has just a reason.
        if hasattr(e, 'code'):
            print(e.code)
        print(e.reason)
        html = None
    return html
def operator_SQL(flag, update=None):
    """Run one of the predefined statements against the csdn_article table.

    Args:
        flag: Selects the operation.
            1 - insert a row: a generated row number (current row count
                plus one), then update[0]..update[3], then 0. The values
                presumably follow the table's column order; verify
                against the schema.
            2 - set article_read to update[1] and change_read to the last
                element of update, for the article id in update[0].
            3 - return article_read for the article id given as update,
                or -1 when no row exists.
            4 - return article_comment for the article id given as
                update, or 0 when no row exists.
            5 - set article_comment to update[1] for the article id in
                update[0].
            Any other value runs no statement.
        update: Argument of the selected operation, as described above.

    Returns:
        The looked-up value for flags 3 and 4, otherwise None.

    Raises:
        MySQLdb.Error: If neither set of credentials can connect or a
            statement fails.
    """
    settings = dict(host='127.0.0.1', port=3306, db='test', charset='utf8')
    try:
        conn = MySQLdb.connect(user='root', passwd='root', **settings)
    except MySQLdb.Error:
        # Fall back to the blank-credential account when root is refused.
        conn = MySQLdb.connect(user='', passwd='', **settings)

    try:
        cur = conn.cursor()
        try:
            result = None
            # Every value is passed as a query parameter so that scraped
            # text (e.g. a title containing quotes) cannot alter the SQL.
            if flag == 1:
                cur.execute("select count(*) from csdn_article")
                row_count = cur.fetchone()[0]
                cur.execute(
                    "insert into csdn_article values (%s, %s, %s, %s, %s, %s)",
                    (row_count + 1, update[0], update[1], update[2],
                     update[3], 0))
            elif flag == 2:
                # The change in reads is taken from the last element, so
                # it is found whether or not the caller also includes the
                # comment count ahead of it.
                cur.execute(
                    "update csdn_article set article_read = %s,"
                    " change_read = %s where article_id = %s",
                    (update[1], update[-1], update[0]))
            elif flag == 3:
                cur.execute(
                    "select article_read from csdn_article"
                    " where article_id = %s", (update,))
                row = cur.fetchone()
                result = row[0] if row else -1
            elif flag == 4:
                cur.execute(
                    "select article_comment from csdn_article"
                    " where article_id = %s", (update,))
                row = cur.fetchone()
                result = row[0] if row else 0
            elif flag == 5:
                cur.execute(
                    "update csdn_article set article_comment = %s"
                    " where article_id = %s", (update[1], update[0]))
        finally:
            cur.close()
        conn.commit()
        return result
    finally:
        # Runs on every path, including the lookups that return a value.
        conn.close()
def main():
    """Crawl the article list, record changes and mail the daily report.

    Pages are requested one article at a time. For each article the read
    and comment counts are compared with the values stored through
    operator_SQL; new articles are inserted, changed counts are updated
    and a line describing the change is added to the report. The crawl
    stops at the first page that cannot be downloaded, is empty, or does
    not contain an article. The report is printed and then e-mailed.
    """
    starttime = datetime.datetime.now()
    report_lines = []
    comment_num = 0
    read_num = 0
    for i in range(1, 10000):
        # NOTE(review): the host looks like a garbled "blog.csdn.net" -
        # confirm the intended address before relying on this URL.
        url = 'http://blog.youkuaiyun.com/rain_web/svc/getarticles?pageindex=' + str(i) + '&pagesize=1&categoryId=0&'
        html = DownLoad_Html(url)
        # DownLoad_Html returns None on a failed request, so test for
        # "nothing usable" rather than only for the empty string.
        if not html:
            break
        titles = re.findall('_blank">(.*?)<', html)
        counts = re.findall('span>(.*?)<', html)
        ids = re.findall('details/(.*?)"', html)
        # A page without a title, an id and both counters holds no
        # article, which is treated as the end of the list.
        if not titles or not ids or len(counts) < 2:
            break
        article_title = titles[0]
        article_read = int(counts[0])
        article_comment = int(counts[1])
        article_id = ids[0]

        stored_read = operator_SQL(3, article_id)
        if stored_read == -1:
            # Not in the table yet: store it with its current counters.
            operator_SQL(
                1, [article_id, article_read, article_title, article_comment])
        elif stored_read != article_read:
            read_delta = article_read - stored_read
            # The change in reads goes in the fourth slot, which is the
            # value operator_SQL writes to change_read.
            operator_SQL(
                2, [article_id, article_read, article_title, read_delta])
            report_lines.append(
                '文章:' + article_title + ':总阅读次数为:' + str(article_read)
                + ';比昨天增加了:' + str(read_delta) + '次。' + '\n')
            read_num = read_num + 1

        stored_comment = operator_SQL(4, article_id)
        if stored_comment != article_comment:
            operator_SQL(5, [article_id, article_comment])
            comment_num = comment_num + 1
            report_lines.append(
                '文章:' + article_title + ':评论次数为:' + str(article_comment)
                + ";比昨天增加了:" + str(article_comment - stored_comment)
                + '次。' + '\n')

    text = "您今天的优快云博客信息报告如下:" + '\n今日有' + str(read_num) + \
        '篇文章的阅读数发生了变化\n' + '今日有' + str(comment_num) + \
        '篇文章的评论数发生了变化\n' + ''.join(report_lines)
    endtime = datetime.datetime.now()
    text = text + '\n本次爬虫运行时间为:' + str((endtime - starttime).seconds) + '秒'
    print(text)
    email(text, 'nylrain@163.com')
# Start the crawl only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
更多技术文章请访问我的个人博客http://www.rain1024.com