1.自定义
a)当前更新微博头条内容,邮件通知
2.分析页面
a)微博主页:https://weibo.com/,记录cookies
b)登陆链接
https://login.sina.com.cn/sso/login.php
c)头条Url
https://d.weibo.com/623751_1
3.实现思路
a)微博模拟登陆,保存cookie信息
b)关注实时头条
c)每个小时,更新一次,新发布头条,邮件通知
4.编码实现
Spider组件
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy import Spider, Request
from lxml import etree
import re
import datetime,time
import uuid
from SinaTopic.items import SinatopicItem
class SinaspiderSpider(Spider):
    """Scrape the Weibo headline feed (d.weibo.com/623751_1) into SinatopicItem objects."""
    name = 'SinaSpider'
    allowed_domains = ['weibo.com']
    start_urls = ['https://d.weibo.com/623751_1']

    def start_requests(self):
        # Re-issue the same URL several times (dont_filter bypasses the dupe
        # filter) so the dynamically rendered page gets several chances to
        # return the headline payload.
        for _ in range(10):
            yield Request(url=self.start_urls[0], callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Extract headline rows from the escaped-JS payload and yield items."""
        # The payload is embedded in escaped JS strings; strip the escape noise first.
        res = response.text.replace('\r', '').replace('\\n', '').replace('\t', '').replace('\\', '').strip()
        lists = re.findall(r'<li class="pt_li pt_li_2 S_bg2"(.*)</li>', res)[0]
        selector = etree.HTML(str(lists))
        titles = selector.xpath('//div[@class="text_box"]/div[@class="title W_autocut"]/a/text()')
        hrefs = selector.xpath('//div[@class="text_box"]/div[@class="title W_autocut"]/a/@href')
        subs = selector.xpath('//div[@class="text_box"]/div[@class="text text_cut S_txt2"]/text()')
        picmuls = selector.xpath('//ul[@class="pic_m3 clearfix"]/li/img/@src')
        accounts = selector.xpath('//div[@class="subinfo_box clearfix"]/a/span[@class="subinfo S_txt2"]/text()')
        datatimes = selector.xpath('//div[@class="subinfo_box clearfix"]/span[@class="subinfo S_txt2"]/text()')
        refercounts = selector.xpath('//em[@class="W_ficon ficon_forward S_ficon W_f16"]/following-sibling::em[1]/text()')
        commontcounts = selector.xpath('//em[@class="W_ficon ficon_repeat S_ficon W_f16"]/following-sibling::em[1]/text()')
        addcounts = selector.xpath('//em[@class="W_ficon ficon_praised S_ficon W_f16"]/following-sibling::em[1]/text()')
        rows = zip(titles, hrefs, subs, accounts, datatimes, refercounts, commontcounts, addcounts)
        # enumerate() replaces titles.index(title), which returned the wrong
        # row index whenever two posts shared the same title.
        for index, (title, href, sub, account, pubtime, refercount, commentcount, addcount) in enumerate(rows):
            item = SinatopicItem()
            item["id"] = uuid.uuid1()
            item["title"] = title
            item["href"] = href
            item["sub"] = sub
            item["account"] = account[1:]  # drop the leading marker character
            # The page omits the year from the publish time; prepend the current one.
            item["pubtime"] = str(datetime.datetime.now().year) + "年" + pubtime
            item["refercount"] = refercount
            item["commentcount"] = commentcount
            item["addcount"] = addcount
            item["addtime"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Join whatever thumbnails are present so the "picurls" key always
            # exists (the old bare `except:` left it unset on short lists and
            # crashed the e-mail formatter later with a KeyError).
            # NOTE(review): the original read picmuls[index..index+2]; if every
            # post owns exactly three thumbnails a stride of 3 (index*3) may be
            # what was intended -- confirm against the live page.
            item["picurls"] = ";".join(picmuls[index:index + 3])
            yield item
Pipelines组件
from pymongo import MongoClient
from SinaTopic.items import SinatopicItem
from SinaTopic.emails import sendEmail
class SinaspiderPipeline(object):
    """Store unseen topics in MongoDB (SinaWeiBo.topic) and e-mail the new ones on close."""

    def __init__(self):
        client = MongoClient('mongodb://localhost:27017')
        db = client['SinaWeiBo']
        self.collection = db["topic"]
        self.count = 0           # items newly inserted during this run
        self.exitscount = 0      # items already present (deduped by title)
        self.refresh = []        # newly inserted items, mailed in close_spider
        self.reverserefresh = []

    def process_item(self, item, spider):
        if isinstance(item, SinatopicItem):
            try:
                content = dict(item)
                # Dedupe on title: only unseen topics are stored and notified.
                mydoc = self.collection.find_one({'title': item["title"]})
                if mydoc:
                    self.exitscount += 1
                    print('exists', mydoc["title"])
                else:
                    print('not exists', item["title"])
                    self.collection.insert_one(content)
                    self.count += 1
                    self.refresh.append(item)
                print('OK', self.count, self.exitscount)
            except Exception as err:
                # Keep the crawl alive on DB hiccups, but say what went wrong
                # (the old handler printed a fixed string and hid the cause).
                print('SinatopicItem error:', err)
        # Scrapy contract: always return the item so later pipelines see it
        # (the original returned None, silently dropping every item).
        return item

    def close_spider(self, spider):
        print('sendemail count:', len(self.refresh))
        # Most-liked / most-commented topics first in the notification mail.
        self.refresh.sort(key=lambda x: (int(x["addcount"]), int(x["commentcount"])))
        self.reverserefresh = list(reversed(self.refresh))
        sendEmail(self.reverserefresh)
cookies组件
# encoding=utf-8
import json
import base64
import requests
import os
# Weibo login accounts: 'no' is the account name, 'psw' the password.
# These are placeholders -- fill in real credentials before running.
myWeiBo = [
    {'no': 'username', 'psw': 'pwd'},
]
def getCookies(weibo):
    """Return a list of cookie dicts for the given accounts.

    Uses the locally cached cookies.txt when it holds anything; otherwise
    performs a Sina SSO login for every account in *weibo* and collects the
    session cookies of each successful login.
    """
    cookies = []
    localcookie = readcookies()
    if localcookie:
        print('local cookie')
        cookies.append(localcookie)
    else:
        print('get cookies from sina')
        loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
        for elem in weibo:
            account = elem['no']
            password = elem['psw']
            # The SSO endpoint expects the base64-encoded account name in "su".
            username = base64.b64encode(account.encode('utf-8')).decode('utf-8')
            # Security fix: the original printed the plaintext password here.
            print('logging in as username:{}'.format(username))
            postData = {
                "entry": "sso",
                "gateway": "1",
                "from": "null",
                "savestate": "30",
                "useticket": "0",
                "pagerefer": "",
                "vsnf": "1",
                "su": username,
                "service": "sso",
                "sp": password,
                "sr": "1440*900",
                "encoding": "UTF-8",
                "cdult": "3",
                "domain": "sina.com.cn",
                "prelt": "0",
                "returntype": "TEXT",
            }
            session = requests.Session()
            r = session.post(loginURL, data=postData)
            # The SSO endpoint answers in GBK, not UTF-8.
            jsonStr = r.content.decode('gbk')
            info = json.loads(jsonStr)
            if info["retcode"] == "0":
                print("Get Cookie Success!( Account:%s )" % account)
                cookie = session.cookies.get_dict()
                cookies.append(cookie)
            else:
                print("Failed!( Reason:%s )" % info['reason'])
    return cookies
def savecookies(cookies):
    """Persist a cookie dict to cookies.txt, one "key:value" pair per line."""
    pairs = ['{0}:{1}\n'.format(key, value) for key, value in cookies.items()] if cookies else []
    with open('cookies.txt', 'w') as fh:
        fh.write(''.join(pairs))
def readcookies():
    """Load cached cookies from cookies.txt into a dict.

    Returns an empty dict when no cache file exists yet (first run), so the
    caller falls back to a fresh login instead of crashing.
    """
    cookie = {}
    try:
        with open('cookies.txt') as f:
            for line in f:
                line = line.strip()  # the old code kept the trailing '\n' in the value
                if not line:
                    continue
                # Split on the first ':' only -- cookie values may contain ':'.
                key, _, value = line.partition(':')
                cookie[key] = value
    except FileNotFoundError:
        pass  # no cached cookies yet
    return cookie
# Import-time side effect: resolve the cookie list once so middlewares can do
# `from SinaTopic.cookies import cookies`. NOTE(review): this hits the network
# (or disk) whenever the module is imported.
cookies = getCookies(myWeiBo)
print ("Get Cookies Finish!( Num:%d)" % len(cookies))
# manual test entry point
if __name__ == '__main__':
    readcookies()
    #cookies = getCookies(myWeiBo)
    #savecookies(cookies[0])
    #print("Get Cookies Finish:%s)" % cookies)
emails组件
# -*- coding: utf8 -*-
from email.header import Header
from email.mime.text import MIMEText
import smtplib
def sendEmail(topics):
    """E-mail a digest of freshly published Weibo headline topics.

    topics: sequence of SinatopicItem-like mappings, already ordered by
    importance. At most the first 10 entries are included in the mail; an
    empty sequence sends nothing. (Parameter renamed from `list`, which
    shadowed the builtin; all known callers pass it positionally.)
    """
    sender = "senderusername"
    receiver = "XXXXXX@qq.com"
    subject = '@主人 微博头条更新通知'
    username = "username "
    password = "password "
    host = "smtp.qq.com"
    sendmsg = ''
    template = '第{11}篇\n标题:{0}\n描述:{1}\n原文链接:{2}\n微博账户:{3}\n发布时间:{4}\n转发数:{5}\n评论数:{6}\n点赞数:{7}\n{8}\n{9}\n{10}\n'
    if len(topics) > 0:
        # Cap the digest at 10 topics. The old loop compared `index is 21`
        # (identity on an int literal, and contradicting its own
        # "only send 10 emails" message, which let 20 through).
        for index, item in enumerate(topics, start=1):
            if index > 10:
                print('only send 10 emails')
                break
            # Tolerate items whose thumbnails could not be scraped: pad to
            # three entries instead of raising KeyError/IndexError.
            pics = item["picurls"].split(';') if "picurls" in item else []
            pics += [''] * (3 - len(pics))
            sendmsg += template.format(
                item["title"], item["sub"], item["href"], item["account"],
                item["pubtime"], item["refercount"], item["commentcount"],
                item["addcount"], pics[0], pics[1], pics[2], index)
        msg = MIMEText(sendmsg, 'plain', 'utf-8')  # utf-8 required for Chinese text
        msg['Subject'] = Header(subject, 'utf-8')
        msg['From'] = sender
        msg['To'] = receiver
        smtp = smtplib.SMTP_SSL()
        smtp.connect(host)
        smtp.login(username, password)
        smtp.sendmail(sender, receiver.split(","), msg.as_string())
        smtp.quit()
        print("邮件已通知, 请查收")
    else:
        # Previously printed unconditionally, even after a successful send.
        print('data is null')
if __name__ == '__main__':
    # Smoke test exercising the "nothing to send" path. The original passed
    # the int 1, which crashed immediately on len()/iteration.
    sendEmail([])
items组件
from scrapy import Item, Field
class SinatopicItem(Item):
    """One Weibo headline topic scraped by SinaspiderSpider."""
    # define the fields for your item here like:
    id = Field()            # uuid1 assigned by the spider
    title = Field()         # headline title text
    href = Field()          # link to the original post
    sub = Field()           # short description / excerpt
    account = Field()       # publishing account name
    pubtime = Field()       # publish time, prefixed with the current year
    refercount = Field()    # forward count
    commentcount = Field()  # comment count
    addcount = Field()      # like count
    addtime = Field()       # local timestamp when the item was scraped
    picurls = Field()       # up to three thumbnail URLs joined with ';'
middlewares中间件
# encoding=utf-8
import random
from SinaTopic.user_agents import agents
from SinaTopic.cookies import cookies
class UserAgentMiddleware(object):
    """Rotate the User-Agent header on every outgoing request."""

    def process_request(self, request, spider):
        # Picking a random agent per request makes the traffic look less uniform.
        request.headers["User-Agent"] = random.choice(agents)
class CookiesMiddleware(object):
    """Attach a randomly chosen login cookie to each request."""

    def process_request(self, request, spider):
        chosen = random.choice(cookies)
        request.cookies = chosen
5.OK,功能实现