二.scrapy抓取百度新闻排行榜,并且推送到指定邮箱

本文介绍了一个使用Python Scrapy框架实现的百度新闻爬虫项目。该项目从百度热门新闻榜单抓取数据,并将结果存入MongoDB数据库。此外,还实现了通过邮件发送爬取到的排名前三的新闻详情。涉及的技术包括Scrapy框架的应用、XPath解析网页内容、MongoDB数据库操作及SMTP邮件发送等。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

#encoding=utf-8
import scrapy
import requests
from pymongo import MongoClient
from ..items import FirstoneItem
import smtplib
from email.mime.text import MIMEText
from .. import settings
import time
from scrapy.http import Request
from scrapy.spiders import Spider
class MaimaiSpider(Spider):
    """Crawl the Baidu hot-news ranking boards (top.baidu.com/buzz).

    Every ranked entry is yielded as a FirstoneItem (persisted to MongoDB by
    the project's item pipeline).  When the crawl ends normally, an HTML
    digest of every stored entry ranked 3rd or better is emailed via SMTP-SSL
    using the credentials in settings.
    """

    # Class-level MongoDB handle, created once when the class object is built.
    # NOTE(review): this means merely importing the module needs a reachable
    # MongoDB instance — consider moving the connection into __init__.
    cn = MongoClient(settings.MONGODB_HOST, settings.MONGODB_PORT)
    db = cn[settings.MONGODB_DB]
    tb = db[settings.MONGODB_TABLE]

    name = 'baidunews'
    allowed_domains = ['baidu.com']
    start_urls = ['http://top.baidu.com/buzz?b=341']
    mainurl = 'http://top.baidu.com/'

    def parse(self, response):
        """Follow every board link in the category block, passing the board's
        human-readable title along in request meta.

        The first link (the board already being displayed) is skipped.
        """
        modes = response.xpath('//div[@class="hblock"]/ul/li/a/@href').extract()
        # BUGFIX: the original used modes.index(mode) per iteration — O(n)
        # per lookup and, worse, it returns the FIRST occurrence, pairing the
        # wrong title with any duplicated href.  XPath li[] positions are
        # 1-based and we skip element 0, so enumeration starts at 2.
        for pos, mode in enumerate(modes[1:], start=2):
            news_type = response.xpath(
                '//div[@class="hblock"]/ul/li[{}]/a/@title'.format(pos)
            ).extract_first()
            # mode begins with '/', which mainurl already supplies.
            yield Request(url=self.mainurl + mode[1:],
                          callback=self.parse_item,
                          meta={'news_type': news_type})

    def parse_item(self, response):
        """Parse one ranking table and yield a FirstoneItem per data row.

        Header rows have no td.first cell and are skipped.  The news title is
        used as the Mongo _id, so re-crawls overwrite instead of duplicating.
        """
        for row in response.xpath('//table[@class="list-table"]/tr'):
            # Only data rows carry the rank number in td.first.
            if not row.xpath('.//td[@class="first"]').extract():
                continue
            item = FirstoneItem()
            item['num'] = row.xpath('.//td[@class="first"]/span/text()').extract_first()
            item['_id'] = row.xpath('.//td[@class="keyword"]/a/text()').extract_first()
            item['news_type'] = response.meta['news_type']
            item['baidu_url'] = row.xpath('.//td[@class="keyword"]/a/@href').extract_first()
            item['focus_num'] = row.xpath('.//td[@class="last"]/span/text()').extract_first()
            yield item

    def close(self, reason):
        """Scrapy shutdown hook: on a normal finish, email an HTML table of
        every stored entry whose rank is 3 or better.

        Any SMTP failure is caught and reported, never propagated.
        """
        if reason != 'finished':
            return
        header = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><table border="0" cellspacing="0" cellpadding="3" align="left" >'
        tail = '</table></body></html>'
        cell = '<tr align="left"><td colspan="6">%s</td></tr>'
        rows = []
        for data in self.tb.find():
            if int(data['num']) <= 3:
                rows.append(cell % ('*' * 10))
                rows.append(cell % data['news_type'])
                rows.append(cell % data['num'])
                rows.append(cell % data['_id'])
                rows.append(cell % data['baidu_url'])
        # ''.join instead of repeated '+=' — linear, not quadratic.
        body = header + ''.join(rows) + tail
        msg = MIMEText(body, 'html', 'utf-8')
        msg["Subject"] = "[%s]BaiduTopNews" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        msg["From"] = settings.email_From
        msg["To"] = ','.join(settings.email_To)
        try:
            s = smtplib.SMTP_SSL(settings.smtpHost, settings.smtpPort)
            s.login(settings.email_From, settings.email_pwd)
            # BUGFIX: sendmail() needs a LIST of recipients; the original
            # passed the comma-joined header string, which smtplib treats as
            # one (malformed) address when there are multiple recipients.
            s.sendmail(settings.email_From, settings.email_To, msg.as_string())
            s.quit()
            print("Success!")
        except smtplib.SMTPException as e:
            print("send email failed, the reason is %s" % e)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值