python 基础教程 23章NNTP 3

本文介绍了一个基于Python的新闻收集代理程序,该程序利用NNTP协议从新闻组抓取文章,并支持从网页中提取新闻。此外,还提供了两种不同格式的新闻输出方式:纯文本格式和HTML格式。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

#!/usr/bin/env python
# -*- coding: utf-8 -*-


# python 基础教程23章NNTP 23-2 更灵活的新闻收集代理程序
#python2.7 运行


from nntplib import NNTP
from time import strftime, time, localtime
from email import message_from_string
from urllib import urlopen
import  textwrap
import re


# Number of seconds in one day (24 h * 60 min * 60 s).
day = 24 * 60 * 60


def wrap(string, max=70):
    """
    Re-flow a string so that no line is wider than ``max`` characters.

    :param string: the text to wrap.
    :param max: maximum line width, 70 by default. The name shadows the
        builtin but is kept so existing keyword callers keep working.
    :return: the wrapped lines joined with newlines, followed by one
        trailing newline (an empty input yields just ``'\\n'``).
    """
    # The width has to be forwarded explicitly; without it textwrap always
    # falls back to its own default and the ``max`` argument is ignored.
    return '\n'.join(textwrap.wrap(string, max)) + '\n'


class NewsAgent:
    """
    An object that pulls news items from its registered sources and
    pushes them to its registered destinations.
    """

    def __init__(self):
        """Start with no sources and no destinations."""
        self.sources, self.destinations = [], []

    def addSource(self, source):
        """Register ``source``; it must provide a ``getItems()`` method."""
        self.sources += [source]

    def addDestination(self, dest):
        """Register ``dest``; it must provide a ``receiveItems(items)`` method."""
        self.destinations += [dest]

    def distribute(self):
        """
        Collect the items of every source, in registration order, and hand
        the resulting list to every destination.
        """
        collected = [
            item
            for source in self.sources
            for item in source.getItems()
        ]
        # Every destination receives the very same list object.
        for target in self.destinations:
            target.receiveItems(collected)


class NewsItem:
    """
    A minimal news item: a title plus the body text that goes with it.
    """

    def __init__(self, title, body):
        """Store ``title`` and ``body`` unchanged as attributes."""
        self.title, self.body = title, body


class NNTPSource:
    """
    A news source that reads news items from a group on an NNTP server.

    Written against the Python 2.7 ``nntplib`` API, where ``body()``
    returns a four-element tuple whose last element is the list of lines.
    """

    def __init__(self, servername, group, window):
        """
        :param servername: host name handed to ``NNTP()``.
        :param group: name of the newsgroup to read.
        :param window: stored on the instance; ``getItems()`` as written
            here does not consult it.
        """
        self.servername = servername
        self.group = group
        self.window = window

    def getItems(self):
        """
        Yield a NewsItem for each of the last ten entries of the group's
        subject listing.

        The connection is opened when iteration starts and is closed again
        when the generator finishes, fails, or is discarded.
        """
        server = NNTP(self.servername)
        try:
            _, _, first, last, _ = server.group(self.group)
            # '%s' formatting copes with the bounds being strings or ints.
            article_range = '%s-%s' % (first, last)
            _, subjects = server.xhdr('subject', article_range)

            for entry in subjects[-10:]:
                number, title = entry[0], entry[1]
                lines = server.body(number)[3]
                # The server's lines carry no terminators, so they must be
                # joined with newlines to keep the article readable.
                yield NewsItem(title, '\n'.join(lines))
        finally:
            server.quit()


"""
书中原例getItems()方法
返回 nntplib.NNTPTemporaryError: 480 NEWNEWS command disabled by administrator
#480管理员禁用NEWNEWS命令


def getItems(self):
start = localtime(time() - self.window*day)
date = strftime('%y%m%d', start)
hour = strftime('%H%M%S', start)

server = NNTP(self.servername)
ids = server.newnews(self.group, date, hour)[1]

for id in ids:
lines = server.article(id)[3]
message = message_from_string('\n'.join(lines))

title = message['subject']
body = message.get_payload()
if message.is_multipart():
body = body[0]

yield NewsItem(title, body)
server.quit()
"""





class SimpleWebSource:
    """
    A news source that extracts news items from a web page with regular
    expressions.
    """

    def __init__(self, url, titlePattern, bodyPattern):
        """
        :param url: address of the page to download.
        :param titlePattern: regular expression (string) matching titles.
        :param bodyPattern: regular expression (string) matching bodies.
        """
        self.url = url
        self.titlePattern = re.compile(titlePattern)
        self.bodyPattern = re.compile(bodyPattern)

    @staticmethod
    def _matchText(match):
        """
        Return the text carried by one ``findall`` result.

        ``findall`` produces plain strings for patterns with at most one
        group and tuples for patterns with several groups. For tuples the
        second group is used, as before; plain strings are returned whole
        instead of being indexed character-wise.
        """
        if isinstance(match, tuple):
            return match[1]
        return match

    def getItems(self):
        """
        Download the page and yield one NewsItem per title/body pair.

        Titles and bodies are paired positionally; surplus matches on
        either side are dropped by ``zip``. Bodies are passed through
        ``wrap`` before being stored.
        """
        page = urlopen(self.url)
        try:
            text = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()

        titles = self.titlePattern.findall(text)
        bodies = self.bodyPattern.findall(text)
        for title, body in zip(titles, bodies):
            yield NewsItem(self._matchText(title), wrap(self._matchText(body)))
"""
书中原例 getItems()方法
def getItems(self):
text = urlopen(self.url).read()
titles = self.titlePattern.findall(text)
bodies = self.bodyPattern.findall(text)
for title, body in zip(titles, bodies):
yield NewsItem(title, wrap(body))
"""


class PlainDestination:
    """
    A news destination that prints every news item as plain text.
    """

    def receiveItems(self, items):
        """
        Print each item's title, a row of dashes as long as the title, and
        then the item's body.
        """
        for entry in items:
            heading = entry.title
            print(heading)
            print('-' * len(heading))
            print(entry.body)


class HTMLDestination:
    """
    A news destination that writes all news items to one HTML file: a
    linked index of titles followed by each item's title and body.
    """

    def __init__(self, filename):
        """
        :param filename: path of the HTML file; it is (re)written on every
            ``receiveItems`` call.
        """
        self.filename = filename

    @staticmethod
    def _escape(text):
        """Return ``text`` as a string with HTML metacharacters escaped."""
        text = '%s' % (text,)
        # '&' must be handled first so the entities added below survive.
        for raw, entity in (('&', '&amp;'), ('<', '&lt;'),
                            ('>', '&gt;'), ('"', '&quot;')):
            text = text.replace(raw, entity)
        return text

    def receiveItems(self, items):
        """
        Write ``items`` to ``self.filename`` as an HTML page.

        :param items: iterable of objects with ``title`` and ``body``
            attributes; any iterable is accepted, including a generator.
        """
        # The items are traversed twice (index, then content); a generator
        # would already be exhausted on the second pass.
        items = list(items)
        escape = self._escape

        # The context manager guarantees the file is flushed and closed.
        with open(self.filename, 'w') as out:
            out.write(
                "\n<html>\n<head>\n<title>Today's News</title>\n</head>\n"
                "<body>\n<h1>Today's News</h1>\n\n"
            )

            out.write('<ul>\n')
            for number, item in enumerate(items, 1):
                out.write('<li><a href="#%i">%s</a></li>\n'
                          % (number, escape(item.title)))
            out.write('</ul>\n')

            for number, item in enumerate(items, 1):
                out.write('<h2><a name="%i">%s</a></h2>\n'
                          % (number, escape(item.title)))
                out.write('<pre>%s</pre>\n' % escape(item.body))

            out.write("\n</body>\n</html>\n\n")


def runDefaultSetup():
    """
    Build an agent with the default sources and destinations and run it.
    The settings below are meant to be edited.
    """
    agent = NewsAgent()

    # SimpleWebSource that reads news from the BBC news site.
    agent.addSource(SimpleWebSource(
        'http://www.bbc.com/news',
        r'<h3 class="(.+?)">(.+?)</h3>',
        r'<p class="(.+?)">(.+?)</p>',
    ))

    # NNTPSource that reads news from comp.lang.python.
    # Server / newsgroup pairs noted by the author:
    #   'web.aioe.org',   'comp.lang.python'
    #   'news.gmane.org', 'gmane.comp.python.committers'
    agent.addSource(NNTPSource('web.aioe.org', 'comp.lang.python', 1))

    # Add a plain-text destination and an HTML destination.
    agent.addDestination(PlainDestination())
    agent.addDestination(HTMLDestination('news.html'))

    # Publish the news items.
    agent.distribute()


# Run the default configuration only when executed as a script.
if __name__ == "__main__":
    runDefaultSetup()



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值