# -*- coding: utf-8 -*-
"""
Practice script: batch-scraping articles from the ifeng.com (Phoenix New Media) tech channel.
"""
import urllib2
import re

out_file = open('fenghuang.txt', 'w')

class FengHuang(object):
    def __init__(self, num, regular):
        # num: how many list pages to crawl; regular: regex that captures the article body
        self.num = num
        self.regular = regular
"""
获取页面中所有文章链接地址
"""
def getUrl(self):
        aUrl = []
        for i in xrange(0, self.num):
            # Headers copied from a real browser session so the site serves the page.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
                'Host': 'tech.ifeng.com',
                'Cookie': 'prov=cn010; city=010; weather_city=bj; region_ip=114.244.153.x; region_ver=1.1; HOT_TAG=y; samemappingcookie=m; userid=1383726976290_409jrr43; vjuids=-32c7d06f0.1422c8e1c9f.0.13c721b62a19a; vjlast=1383726980.1383726980.30; ifengRotator_AP1998=0; _plst[_plaf_]=2479333135; _plst[_plid_]=3958005968; _plst[ifeng][_pllv_]=1; py_map=1'
            }
            req = urllib2.Request(
                url='http://tech.ifeng.com/internet/list_' + str(i) + '/2300.shtml',
                headers=headers
            )
            result = urllib2.urlopen(req).read()
            # Article pages carry 'detail_' in their URL; keep only those links.
            urls = re.findall('href="(.+?)"', result)
            for url in urls:
                if 'detail_' in url:
                    aUrl.append(url)
        return aUrl
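
    # Note: the same article link may appear more than once on a list page
    # (an assumption about the markup, e.g. thumbnail and headline both link
    # to the article). If so, an order-preserving de-dup before the return
    # would avoid fetching duplicates:
    #
    #     seen = set()
    #     aUrl = [u for u in aUrl if u not in seen and not seen.add(u)]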
"""
获取文章正文内容
"""
def getContent(self, url):
        aList = []
        html = urllib2.urlopen(url, timeout=60).read()
        txt = re.findall(self.regular, html, re.S)
        for item in txt:
            # Strip inline scripts first (with (?s) so they can span lines),
            # then remaining tags, entities, and leftover page chrome.
            item = re.sub('(?s)<script[^>]*?>.*?</script>|<[\/\!]*?[^<>]*?>|window\.zlzp = window\.zlzp\|\|\{\};|&(nbsp|#160);|&(quot|#34);|" target="_blank">(.*)</a>', '', item)
            item = re.sub('\n\s*\r| |document\.getElementById.*?;|<span class="rz">(.*)</span>]</span>|<a href="| | ', '', item)
            item = re.sub(' |>>', ' ', item)
            aList.append(item)
        return aList
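
# A minimal fetch helper with error handling (a sketch, not part of the
# original flow: as written, one dead link raises URLError and aborts the
# whole batch). getContent() could route its urlopen call through this and
# skip None results.
def safe_read(url, timeout=60):
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except urllib2.URLError:
        return None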

# Regex that captures the article body between its container div and the footer logo.
reg = '<div id="artical_real">(.*)<span class="ifengLogo">'
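
# A worked example of what the regex yields (the capture is greedy, and re.S
# lets it span newlines):
#
#     sample = '<div id="artical_real"><p>Hello</p><span class="ifengLogo">'
#     re.findall(reg, sample, re.S)   # -> ['<p>Hello</p>']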
cj = FengHuang(3, reg)

# Collect the article link addresses from the list pages.
fh_url = cj.getUrl()
for k, url in enumerate(fh_url, 1):
    # Fetch each article body and append it, numbered, to the output file.
    for txt in cj.getContent(url):
        print k, url
        out_file.write("\r\n" + str(k) + '、' + txt.strip())
out_file.close()
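
# Usage: run `python fenghuang.py`; the numbered article bodies are appended
# to fenghuang.txt in the working directory. A polite-crawling sketch (not in
# the original) would pause between article fetches, e.g.:
#
#     import time
#     time.sleep(1)   # inside the loop, before each cj.getContent(url)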