#!/usr/bin/env python
# coding: utf-8
import re
import urllib2
class doubanTop10(object):
    """Collect movie titles from the Douban chart page.

    Instance fields:
      url      -- address fetched when ``get_data`` is given no URL.
      datas    -- accumulated ``"Top<N> <title>"`` strings, in match order.
      _top_num -- rank number that the next accepted title will receive.
    """

    # Captures the ``title`` attribute of every ``<a ... class="nbg" ...>``
    # tag; re.S lets ``.`` cross newlines because a tag may span lines.
    _TITLE_RE = re.compile(r'<a.*?class="nbg".*?title="(.*?)">', re.S)

    def __init__(self):
        """Set the default URL, reset the results and print a start banner."""
        self.url = 'http://movie.douban.com/chart'
        self.datas = []
        self._top_num = 1
        print("正在爬取豆瓣新片榜...\n")

    def get_data(self, url=None):
        """Download a page and return its body decoded as UTF-8.

        Args:
            url: Address to fetch. When omitted or empty, ``self.url`` is
                used instead.

        Returns:
            The decoded page text, or ``None`` when the request fails with
            ``urllib2.URLError`` (a diagnostic is printed in that case).
        """
        if not url:
            url = self.url
        try:
            response = urllib2.urlopen(url)
        except urllib2.URLError as e:
            # HTTP-level failures carry ``code``; connection-level failures
            # only carry ``reason``, so ``code`` is checked first.
            if hasattr(e, 'code'):
                print("The server couldn't fulfill the request.")
                print("Error code: %s" % e.code)
            elif hasattr(e, 'reason'):
                print("We failed to reach a server. Please check your url and read the Reason.")
                print("Reason: %s" % e.reason)
            # Explicit result so callers never hit an unbound local.
            return None
        try:
            return response.read().decode('utf-8')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

    def find_title(self, page_data):
        """Extract titles from *page_data* and append them to ``self.datas``.

        Titles containing a space are skipped. Every accepted title is
        stored as ``"Top<N> <title>"`` and advances ``self._top_num``.
        A missing or empty page (``None`` / ``""``) is ignored.
        """
        if not page_data:
            return
        for title in self._TITLE_RE.findall(page_data):
            if " " in title:
                continue
            self.datas.append("Top" + str(self._top_num) + " " + title)
            self._top_num += 1

    def start_spider(self):
        """Fetch ``self.url`` and collect its titles into ``self.datas``."""
        self.find_title(self.get_data(self.url))
def main():
    """Run the crawler once, print each collected title, then a completion line."""
    crawler = doubanTop10()
    crawler.start_spider()
    for title in crawler.datas:
        print(title)
    print("\n爬取完成!")
# Start crawling only when run as a script, not when imported as a module.
if __name__ == "__main__":
    main()
# Reposted from: https://blog.51cto.com/cheney1228/1623437