#!user/bin/python #coding:utf-8 from bs4 import BeautifulSoup import urllib2 import sys reload(sys) sys.setdefaultencoding('utf-8') def get_html(url): req = urllib2.urlopen(url).read() return req def handle_html(html): soup = BeautifulSoup(html) print u'豆瓣电影Top250: 序号 \t影片名\t英文名\t链接地址' list1 = [] for tag in soup.find_all('div', class_="item"): m_order = tag.find('em').get_text() m_name = tag.find('span').get_text() En_name = tag.find_all('span')[1].get_text() m_link = tag.a.get('href') #print '%s %s %s %s' % (m_order, m_name, En_name, m_link) cache = [m_order, m_name, En_name, m_link] list1.append(cache) return list1 def save_result(result): f = open('text.txt', 'wb') for res in result: count = 0 for i in res: f.write(str(i) + '\t') count += 1 if count == 4: f.write('\n') f.close() if __name__ == '__main__': url = 'http://movie.douban.com/top250?format=text' html = get_html(url) result = handle_html(html) save_result(result)
爬取豆瓣的电影
最新推荐文章于 2025-02-27 22:40:04 发布