A Baidu Tieba post crawler for Python 2. `Tools` holds regexes that strip tags and whitespace from scraped fragments; `BDTB` fetches a post page by page with `urllib2`, parses the title, page count, and replies, and writes the cleaned replies to a local text file.

```python
# coding:utf-8
import urllib2
import re


class Tools(object):
    """Regex helpers that strip HTML tags and whitespace from scraped text."""
    remove_element = re.compile(r'<.*?>', re.S)
    remove_space = re.compile(r' ', re.S)
    remove_n = re.compile(r'\n', re.S)
    replace_br = re.compile(r'<br>|<br/>')

    def update_data(self, res_tuple):
        name = re.sub(self.remove_n, '', res_tuple[0])
        name = re.sub(self.remove_element, '', name)
        name = re.sub(self.remove_space, '', name)
        content = re.sub(self.remove_n, '', res_tuple[1])
        # Convert <br> tags to newlines first; otherwise the generic
        # tag-stripping pattern below would delete them outright.
        content = re.sub(self.replace_br, '\n', content)
        content = re.sub(self.remove_element, '', content)
        content = re.sub(self.remove_space, '', content)
        return (name, content)


# Baidu Tieba crawler class
class BDTB(object):
    def __init__(self, numbers):
        # Base URL and request headers
        self.base_url = "http://tieba.baidu.com/p/" + numbers
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; "
                          "rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
        }
        # Helper object for cleaning up scraped text
        self.tool = Tools()

    # Fetch the page source; the parameter is the page number
    def get_page_code(self, page_number):
        abs_url = self.base_url + '?pn=' + str(page_number)
        request = urllib2.Request(abs_url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
        except Exception, e:
            print 'Failed to fetch page %s: %s' % (page_number, e)
            return None
        else:
            return response.read()

    # Parse the post title and total page count out of the source
    def get_data(self, html):
        title_pattern = re.compile(
            r'<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
        title = re.search(title_pattern, html).group(1)
        numbers_pattern = re.compile(
            r'<li class="l_reply_num".*?<span class="red">(.*?)</span>', re.S)
        total_pages = re.search(numbers_pattern, html).group(1)
        return title, total_pages

    # Extract every reply (author name and content) on a page
    def get_comments_data(self, html):
        comment_pattern = re.compile(
            r'<a.*?class="p_author_name.*?>(.*?)</a>.*?'
            r'<div id="post_content.*?>(.*?)</div>', re.S)
        results_list = re.findall(comment_pattern, html)
        data_list = []
        for res_tuple in results_list:
            # Pass each tuple through the Tools helper for cleanup
            news_tuple = self.tool.update_data(res_tuple)
            data_list.append(news_tuple)
        return data_list

    # Write the cleaned data to an open file
    def write_data(self, news_data, f):
        for name, content in news_data:
            f.write('Nickname: {}'.format(name))
            f.write('\n')
            f.write('Reply: {}'.format(content))
            f.write('\n')

    # Entry point of the crawler
    def start_spider(self):
        # Crawl the first page to learn the total page count
        html = self.get_page_code(1)
        if html is None:
            print 'Failed to connect to Tieba'
            return
        title, total_page = self.get_data(html)
        # print 'Total pages to crawl: %s' % total_page
        # Strip characters that are awkward or illegal in a file name
        title = re.sub(re.compile(' '), '', title)
        title = re.sub(re.compile('/'), '', title)
        title = re.sub(re.compile('\n'), '', title)
        file_test = open('{}.txt'.format(title).decode('utf-8'), 'w')
        # Only the first page is crawled here; use
        # xrange(1, int(total_page) + 1) to crawl every page.
        for x in xrange(1, 2):
            # print 'Crawling page %s...' % x
            html = self.get_page_code(x)
            if html is None:
                # Skip pages that failed to download
                continue
            results = self.get_comments_data(html)
            self.write_data(results, file_test)
        file_test.close()


if __name__ == '__main__':
    bdtb = BDTB('5147943292')
    bdtb.start_spider()
```
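To make the regex pipeline in `Tools.update_data` concrete, here is a small usage sketch. The `(author, reply)` tuple is invented for illustration and assumes the `Tools` class above is in scope; note that the space-stripping pattern removes every space, which is harmless for Chinese text but visible in this ASCII example.

```python
# coding:utf-8
tool = Tools()
# An invented tuple shaped like one match of comment_pattern.
raw = (' <img src="face.png"/>some_user\n',
       'line one<br/>line two <a href="#">link</a>\n')
name, content = tool.update_data(raw)
print name     # some_user
print content  # lineone
               # linetwolink
```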
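The script is Python 2 only (`urllib2`, `print` statements, `xrange`). As a reference, here is a minimal Python 3 sketch of the fetch step only, assuming the same URL scheme and that the page is UTF-8 encoded; `urllib2` became `urllib.request` in Python 3.

```python
# A minimal Python 3 equivalent of get_page_code (a sketch, not a full port).
import urllib.request


def get_page_code(post_id, page_number):
    url = "http://tieba.baidu.com/p/{}?pn={}".format(post_id, page_number)
    request = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; '
                      'rv:2.0.1) Gecko/20100101 Firefox/4.0.1'
    })
    try:
        with urllib.request.urlopen(request) as response:
            # urlopen returns bytes in Python 3; decode before regex parsing.
            return response.read().decode('utf-8', errors='replace')
    except Exception as e:
        print('Failed to fetch page {}: {}'.format(page_number, e))
        return None
```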
