# -*- coding: utf-8 -*-
"""
Created on Sun Nov 4 09:58:09 2018
@author: wangf
"""
import re
import requests
import urllib
#处理页面标签类
class Tool:
#去除img标签,7位长空格
removeImg = re.compile('<img.*?>| {7}|')
#删除超链接标签
removeAddr = re.compile('<a.*?>|</a>')
#把换行的标签换为\n
replaceLine = re.compile('<tr>|<div>|</div>|</p>')
#将表格制表<td>替换为\t
replaceTD= re.compile('<td>')
#把段落开头换为\n加空两格
replacePara = re.compile('<p.*?>')
#将换行符或双换行符替换为\n
replaceBR = re.compile('<br><br>|<br>')
#将其余标签剔除
removeExtraTag = re.compile('<.*?>')
def replace(self,x):
x = re.sub(self.removeImg,"",x)
x = re.sub(self.removeAddr,"",x)
x = re.sub(self.replaceLine,"\n",x
Python爬取贴吧帖子内容
最新推荐文章于 2023-10-05 23:24:44 发布