# Scrape Neihan Duanzi jokes, using regular expressions for simple processing:
#_*_ coding: utf-8 _*_
'''
Created on 2018-07-14
@author: sss
function: scrape Neihan Duanzi jokes (static web page scraping)
'''
import random
import re
import urllib
import urllib.request

import requests

# NOTE(review): this looks like an IDE auto-import; Python 3's builtin
# input() offers the same prompt-and-read behaviour -- confirm and drop.
from Tools.scripts.treesync import raw_input
class Spider:
    """Interactive scraper for the joke listing pages of www.neihan8.com.

    Each round downloads one listing page, extracts every joke container
    with a regular expression, strips a few HTML tags and appends the text
    to ``duanzi.txt``. After each page the user is asked whether to go on.

    Attributes:
        page: Number of the listing page that will be downloaded next.
        switch: ``True`` while the crawl loop should keep running.
    """

    # Body of every joke container on a listing page. DOTALL (re.S) lets
    # ``.`` match the line breaks inside a joke; the raw string avoids the
    # invalid-escape warning that a plain '\s' literal triggers.
    _JOKE_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Markup removed from each joke before it is saved.
    _TAGS_TO_STRIP = ('<p>', '</p>', '<br>', '<br />')

    # Pool of User-Agent header values; one is picked at random per request.
    _USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0)like Gecko",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS ",
    )

    def __init__(self):
        """Start at the first listing page with the crawl loop enabled."""
        # Page number to download next.
        self.page = 1
        # Crawl switch: True means keep crawling.
        self.switch = True

    def loadPage(self):
        """Download the current listing page and pass its jokes to dealPage.

        The response body is decoded as GBK and searched with
        ``_JOKE_PATTERN``; the list of captured joke bodies is handed to
        :meth:`dealPage`.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
            UnicodeDecodeError: If the response is not valid GBK.
        """
        # Local import keeps this method working even when the file only
        # has ``import urllib`` at the top (which does not load the
        # ``request`` submodule by itself).
        import urllib.request

        print('开始下载第' + str(self.page) + '页:')
        url = 'https://www.neihan8.com/article/list_5_' + str(self.page) + '.html'
        headers = {
            "Connection": "keep-alive",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": random.choice(self._USER_AGENTS),
        }
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the HTTP response once it is read.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('gbk')
        # One list entry per joke container found in the page source.
        content_list = self._JOKE_PATTERN.findall(html)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        """Strip HTML tags from every joke and save it via writePage.

        Args:
            content_list: Iterable of raw joke strings as captured from the
                page source.
        """
        for item in content_list:
            # Each tag is removed with its own replace() call so that
            # '<br>' is really stripped (it used to be skipped because of a
            # misplaced parenthesis).
            for tag in self._TAGS_TO_STRIP:
                item = item.replace(tag, '')
            self.writePage(item)

    def writePage(self, item):
        """Append one joke to ``duanzi.txt`` in the working directory.

        Args:
            item: Cleaned joke text to append.
        """
        # Explicit UTF-8 so non-ASCII text can be written regardless of the
        # platform's default encoding.
        with open('duanzi.txt', 'a', encoding='utf-8') as f:
            f.write(item)

    def startWork(self):
        """Run the crawl loop until the user enters ``q``.

        After every downloaded page the user is prompted; pressing Enter
        continues with the next page, entering ``q`` clears ``switch`` and
        ends the loop. ``page`` is advanced after every round, including
        the last one.
        """
        while self.switch:
            self.loadPage()
            # Builtin input() is the Python 3 replacement for raw_input().
            command = input('如果继续爬去,请按回车(退出输入q)')
            if command == 'q':
                self.switch = False
            self.page += 1
        print('finish!')
# Script entry point: build a spider and hand control to its crawl loop.
if __name__ == "__main__":
    spider = Spider()
    spider.startWork()