1 Regular expressions
. : matches any single character except the newline \n
* : matches the preceding character 0 or more times
? : matches the preceding character 0 or 1 time
.* : greedy matching (grabs as much as possible)
.*? : non-greedy matching (grabs as little as possible)
() : only the content captured inside the parentheses is returned as the result
findall: matches everything that fits the pattern and returns the results as a list
search: matches and extracts the first thing that fits the pattern, returning a match object
sub: replaces the content that fits the pattern and returns the resulting string
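A minimal sketch (assuming Python 3 and the standard re module, not part of the original notes) of the three functions above; note that search returns None when nothing matches, so check before calling .group():

import re
text = 'xxIxxfasdxxlovexx'
print(re.findall('xx(.*?)xx', text))      # ['I', 'love'] -- every match, as a list
m = re.search('xx(.*?)xx', text)
if m:                                     # search returns None if there is no match
    print(m.group(1))                     # 'I' -- text captured by the first group
print(re.sub('xx', '-', text))            # '-I-fasd-love-' -- the replaced string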
import re
secret_code = 'hadkfalifexxIxxfasdjifja134xxlovexx23345sdfxxyouxx8dfse'
# . acts as a placeholder: each . matches exactly one character after the x
a='xy123'
b=re.findall('x.',a)
print(b)
# Using *: x* matches 'x' repeated 0 or more times, so positions without x yield empty strings
a='xyxy123'
b=re.findall('x*',a)
print(b)
# Using ?: x? matches 'x' 0 or 1 time
a='xyxy123'
b=re.findall('x?',a)
print(b)
# Using .*: greedy matching grabs as much as possible between the outermost pair of xx
b=re.findall('xx.*xx',secret_code)
print(b)
# Using .*?: non-greedy matching returns every shortest match, each including the xx on both ends
c = re.findall('xx.*?xx',secret_code)
print(c)
# With parentheses only the captured group is returned: the content between the xx, without the xx itself
d = re.findall('xx(.*?)xx',secret_code)
print(d)
for each in d:
    print(each)
s = '''sdfxxhello
xxfsdfxxworldxxasdf'''
d = re.findall('xx(.*?)xx',s,re.S)  # re.S makes . match newlines as well
print(d)
# Comparing findall with search
s2 = 'asdfxxIxx123xxlovexxdfd'
f = re.search('xx(.*?)xx123xx(.*?)xx',s2).group(2)  # group(2) returns the text captured by the second pair of parentheses
print(f)
f2 = re.findall('xx(.*?)xx123xx(.*?)xx',s2)
print(f2[0][1])
# Example of sub
s = '123rrrrr123'
output = re.sub('123(.*?)123','123%d123'%789,s)
print(output)
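A related trick (an aside, not from the original notes): re.sub also accepts a function as the replacement, which is handy when the new text depends on what was matched.

# Toy example: double every number found in the string.
print(re.sub(r'\d+', lambda m: str(int(m.group(0)) * 2), 'a1b22c'))  # prints a2b44c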
# Demonstrating a different import style (e.g. from re import findall, S)
#info = findall('xx(.*?)xx',secret_code,S)
#for each in info:
# print(each)
# There is no need to use compile here
# pattern = 'xx(.*?)xx'
# new_pattern = re.compile(pattern,re.S)
# output = re.findall(new_pattern,secret_code)
# print(output)
# Matching digits
a = 'asdfasf1234567fasd555fas'
b = re.findall(r'(\d+)',a)
print(b)
Applying regular expressions:
import re
old_url='http://www.jikexueyuan.com/course/android/?pageNum=2'
total_page=20
f=open('text.txt','r',encoding='utf-8')
html=f.read()
f.close()
# Extract the title
title=re.search('<title>(.*?)</title>',html,re.S).group(1)
print(title)
# Extract all links
links = re.findall('href="(.*?)"',html,re.S)
for each in links:
    print(each)
# Extract a block of text: match the large block first, then the smaller pieces inside it
text_fied = re.findall('<ul>(.*?)</ul>',html,re.S)[0]
the_text = re.findall('">(.*?)</a>',text_fied,re.S)
for every_text in the_text:
    print(every_text)
# Use sub to generate the links for the following pages
for i in range(2,total_page+1):
    new_link = re.sub(r'pageNum=\d+','pageNum=%d'%i,old_url)  # re.sub's 4th positional argument is count, not flags, so re.S is dropped here
    print(new_link)
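A quick aside (not in the original notes): in re.sub the fourth positional parameter is count, not flags, so flags such as re.S or re.I have to be passed by keyword. A minimal sketch:

import re
s = 'pageNum=1 pageNum=2'
print(re.sub(r'pageNum=\d+', 'pageNum=9', s, count=1))     # pageNum=9 pageNum=2  (only the first occurrence)
print(re.sub(r'pagenum=\d+', 'pageNum=9', s, flags=re.I))  # pageNum=9 pageNum=9  (case-insensitive, flag passed by keyword)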
Scraping a page with regular expressions
import requests
import re
class spider(object):
    def __init__(self):
        print('Start crawling...')

    # getsource fetches the page source
    def getsource(self,url):
        my_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',}
        html = requests.get(url,headers=my_header)
        return html.text

    # changepage builds the links for the different page numbers
    def changepage(self,url,total_page):
        now_page = int(re.search(r'pageNum=(\d+)',url,re.S).group(1))
        page_group = []
        for i in range(now_page,total_page+1):
            link = re.sub(r'pageNum=\d+','pageNum=%s'%i,url)  # replace the page number in the url
            page_group.append(link)
        return page_group

    # geteveryclass extracts the block of markup for each course
    def geteveryclass(self,source):
        everyclass = re.findall('(<li id=".*?</li>)',source,re.S)
        return everyclass

    # getinfo pulls the fields we need out of one course block
    def getinfo(self,eachclass):
        info = {}
        info['title'] = re.search('class="lessonimg" title="(.*?)" alt=',eachclass,re.S).group(1)
        info['content'] = re.search('display: none;">(.*?)</p>',eachclass,re.S).group(1)
        timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>',eachclass,re.S).group(1)
        return info

    # saveinfo appends the results to info.txt
    def saveinfo(self,classinfo):
        f = open('info.txt','a',encoding='utf-8')
        for each in classinfo:
            f.writelines('title:' + each['title'] + '\n')
            f.writelines('content:' + each['content'] + '\n')
            f.writelines('classtime:' + each['classtime'] + '\n')
            f.writelines('classlevel:' + each['classlevel'] + '\n')
            f.writelines('learnnum:' + each['learnnum'] + '\n\n')
        f.close()

if __name__ == '__main__':
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    all_links = jikespider.changepage(url,20)
    for link in all_links:
        print('Processing page:',link)
        html = jikespider.getsource(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)
    jikespider.saveinfo(classinfo)
    # print(classinfo)
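The getsource method above does no error handling. A small hypothetical variant (the helper name and behaviour are my own, not from the notes) that adds a timeout and a status check could look like this:

import requests

def getsource_safe(url, headers=None):
    # Hypothetical helper: fail gracefully instead of hanging
    # or silently returning an error page.
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()          # raise for 4xx/5xx responses
        return resp.text
    except requests.RequestException as e:
        print('Failed to fetch', url, ':', e)
        return ''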
2 XPath
#-*-coding:utf8-*-
from lxml import etree
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-常规用法</title>
</head>
<body>
<div id="content">
<ul id="useful">
<li>这是第一条信息</li>
<li>这是第二条信息</li>
<li>这是第三条信息</li>
</ul>
<ul id="useless">
<li>不需要的信息1</li>
<li>不需要的信息2</li>
<li>不需要的信息3</li>
</ul>
<div id="url">
<a href="http://jikexueyuan.com">极客学院</a>
<a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
</div>
</div>
</body>
</html>
'''
selector = etree.HTML(html)
# Extract text
content = selector.xpath('//ul[@id="useful"]/li/text()')
for each in content:
    print(each)
# Extract attribute values
link = selector.xpath('//a/@href')
for each in link:
    print(each)
title = selector.xpath('//a/@title')
print(title[0])
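As a small extra sketch (not in the original notes), you can also iterate over the <a> elements themselves and read the text and attributes together through lxml's element API:

for a in selector.xpath('//div[@id="url"]/a'):
    print(a.text, a.get('href'))   # link text and href attribute of each <a>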
Special usages
starts-with(@attribute-name, shared prefix of the attribute value)
Tags nested inside tags:
string(.)
#-*-coding:utf8-*-
from lxml import etree
html1 = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test-1">需要的内容1</div>
<div id="test-2">需要的内容2</div>
<div id="testfault">需要的内容3</div>
</body>
</html>
'''
html2 = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test3">
我左青龙,
<span id="tiger">
右白虎,
<ul>上朱雀,
<li>下玄武。</li>
</ul>
老牛在当中,
</span>
龙头在胸口。
</div>
</body>
</html>
'''
selector = etree.HTML(html1)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for each in content:
    print(each)
selector = etree.HTML(html2)
content_1 = selector.xpath('//div[@id="test3"]/text()')
for each in content_1:
    print(each)
data = selector.xpath('//div[@id="test3"]')[0]
info = data.xpath('string(.)')
content_2 = info.replace('\n','').replace(' ','')
print(content_2)
3 Multithreading
The map function takes care of iterating over the sequence, passing each item to the function and collecting the results, all in one call.
from multiprocessing.dummy import Pool
pool = Pool(4)
result = pool.map(crawl_function, url_list)
#-*-coding:utf8-*-
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
def getsource(url):
    html = requests.get(url)

urls = []
for i in range(1,21):
    newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
    urls.append(newpage)

time1 = time.time()  # start time
for i in urls:
    print(i)
    getsource(i)
time2 = time.time()  # end time
print('Single-threaded time: ' + str(time2-time1))

pool = ThreadPool(4)
time3 = time.time()
results = pool.map(getsource, urls)
pool.close()
pool.join()
time4 = time.time()
print('Parallel time: ' + str(time4-time3))
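One detail worth noting (an aside with a toy function, not part of the original notes): pool.map collects each call's return value and keeps the results in the same order as the input list.

from multiprocessing.dummy import Pool as ThreadPool

def length(url):
    return len(url)

pool2 = ThreadPool(4)
print(pool2.map(length, ['http://a.com', 'http://bb.com']))  # [12, 13]
pool2.close()
pool2.join()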
A multithreaded crawler for Baidu Tieba
#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
'''Delete content.txt before re-running: the file is opened in append mode, so old content piles up.'''
def towrite(contentdict):
    # f is the file handle opened in the __main__ block below
    f.writelines('Reply floor: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines('Reply content: ' + str(contentdict['topic_reply_content']) + '\n')
    f.writelines('Author: ' + contentdict['author_name'] + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//li[@class=" j_thread_list clearfix"]')
    # print(content_field)
    item = {}
    for each in content_field:
        reply_info = json.loads(each.xpath('@data-field')[0].replace('"',''))
        author = reply_info['author_name']
        content = each.xpath('div[@class="t_con cleafix"]/div/div/div/div[@class="threadlist_abs threadlist_abs_onlyline "]/text()')[0]
        reply_num = reply_info['reply_num']
        print(content)
        print(reply_num)
        print(author)
        item['author_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_num
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt','a',encoding='utf-8')
    page = []
    for i in range(1,21):
        newpage = 'http://tieba.baidu.com/f?kw=iphone&ie=utf-8&pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()