1 Regular expressions
. : matches any single character except the newline \n
* : matches the preceding character 0 or more times
? : matches the preceding character 0 or 1 time
.* : greedy matching (grabs as much as possible)
.*? : non-greedy matching (grabs as little as possible)
() : only the content captured inside the parentheses is returned as the result
findall: matches everything that fits the pattern and returns the results as a list
search: matches and extracts the first thing that fits the pattern, returning a match object
sub: replaces the content that fits the pattern and returns the resulting string
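A minimal sketch (assuming Python 3 and the standard re module, not part of the original notes) of the three functions above; note that search returns None when nothing matches, so check before calling .group():

import re
text = 'xxIxxfasdxxlovexx'
print(re.findall('xx(.*?)xx', text))      # ['I', 'love'] -- every match, as a list
m = re.search('xx(.*?)xx', text)
if m:                                     # search returns None if there is no match
    print(m.group(1))                     # 'I' -- text captured by the first group
print(re.sub('xx', '-', text))            # '-I-fasd-love-' -- the replaced string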
import re
secret_code = 'hadkfalifexxIxxfasdjifja134xxlovexx23345sdfxxyouxx8dfse'
# . acts as a placeholder: each . matches exactly one character after the x
a='xy123'
b=re.findall('x.',a)
print(b)
# Using *: x* matches 'x' repeated 0 or more times, so positions without x yield empty strings
a='xyxy123'
b=re.findall('x*',a)
print(b)
# Using ?: x? matches 'x' 0 or 1 time
a='xyxy123'
b=re.findall('x?',a)
print(b)
# Using .*: greedy matching grabs as much as possible between the outermost pair of xx
b=re.findall('xx.*xx',secret_code)
print(b)
# Using .*?: non-greedy matching returns every shortest match, each including the xx on both ends
c = re.findall('xx.*?xx',secret_code)
print(c)
# With parentheses only the captured group is returned: the content between the xx, without the xx itself
d = re.findall('xx(.*?)xx',secret_code)
print(d)
for each in d:
    print(each)
s = '''sdfxxhello
xxfsdfxxworldxxasdf'''
d = re.findall('xx(.*?)xx',s,re.S)  # re.S makes . match newlines as well
print(d)
# Comparing findall with search
s2 = 'asdfxxIxx123xxlovexxdfd'
f = re.search('xx(.*?)xx123xx(.*?)xx',s2).group(2)  # group(2) returns the text captured by the second pair of parentheses
print(f)
f2 = re.findall('xx(.*?)xx123xx(.*?)xx',s2)
print(f2[0][1])
# Example of sub
s = '123rrrrr123'
output = re.sub('123(.*?)123','123%d123'%789,s)
print(output)
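A related trick (an aside, not from the original notes): re.sub also accepts a function as the replacement, which is handy when the new text depends on what was matched.

# Toy example: double every number found in the string.
print(re.sub(r'\d+', lambda m: str(int(m.group(0)) * 2), 'a1b22c'))  # prints a2b44c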
# Demonstrating a different import style (e.g. from re import findall, S)
#info = findall('xx(.*?)xx',secret_code,S)
#for each in info:
# print(each)
# There is no need to use compile here
# pattern = 'xx(.*?)xx'
# new_pattern = re.compile(pattern,re.S)
# output = re.findall(new_pattern,secret_code)
# print(output)
# Matching digits
a = 'asdfasf1234567fasd555fas'
b = re.findall(r'(\d+)',a)
print(b)
Applying regular expressions:
import re
old_url='http://www.jikexueyuan.com/course/android/?pageNum=2'
total_page=20
f=open('text.txt','r',encoding='utf-8')
html=f.read()
f.close()
# Extract the title
title=re.search('<title>(.*?)</title>',html,re.S).group(1)
print(title)
# Extract all links
links = re.findall('href="(.*?)"',html,re.S)
for each in links:
    print(each)
# Extract a block of text: match the large block first, then the smaller pieces inside it
text_fied = re.findall('<ul>(.*?)</ul>',html,re.S)[0]
the_text = re.findall('">(.*?)</a>',text_fied,re.S)
for every_text in the_text:
    print(every_text)
# Use sub to generate the links for the following pages
for i in range(2,total_page+1):
    new_link = re.sub(r'pageNum=\d+','pageNum=%d'%i,old_url)  # re.sub's 4th positional argument is count, not flags, so re.S is dropped here
    print(new_link)
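A quick aside (not in the original notes): in re.sub the fourth positional parameter is count, not flags, so flags such as re.S or re.I have to be passed by keyword. A minimal sketch:

import re
s = 'pageNum=1 pageNum=2'
print(re.sub(r'pageNum=\d+', 'pageNum=9', s, count=1))     # pageNum=9 pageNum=2  (only the first occurrence)
print(re.sub(r'pagenum=\d+', 'pageNum=9', s, flags=re.I))  # pageNum=9 pageNum=9  (case-insensitive, flag passed by keyword)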
Scraping a page with regular expressions
import requests
import re
class spider(object):
    def __init__(self):
        print('Start crawling...')

    # getsource fetches the page source
    def getsource(self,url):
        my_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',}
        html = requests.get(url,headers=my_header)
        return html.text

    # changepage builds the links for the different page numbers
    def changepage(self,url,total_page):
        now_page = int(re.search(r'pageNum=(\d+)',url,re.S).group(1))
        page_group = []
        for i in range(now_page,total_page+1):
            link = re.sub(r'pageNum=\d+','pageNum=%s'%i,url)  # replace the page number in the url
            page_group.append(link)
        return page_group

    # geteveryclass extracts the block of markup for each course
    def geteveryclass(self,source):
        everyclass = re.findall('(<li id=".*?</li>)',source,re.S)
        return everyclass

    # getinfo pulls the fields we need out of one course block
    def getinfo(self,eachclass):
        info = {}
        info['title'] = re.search('class="lessonimg" title="(.*?)" alt=',eachclass,re.S).group(1)
        info['content'] = re.search('display: none;">(.*?)</p>',eachclass,re.S).group(1)
        timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>',eachclass,re.S).group(1)
        return info

    # saveinfo appends the results to info.txt
    def saveinfo(self,classinfo):
        f = open('info.txt','a',encoding='utf-8')
        for each in classinfo:
            f.writelines('title:' + each['title'] + '\n')
            f.writelines('content:' + each['content'] + '\n')
            f.writelines('classtime:' + each['classtime'] + '\n')
            f.writelines('classlevel:' + each['classlevel'] + '\n')
            f.writelines('learnnum:' + each['learnnum'] + '\n\n')
        f.close()

if __name__ == '__main__':
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    all_links = jikespider.changepage(url,20)
    for link in all_links:
        print('Processing page:',link)
        html = jikespider.getsource(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)
    jikespider.saveinfo(classinfo)
    # print(classinfo)
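The getsource method above does no error handling. A small hypothetical variant (the helper name and behaviour are my own, not from the notes) that adds a timeout and a status check could look like this:

import requests

def getsource_safe(url, headers=None):
    # Hypothetical helper: fail gracefully instead of hanging
    # or silently returning an error page.
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()          # raise for 4xx/5xx responses
        return resp.text
    except requests.RequestException as e:
        print('Failed to fetch', url, ':', e)
        return ''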
2 XPath
#-*-coding:utf8-*-
from lxml import etree
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-常规用法</title>
</head>
<body>
<div id="content">
<ul id="useful">
<li>这是第一条信息</li>
<li>这是第二条信息</li>
<li>这是第三条信息</li>
</ul>
<ul id="useless">
<li>不需要的信息1</li>
<li>不需要的信息2</li>
<li>不需要的信息3</li>
</ul>
<div id="url">
<a href="http://jikexueyuan.com">极客学院</a>
<a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
</div>
</div>
</body>
</html>
'''
selector = etree.HTML(html)
# Extract text
content = selector.xpath('//ul[@id="useful"]/li/text()')
for each in content:
    print(each)
# Extract attribute values
link = selector.xpath('//a/@href')
for each in link:
    print(each)
title = selector.xpath('//a/@title')
print(title[0])
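As a small extra sketch (not in the original notes), you can also iterate over the <a> elements themselves and read the text and attributes together through lxml's element API:

for a in selector.xpath('//div[@id="url"]/a'):
    print(a.text, a.get('href'))   # link text and href attribute of each <a>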
Special usages
starts-with(@attribute-name, shared prefix of the attribute value)
Tags nested inside tags:
string(.)
#-*-coding:utf8-*-
from lxml import etree
html1 = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test-1">需要的内容1</div>
<div id="test-2">需要的内容2</div>
<div id="testfault">需要的内容3</div>
</body>
</html>
'''
html2 = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test3">
我左青龙,
<span id="tiger">
右白虎,
<ul>上朱雀,
<li>下玄武。</li>
</ul>
老牛在当中,
</span>
龙头在胸口。
</div>
</body>
</html>
'''
selector = etree.HTML(html1)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for each in content:
    print(each)
selector = etree.HTML(html2)
content_1 = selector.xpath('//div[@id="test3"]/text()')
for each in content_1:
    print(each)
data = selector.xpath('//div[@id="test3"]')[0]
info = data.xpath('string(.)')
content_2 = info.replace('\n','').replace(' ','')
print(content_2)
3 Multithreading
The map function takes care of iterating over the sequence, passing each item to the function and collecting the results, all in one call.
from multiprocessing.dummy import Pool
pool = Pool(4)
result = pool.map(crawl_function, url_list)
#-*-coding:utf8-*-
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
def getsource(url):
    html = requests.get(url)

urls = []
for i in range(1,21):
    newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
    urls.append(newpage)

time1 = time.time()  # start time
for i in urls:
    print(i)
    getsource(i)
time2 = time.time()  # end time
print('Single-threaded time: ' + str(time2-time1))

pool = ThreadPool(4)
time3 = time.time()
results = pool.map(getsource, urls)
pool.close()
pool.join()
time4 = time.time()
print('Parallel time: ' + str(time4-time3))
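One detail worth noting (an aside with a toy function, not part of the original notes): pool.map collects each call's return value and keeps the results in the same order as the input list.

from multiprocessing.dummy import Pool as ThreadPool

def length(url):
    return len(url)

pool2 = ThreadPool(4)
print(pool2.map(length, ['http://a.com', 'http://bb.com']))  # [12, 13]
pool2.close()
pool2.join()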
A multithreaded crawler for Baidu Tieba
#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
'''Delete content.txt before re-running: the file is opened in append mode, so old content piles up.'''
def towrite(contentdict):
    # f is the file handle opened in the __main__ block below
    f.writelines('Reply floor: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines('Reply content: ' + str(contentdict['topic_reply_content']) + '\n')
    f.writelines('Author: ' + contentdict['author_name'] + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//li[@class=" j_thread_list clearfix"]')
    # print(content_field)
    item = {}
    for each in content_field:
        reply_info = json.loads(each.xpath('@data-field')[0].replace('"',''))
        author = reply_info['author_name']
        content = each.xpath('div[@class="t_con cleafix"]/div/div/div/div[@class="threadlist_abs threadlist_abs_onlyline "]/text()')[0]
        reply_num = reply_info['reply_num']
        print(content)
        print(reply_num)
        print(author)
        item['author_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_num
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt','a',encoding='utf-8')
    page = []
    for i in range(1,21):
        newpage = 'http://tieba.baidu.com/f?kw=iphone&ie=utf-8&pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()