Python爬虫---队列模拟递归遍历(广度遍历)

最新推荐文章于 2023-04-01 13:30:57 发布

原创最新推荐文章于 2023-04-01 13:30:57 发布 · 1.1k 阅读

5 ·

CC 4.0 BY-SA版权

Python 专栏收录该内容

12 篇文章

订阅专栏

本文介绍了一个简单的网页爬虫程序，该程序利用Python的正则表达式(re)来从网页中提取URL链接及电子邮件地址。通过广度优先搜索(BFS)算法遍历页面链接，并使用urllib.request模块获取网页内容。

1.导入re模块：正则

2.导入urllib.request模块：爬虫

3.从collections模块导入deque：双端队列

4.extend()：列表末尾一次性追加另一个序列中的多个值

5.findall()：返回所有相匹配的子串，返回形式为列表

6.compile()：将正则表达式字符串编译为正则表达式对象（Pattern），便于重复使用

7.popleft()：队列为先进先出，popleft()移除并返回队列最左端（最先加入）的那个值

import urllib.request
import urllib
import re
from collections import deque
#http://bbs.tianya.cn/m/post-140-393974-4.shtml
#http://bbs.tianya.cn
#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
# Breadth-first traversal is implemented with a queue

def geteveryurl(data):
    """Collect every URL found in a page's source text.

    Gathers the absolute ``http://`` links returned by ``getallhttp`` and,
    when at least one exists, the ``href`` links that ``getabsurl`` resolves
    using the first absolute link as the base URL.

    Args:
        data: Page source text.

    Returns:
        A new list holding the absolute links followed by the resolved
        ``href`` links.
    """
    absolute_links = getallhttp(data)
    # Without any absolute link there is no base URL to resolve hrefs against.
    resolved_links = getabsurl(absolute_links[0], data) if absolute_links else []
    return [*absolute_links, *resolved_links]


#<a class="u-btn pre-btn" href="/m/post-140-393974-4.shtml"></a>
def getabsurl(url, data):
    """Resolve the relative ``href`` links of a page against a base URL.

    Extracts every ``href="..."`` value from ``data``, discards the values
    that contain ``http://`` (already absolute) or ``javascript``, and
    prefixes the remaining ones with the host part of ``url`` as returned by
    ``gethostname``.

    Args:
        url: A URL used to derive the ``http://host`` prefix.
        data: Page source text.

    Returns:
        The list of kept links, each prefixed with the host. When no host can
        be derived from ``url`` the links are returned without a prefix. An
        empty list is returned when ``data`` is not text.
    """
    try:
        hrefs = re.findall("href=\"(.*?)\"", data, re.IGNORECASE)
    except TypeError:
        # data is not a str (e.g. None): nothing to extract.
        return []

    # Filter in a single pass. Removing items one condition at a time raised
    # ValueError for a link matching both conditions (second remove of an
    # element that is already gone), which discarded every link on the page.
    relative_links = [
        link
        for link in hrefs
        if "http://" not in link and "javascript" not in link
    ]

    hostname = gethostname(url)
    if hostname is None:
        return relative_links
    return [hostname + link for link in relative_links]


#http://bbs.tianya.cn/post-140-393974-1.shtml'
#http://bbs.tianya.cn
def gethostname(httpstr):
    """Return the ``http://host`` prefix of the first URL found in a string.

    The match is case-insensitive and the matched text is returned with its
    original casing, without a trailing slash.

    Args:
        httpstr: Text containing an ``http://`` URL.

    Returns:
        The scheme-plus-host prefix, e.g. ``"http://bbs.tianya.cn"`` for
        ``"http://bbs.tianya.cn/post-1.shtml"``, or ``None`` when no
        ``http://`` URL is present or ``httpstr`` is not text.
    """
    try:
        with_path = re.search(r"(http://\S*?)/", httpstr, re.IGNORECASE)
        if with_path:
            return with_path.group(1)
        # A URL with no slash after the host (e.g. "http://bbs.tianya.cn")
        # still names a host; accept it instead of reporting None.
        bare_host = re.search(r"http://[^\s/]+", httpstr, re.IGNORECASE)
    except TypeError:
        # httpstr is not a str (e.g. None).
        return None
    return bare_host.group(0) if bare_host else None


def getallhttp(data):
    """Extract every absolute ``http://`` URL from a page's source text.

    A URL starts at ``http://`` (case-insensitive) and runs over
    non-whitespace characters up to, but not including, the first ``"``,
    ``|``, ``>`` or ``)``.

    Args:
        data: Page source text.

    Returns:
        List of matched URLs in order of appearance; an empty list when
        nothing matches or ``data`` is not text.
    """
    # NOTE: inside a character class "|" is a literal pipe, not alternation,
    # so a pipe also terminates a URL. Kept as-is to preserve existing results.
    url_pattern = r"(http://\S*?)[\"|>|)]"
    try:
        return re.findall(url_pattern, data, re.IGNORECASE)
    except TypeError:
        # data is not a str (e.g. None). Only this error is expected; a bare
        # except here would also swallow KeyboardInterrupt/SystemExit.
        return []





def getallemail(data):
    """Extract every e-mail address from a page's source text.

    Matching is case-insensitive. The final domain label is matched as two
    to four letters.

    Args:
        data: Page source text.

    Returns:
        List of matched addresses in order of appearance; an empty list when
        nothing matches or ``data`` is not text.
    """
    email_pattern = r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})"
    try:
        return re.findall(email_pattern, data, re.IGNORECASE)
    except TypeError:
        # data is not a str (e.g. None). Only this error is expected; a bare
        # except here would also swallow KeyboardInterrupt/SystemExit.
        return []




def getdata(url):
    """Fetch a URL and return its body decoded as UTF-8.

    Args:
        url: The URL to download.

    Returns:
        The decoded page text, or an empty string when the URL cannot be
        opened, read, or decoded as UTF-8.
    """
    try:
        # The context manager closes the response (and its socket) even when
        # read() or decode() fails, so a long crawl does not leak connections.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")
    except Exception:
        # Best-effort fetch: any failure for a single page yields "" so the
        # caller can move on. Unlike a bare except, this lets
        # KeyboardInterrupt/SystemExit propagate so the crawl can be stopped.
        return ""



def BFS(urlstr):
    """Crawl pages breadth-first from a seed URL, printing URLs and e-mails.

    Each URL taken from the queue is printed and downloaded with ``getdata``.
    Every e-mail address found by ``getallemail`` is printed, and every link
    found by ``geteveryurl`` that has not been seen before is appended to the
    queue. The function returns when the queue is empty.

    Args:
        urlstr: The URL to start crawling from.
    """
    urlqueue = deque([urlstr])
    # Every URL ever enqueued. Checking only the pending queue would let a
    # page that was already crawled be queued again (it is no longer in the
    # queue once popped), so pages linking to each other would loop forever.
    # A set also makes the membership test O(1) instead of scanning the deque.
    seen = {urlstr}

    while urlqueue:
        url = urlqueue.popleft()  # oldest entry first: breadth-first order
        print(url)
        pagedata = getdata(url)

        for email in getallemail(pagedata):
            print(email)

        for newurl in geteveryurl(pagedata):
            if newurl not in seen:
                seen.add(newurl)
                urlqueue.append(newurl)



# Alternative seed URL (disabled):
#BFS("http://bbs.tianya.cn/m/post-140-393974-5.shtml")
# Start the breadth-first crawl from this seed URL.
# NOTE(review): there is no __main__ guard, so this call runs whenever the
# module is executed or imported.
BFS("http://www.baidu.com/")