Python网页静态爬虫

最新推荐文章于 2024-08-26 07:16:00 发布

原创最新推荐文章于 2024-08-26 07:16:00 发布 · 2.9k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#Python3.5 #爬虫 #url

Python 专栏收录该内容

17 篇文章

订阅专栏

本文基于慕课网的视频教程，抓取百度百科中1000条词条页面信息。

编程环境：Python3.5

爬虫由以下四个部分组成：URL管理器、下载器、解析器、输出器。整体流程如下：

（1）读取要爬取的网页URL，可命名为root_url

（2）解析root_url网页中的内容，并将其中包含的其他url存进url管理器

（3）输出HTML文件，包含url、title、summary等信息

下面将配合代码的形式，讲解如何爬取网页信息。

主函数：

# -*- coding:utf-8 -*-
import url_manager     #导入URL管理器
import html_download   #导入下载器
import html_parser     #导入解析器
import html_outputer   #导入输出器
class SpiderMain(object):
    """Drive the crawl by wiring together the URL manager, downloader, parser and outputer."""

    def __init__(self):
        """Create one instance of each collaborating component."""
        self.urls = url_manager.UrlManage()
        self.downloader = html_download.Downloader()
        self.parser = html_parser.Parser()
        self.outputer = html_outputer.Outputer()

    def crawl(self, root_url, max_count=1000):
        """Crawl pages starting at ``root_url`` and hand the results to the outputer.

        Args:
            root_url: URL of the first page to fetch.
            max_count: Stop once this many pages have been crawled
                successfully (default 1000). The loop always attempts at
                least one page.
        """
        count = 1
        self.urls.add_new_url(root_url)  # seed the URL manager
        # Each successfully parsed page feeds its links back into the URL
        # manager, so the loop normally ends through the max_count check.
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crawl %d : %s' % (count, new_url))
                html_content = self.downloader.download(new_url)
                parsed = self.parser.parse(new_url, html_content)
                if parsed is None:
                    # The parser returns None when there is nothing to parse
                    # (for example, the download yielded no content).
                    print("crawl failed: nothing to parse for %s" % new_url)
                    continue
                urls, data = parsed
                self.urls.add_new_urls(urls)      # queue the links found on this page
                self.outputer.collect_data(data)  # keep the record for the final report
                if count >= max_count:
                    break
                count += 1
            except Exception as exc:
                # Catch Exception rather than everything, so Ctrl-C and
                # SystemExit still stop the crawl. A failed page is not counted.
                print("crawl failed: %s" % (exc,))
        self.outputer.output()  # write everything collected so far
if __name__ == "__main__":  # run the crawler only when executed as a script
    # Entry page from which the crawl starts.
    root_url = "http://baike.baidu.com/view/21087.htm"
    SpiderMain().crawl(root_url)

url管理器：


class UrlManage(object):
    """Keep track of URLs that still have to be crawled and those already handed out."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already returned by get_new_url

    def add_new_url(self, url):
        """Queue a single URL unless it is None or already known."""
        if url is None or url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def has_new_url(self):
        """Return True while at least one URL is waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one pending URL, record it as handed out, and return it."""
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked

    def add_new_urls(self, urls):
        """Queue every URL of a sized collection; None or an empty one is ignored."""
        if urls is not None and len(urls) != 0:
            for candidate in urls:
                self.add_new_url(candidate)
下载器：

from urllib import request

class Downloader():
    """Fetch the raw content of a page."""

    def download(self, url):
        """Return the body of ``url`` as bytes, or None when nothing usable was fetched.

        Args:
            url: Address to open; ``None`` yields ``None``.

        Returns:
            The response body, or ``None`` if ``url`` is ``None`` or the
            response status code is not 200.

        Note:
            Errors raised by ``urlopen`` (for example on an unreachable
            host) are not caught here and propagate to the caller.
        """
        if url is None:
            return None
        # The context manager closes the connection even if read() raises,
        # so a long crawl does not leak one open response per page.
        with request.urlopen(url) as response:
            if response.getcode() != 200:  # anything but 200 counts as a failed fetch
                return None
            return response.read()  # the whole page, as returned by the server

解析器：

from bs4 import BeautifulSoup  #使用网页解析器BeautifulSoup4解析下载后的信息
import re
from urllib import parse

class Parser():
    """Extract follow-up links and the title/summary record from a downloaded page."""

    # Matches hrefs that contain an entry link of the form /view/123.htm.
    _ENTRY_LINK = re.compile(r'/view/\d+\.htm')

    def get_urls(self, page_url, soup):
        """Return the set of absolute entry URLs linked from the page.

        Args:
            page_url: URL of the page being parsed; relative hrefs are
                resolved against it.
            soup: Parsed document offering ``find_all``.
        """
        links = soup.find_all('a', href=self._ENTRY_LINK)
        # hrefs are site-relative, so join them with the page URL to get full URLs.
        return {parse.urljoin(page_url, link['href']) for link in links}

    def get_data(self, page_url, soup):
        """Return a dict with the page's url, title and summary.

        Returns ``None`` when the page has no title or no summary element,
        so such a page produces no record instead of raising
        AttributeError on a missing node.
        """
        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        title = title_box.find("h1") if title_box is not None else None
        summary = soup.find('div', class_="lemma-summary")
        if title is None or summary is None:
            return None
        data = {}
        data['url'] = page_url
        data['title'] = title.get_text()
        data['summary'] = summary.get_text()
        return data

    def parse(self, page_url, html_cont):
        """Parse a downloaded page.

        Args:
            page_url: URL the content was downloaded from.
            html_cont: Raw page content.

        Returns:
            ``None`` if either argument is ``None``; otherwise a tuple
            ``(urls, data)`` where ``urls`` is the set of links found and
            ``data`` is the record from ``get_data`` (possibly ``None``).
        """
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf8')
        urls = self.get_urls(page_url, soup)
        data = self.get_data(page_url, soup)
        return urls, data

输出器：

# -*- coding:utf-8 -*-
import string
class Outputer():
    """Collect crawled records and write them out as an HTML table."""

    def __init__(self):
        self.datas = []  # records, in the order they were collected

    def collect_data(self, data):
        """Store one record; ``None`` is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output(self, path='output.html'):
        """Write all collected records to an HTML file.

        Args:
            path: Destination file; defaults to ``output.html`` in the
                current working directory.

        Each record must provide the keys ``url``, ``title`` and
        ``summary``. Values are HTML-escaped because they come from
        scraped pages and could otherwise break or inject markup.
        """
        from html import escape  # local import keeps this class self-contained

        # "with" closes the file even if a record is malformed mid-write.
        with open(path, 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            # Declaring the charset stops browsers from showing garbled
            # Chinese text when they open the file.
            fout.write("<head>")
            fout.write('<meta charset="utf-8">')
            fout.write("</head>")
            fout.write('<body>')
            fout.write('<table>')
            for data in self.datas:
                fout.write('<tr>')  # one table row per record
                for key in ('url', 'title', 'summary'):
                    fout.write('<td>%s</td>' % escape(str(data[key])))
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')

对网页解析器BeautifulSoup的补充说明，举例如下：

import re
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Arguments: the HTML document string, then the name of the parser to use.
# from_encoding is left out on purpose: it only applies to bytes input, and
# html_doc is already a str, so Beautiful Soup would ignore it with a warning.
soup = BeautifulSoup(html_doc, 'html.parser')

# Searching nodes: find_all(name, attrs, string)
# Find every node whose tag is "a".
print ('获取所有带a的链接')
links = soup.find_all('a')
for link in links:
    print (link.name, link['href'], link.get_text())
# Find every "a" node whose link has the form '/view/123.htm', e.g.
#   soup.find_all('a', href='/view/123.htm')
print ('获取带tillie的链接')
link1 = soup.find('a', href='http://example.com/tillie')
print (link1.name, link1['href'], link1.get_text())
# Find every "a" node with class "abc" and text "Python"
# (shown for illustration only; the result is not used).
soup.find_all('a', class_='abc', string='Python')
print ('获取带lsi的链接')
link2 = soup.find('a', href=re.compile(r'lsi'))  # match the href with a regular expression
print (link2.name, link2['href'], link2.get_text())
print ('获取带p的链接')
link3 = soup.find('p', class_="story")
print (link3.name, link3.get_text())

得到结果如下：

获取所有带a的链接
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie
获取带tillie的链接
a http://example.com/tillie Tillie
获取带lsi的链接
a http://example.com/elsie Elsie
获取带p的链接
p Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.

参考自：Python爬虫----网页解析器和BeautifulSoup第三方模块

接着，对urljoin函数的补充说明，举例如下：

# Minimal urljoin example
from urllib import parse

base_url = 'http://baike.baidu.com/view/21087.htm'
relative_link = '/view/53557.htm'
# Mind the leading "/" in the second argument; try a few variants to see
# how it changes the joined result.
print (parse.urljoin(base_url, relative_link))

得到结果：

http://baike.baidu.com/view/53557.htm

一开始，按照视频教程写代码，发现输出的html文件中的中文无法正常显示（读取不到，或者出现乱码），为此在输出器中添加了以下三行代码：

fout.write("<head>")   # these three lines fix garbled Chinese text in the generated HTML file
        fout.write('<meta charset="utf-8">')  # these three lines fix garbled Chinese text in the generated HTML file
        fout.write("</head>")  # these three lines fix garbled Chinese text in the generated HTML file

参考自：爬虫显示中文乱码

最后，爬取得到的结果如下（html文件）：