Web Crawler Study Notes

This post walks through a Python web crawler for the Sina News site. By inspecting the page structure, it uses the requests and BeautifulSoup libraries to scrape news titles, sources, comment counts, and other fields.


1. Building a Sina News crawler

http://study.163.com/course/courseMain.htm?courseId=1003285002

In Chrome, right-click → Inspect, open Network → Doc, reload the page, and look at the first request.

Use "Inspect element" to locate the tag that holds each piece of data.

import requests
from bs4 import BeautifulSoup
newsurl='http://news.sina.com.cn/china/'
res=requests.get(newsurl)    #Chrome DevTools shows this is a GET request; custom headers such as a User-Agent can also be set here
res.encoding='UTF-8'    #set the encoding explicitly to avoid garbled characters

#print(res.text)     #uncomment to inspect the raw HTML structure
soup=BeautifulSoup(res.text,'html.parser')

for news in soup.select('.news-item'):    #CSS selectors: prefix an id with "#", a class with "."

    if len(news.select('h2'))>0:
        h2=news.select('h2')[0].text
        time=news.select('.time')[0].text
        a=news.select('a')[0]['href']           #this grabs the link; use .text instead to grab the text content
        print(h2,a,time)
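
The comment above notes that request headers can be customized; a minimal sketch of passing a browser-like User-Agent to requests.get (the header value here is just an illustrative string):

import requests

headers={'User-Agent':'Mozilla/5.0'}    #illustrative browser-like User-Agent string
res=requests.get('http://news.sina.com.cn/china/',headers=headers)
res.encoding='UTF-8'
print(res.status_code)    #200 means the request succeeded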

----------------------------------------------------------------------------------------


import requests
from datetime import datetime
from bs4 import BeautifulSoup
newsurl='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
res=requests.get(newsurl)
res.encoding='UTF-8'
soup=BeautifulSoup(res.text,'html.parser')
bodyTitle=soup.select('#artibodyTitle')[0].text
timesource=soup.select('.time-source')[0].contents[0].strip()
#dt=datetime.strptime(timesource,'%Y%m%d%H:%M')    #the format string must match the timestamp text on the page
source=soup.select('.time-source span a')[0].text
article=[]
for p in soup.select('#artibody p')[:-1]:    #skip the last <p>, which holds the editor line
    article.append(p.text.strip())
''.join(article)
''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])    #same result as a one-liner
#soup.select('.article-editor')[0].text.lstrip('责任编辑')    #strip the "责任编辑" (editor-in-charge) prefix
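
Pulling the pieces above together: a minimal sketch (reusing the bodyTitle, source, timesource, and article variables just defined) that builds one dictionary for the article. The '%Y年%m月%d日%H:%M' format is an assumption about how the page prints its timestamp, so adjust it if the page differs.

from datetime import datetime

detail={}
detail['title']=bodyTitle
detail['source']=source
detail['time']=datetime.strptime(timesource,'%Y年%m月%d日%H:%M')    #assumed format, e.g. "2017年05月13日09:01"
detail['article']=' '.join(article)
print(detail['title'],detail['time'],detail['source'])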


--------------------------------------------------------------------------------------------------------------------------

In DevTools, look through the Network → JS requests to find the comment API URL.

import requests
import json
newsurl='http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-fyfecvz1234039&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
comments=requests.get(newsurl)
comments.encoding='UTF-8'
jd=json.loads(comments.text.strip('var data='))    #the response body is "var data={...}", so peel off the JS prefix before parsing
jd['result']['count']['total']    #total number of comments
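
Note that str.strip('var data=') removes a set of characters from both ends rather than the literal prefix; it happens to work for this response, but splitting on the first '=' is a safer way to peel off the JS wrapper. A small sketch reusing the comments response above:

import json

payload=comments.text.split('=',1)[1]    #drop the "var data" prefix, keep everything after the first '='
jd=json.loads(payload)
print(jd['result']['count']['total'])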


---------------------------------------------------------------------------------------------

newsurl='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
newid=newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')    #note: rstrip/lstrip remove character *sets*, not literal suffixes/prefixes, so this only works by coincidence here
print(newid)


import re
m=re.search('doc-i(.*).shtml',newsurl)    #regex extraction is more robust
newsid=m.group(1)
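
A quick sanity check, assuming the URL layout above, that slicing the fixed prefix/suffix off the filename and the regex approach return the same id (explicit slicing avoids the rstrip/lstrip character-set pitfall):

import re

newsurl='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
filename=newsurl.split('/')[-1]                      #'doc-ifyfecvz1234039.shtml'
sliced_id=filename[len('doc-i'):-len('.shtml')]      #cut the literal prefix and suffix
regex_id=re.search('doc-i(.+).shtml',newsurl).group(1)
print(sliced_id==regex_id)    #True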


-----------------------------------------------------------------------------------------------------------

import re
import json
import requests
commentsurl='http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'


def getCommentCounts(newsurl):
    m=re.search('doc-i(.+).shtml',newsurl)
    newsid=m.group(1)
    comments=requests.get(commentsurl.format(newsid))  #substitute newsid into the {} placeholder in commentsurl
    jd=json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']


news='http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
getCommentCounts(news)


------------------------------------------------------------------------------

Final result



import requests
from bs4 import BeautifulSoup
#getCommentCounts from the previous section must also be defined


def getNewsDetail(newsurl):
    result={}
    res=requests.get(newsurl)
    res.encoding='utf-8'
    soup=BeautifulSoup(res.text,'html.parser')
    result['title']=soup.select('#artibodyTitle')[0].text
    result['newssource']=soup.select('.time-source')[0].text
    result['comments']=getCommentCounts(newsurl)
    return result


getNewsDetail('http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml')
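
Putting the two halves together: a sketch, under the same page-structure assumptions as above, that walks the channel index page and collects the details of every linked article (getNewsDetail and getCommentCounts from the previous sections must be defined):

import requests
from bs4 import BeautifulSoup

res=requests.get('http://news.sina.com.cn/china/')
res.encoding='UTF-8'
soup=BeautifulSoup(res.text,'html.parser')

details=[]
for news in soup.select('.news-item'):
    if len(news.select('h2'))>0:
        link=news.select('a')[0]['href']
        details.append(getNewsDetail(link))
print(len(details),'articles scraped')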


2. Developing a simple crawler with Python

http://www.imooc.com/video/10680


Fetching page content

import urllib2    #Python 2 only; in Python 3 this module became urllib.request
#issue the request directly
response=urllib2.urlopen('http://www.baidu.com')
#get the status code; 200 means success
print response.getcode()
cont=response.read()
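
The imooc course targets Python 2; under Python 3 the same request looks roughly like this (urllib2 was folded into urllib.request):

from urllib import request

response=request.urlopen('http://www.baidu.com')
print(response.getcode())    #200 means success
cont=response.read()         #raw bytes; call .decode() as needed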



-------------------

import urllib2
url='http://www.baidu.com'
request=urllib2.Request(url)
request.add_data('a')    #attach POST data
request.add_header('User-Agent','Mozilla/5.0')    #pretend to be a browser
response=urllib2.urlopen(request)
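
A rough Python 3 equivalent (add_data was removed, so the POST body and headers are passed to the Request constructor instead):

from urllib import request as urlrequest

url='http://www.baidu.com'
req=urlrequest.Request(url,
                       data=b'a',                              #POST body must be bytes
                       headers={'User-Agent':'Mozilla/5.0'})   #pretend to be a browser
response=urlrequest.urlopen(req)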


-------------------------------------

import urllib2,cookielib
#create a cookie container
cj=cookielib.CookieJar()
#build an opener that uses it
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
#install the opener into urllib2
urllib2.install_opener(opener)
response=urllib2.urlopen('http://www.baidu.com')
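
Under Python 3 the same cookie handling uses http.cookiejar together with urllib.request; a minimal sketch:

from urllib import request
from http import cookiejar

cj=cookiejar.CookieJar()                                    #cookie container
opener=request.build_opener(request.HTTPCookieProcessor(cj))
request.install_opener(opener)                              #later urlopen calls reuse the cookies
response=request.urlopen('http://www.baidu.com')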


--------------------------------------------------------------------------------------------------------

from bs4 import BeautifulSoup
import re
html_doc ='''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>
'''


# build a BeautifulSoup object from the HTML
soup=BeautifulSoup(html_doc, #the document string
                   'html.parser', #the parser to use
                   from_encoding='utf-8') #encoding hint (only relevant when the input is bytes)


links=soup.find_all('a')
for link in links:
    print(link.name,link['href'],link.get_text())


link_node=soup.find('a',href="http://example.com/lacie")
print(link_node.name,link_node['href'],link_node.get_text())


link_node=soup.find('a',href=re.compile(r'ill'))
print(link_node.name,link_node['href'],link_node.get_text())


p_node=soup.find('p',class_='title')  #class is a Python keyword, so BeautifulSoup uses class_ instead
print(p_node.name,p_node.get_text())
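
The same lookups can also be written with CSS selectors via soup.select, which is what the Sina example in section 1 relies on; a small sketch against the html_doc above:

print(soup.select('#link2')[0].get_text())       #"#" targets an id -> Lacie
print(soup.select('p.title b')[0].get_text())    #"." targets a class -> The Dormouse's story
for a in soup.select('a.sister'):
    print(a['href'])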


----------------------------------------------------------------------------------------

Final version

Crawler_main.py

import URLManager, HTMLDownloader, HTMLParser, HTMLOutputer
class CrawlerMain(object):
    def __init__(self):
        self.urls = URLManager.UrlManager()                     # initialize the URL manager
        self.downloader = HTMLDownloader.HtmlDownloader()       # initialize the HTML downloader
        self.parser = HTMLParser.HtmlParser()                   # initialize the HTML parser
        self.outputer = HTMLOutputer.HtmlOutputer()             # initialize the HTML outputer
    def crawl(self, root_url):
        count = 1        # number of pages crawled
        self.urls.add_new_url(root_url)                 # seed the manager with the entry URL
        while self.urls.has_new_url():                  # keep crawling while the URL pool is not empty
            try:
                new_url = self.urls.get_new_url()           # take the next URL to download
                print('crawl %d: %s' % (count, new_url))    # report which page is being crawled
                html_cont = self.downloader.download(new_url)      # download the page
                new_urls, new_data = self.parser.hparse(new_url, html_cont)  # extract new URLs and page data
                self.urls.add_new_urls(new_urls)            # feed the new URLs back into the manager
                self.outputer.collect_data(new_data)        # collect the data
                if count == 10:
                    break
                count = count + 1
            except:
                print('Crawl Failed')
        self.outputer.output_html()   # write the collected data out as HTML
if __name__ == '__main__':
    root_url = "http://baike.baidu.com/item/Python"         # entry URL
    obj_crawler = CrawlerMain()                             # create a crawler instance
    obj_crawler.crawl(root_url)                             # start crawling
---

HTMLDownloader.py
from urllib import request
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # 打开网页
        response = request.urlopen(url)
        if response.getcode() != 200:
            # 打开失败返回None
            return None
        else:
            # 打开成功返回网页内容
            return response.read().decode("utf-8")
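
An alternative downloader sketch, not part of the course code, using requests with a timeout and basic error handling (RequestsDownloader is a hypothetical drop-in replacement for HtmlDownloader):

import requests

class RequestsDownloader(object):
    def download(self, url):
        if url is None:
            return None
        try:
            res = requests.get(url, timeout=10)   # fail fast on unresponsive servers
        except requests.RequestException:
            return None
        if res.status_code != 200:
            return None
        res.encoding = 'utf-8'
        return res.text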
----
URLManager.py
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            # only add the URL if it has not been queued or visited before
            self.new_urls.add(url)
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # add each URL in the list to the pending pool (reusing the dedup check above)
            self.add_new_url(url)
    def has_new_url(self):
        # whether the pool of unvisited URLs is non-empty
        return len(self.new_urls) != 0
    def get_new_url(self):
        # take one unvisited URL, mark it as visited, and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
------------
HTMLParser.py
from bs4 import BeautifulSoup
import re
from urllib import parse
class HtmlParser(object):
    # page_url is the page's URL, html_cont is the downloaded page content
    def hparse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        # parse the page content with BeautifulSoup
        soup = BeautifulSoup(html_cont, 'html.parser')
        # collect the URLs contained in the page
        new_urls = self._get_new_urls(page_url, soup)
        # extract the data we want from the page
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # fuzzy-match entry links with a regular expression
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # join the relative href into a full URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls
    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
------------
HTMLOutputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []
    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)
    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')

        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        # one table row per crawled page
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")

        # close the tags in the reverse order they were opened
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()



 








