1. Building a Sina News web crawler
http://study.163.com/course/courseMain.htm?courseId=1003285002
In the Chrome browser, open Inspect -> Network -> Doc, reload the page, and look at the first request.
Use Inspect Element to locate the tags that hold the content you want.
import requests
from bs4 import BeautifulSoup

newsurl = 'http://news.sina.com.cn/china/'
res = requests.get(newsurl)        # Chrome's Network panel shows this is a GET request; extra headers (e.g. a user agent) could be passed here
res.encoding = 'UTF-8'             # avoid garbled characters
# print res.text                   # inspect the page structure midway
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):      # prefix "#" to select by id, "." to select by class
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        a = news.select('a')[0]['href']     # this takes the link; use .text to take the content instead
        print h2, a, time
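The comment above notes that request headers can be supplied at this step. A minimal sketch of passing a browser-like User-Agent to requests.get; the header value here is only illustrative, not something the page is known to require:

import requests

# Illustrative header; some sites reject requests without a browser-like User-Agent.
headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get('http://news.sina.com.cn/china/', headers=headers)
res.encoding = 'UTF-8'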
----------------------------------------------------------------------------------------
import requests
from datetime import datetime
from bs4 import BeautifulSoup

newsurl = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
res = requests.get(newsurl)
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
bodyTitle = soup.select('#artibodyTitle')[0].text
timesource = soup.select('.time-source')[0].contents[0].strip()
# dt = datetime.strptime(timesource, '%Y%m%d%H:%M')   # parse the timestamp; the format string must match the page's date layout
source = soup.select('.time-source span a')[0].text
article = []
for p in soup.select('#artibody p')[:-1]:              # skip the last paragraph (the editor line)
    article.append(p.text.strip())
''.join(article)
''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])   # one-line equivalent of the loop above
# soup.select('.article-editor')[0].text.lstrip('责任编辑')           # editor name, with the "责任编辑" (editor-in-charge) prefix stripped
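The commented-out strptime line works once the format string matches the page's timestamp. A small sketch, assuming the time string looks like '2017年05月13日09:08' (an assumption; confirm the real layout on the article page):

from datetime import datetime

# Assumed sample value; verify the actual format on the page first.
timesource = '2017年05月13日09:08'
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')   # -> datetime(2017, 5, 13, 9, 8)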
--------------------------------------------------------------------------------------------------------------------------
In Chrome's Network panel, look through the JS requests to find the comments API link.
import requests
import json

newsurl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-fyfecvz1234039&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
comments = requests.get(newsurl)
comments.encoding = 'UTF-8'
jd = json.loads(comments.text.strip('var data='))   # remove the leading "var data=" so the rest parses as JSON
jd['result']['count']['total']                      # total comment count
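Note that str.strip('var data=') treats its argument as a set of characters to trim from both ends, not as a prefix; it happens to work on this response. A more defensive sketch that cuts everything before the first '=' instead (the sample string below is only a hypothetical stand-in for comments.text):

import json

# Hypothetical stand-in for comments.text from the request above.
jsonp_text = 'var data={"result": {"count": {"total": 42}}};'
payload = jsonp_text.split('=', 1)[1].rstrip('; \n')   # keep everything after the first '='
jd = json.loads(payload)
jd['result']['count']['total']   # -> 42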
---------------------------------------------------------------------------------------------
newsurl = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
newsid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')   # works here, but strip() treats its argument as a character set
print newsid

import re
m = re.search('doc-i(.*).shtml', newsurl)   # a regular expression is the more robust way to extract the id
newsid = m.group(1)
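The regular expression is preferred because lstrip/rstrip remove any characters from the given set, which can eat into the id itself. A hypothetical illustration (the URL below is made up only to show the pitfall):

import re

url = 'http://news.sina.com.cn/o/2017-05-13/doc-icdef123.shtml'   # hypothetical URL
url.split('/')[-1].rstrip('.shtml').lstrip('doc-i')   # -> 'ef123'   (the leading 'cd' is wrongly stripped)
re.search('doc-i(.+).shtml', url).group(1)            # -> 'cdef123' (correct)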
-----------------------------------------------------------------------------------------------------------
import re
import json
import requests

commentsurl = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js\
&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'

def getCommentCounts(newsurl):
    m = re.search('doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    comments = requests.get(commentsurl.format(newsid))   # substitute newsid into the {} placeholder
    jd = json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']

news = 'http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml'
getCommentCounts(news)
------------------------------------------------------------------------------
Final result
import requests
from bs4 import BeautifulSoup

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source')[0].text
    result['comments'] = getCommentCounts(newsurl)   # uses the function defined above
    return result

getNewsDetail('http://news.sina.com.cn/o/2017-05-13/doc-ifyfecvz1234039.shtml')
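As an optional sketch, the earlier pieces (source, time, article body) could be folded in as well; this only re-combines selectors already used above and assumes the same page structure. The function name getNewsDetailFull is hypothetical:

import requests
from bs4 import BeautifulSoup

def getNewsDetailFull(newsurl):   # hypothetical name for this extended variant
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    result['time'] = soup.select('.time-source')[0].contents[0].strip()
    result['article'] = ' '.join(p.text.strip() for p in soup.select('#artibody p')[:-1])
    result['comments'] = getCommentCounts(newsurl)   # defined earlier
    return result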
2. Developing a simple crawler in Python
http://www.imooc.com/video/10680
Fetching the page content
import urllib2

# request the page directly
response = urllib2.urlopen('http://www.baidu.com')
# status code; 200 means success
print response.getcode()
cont = response.read()
-------------------
import urllib2

url = 'http://www.baidu.com'
request = urllib2.Request(url)
request.add_data('a')                            # attaching data turns this into a POST request
request.add_header('User-Agent', 'Mozilla/5.0')  # pretend to be a browser
response = urllib2.urlopen(request)
-------------------------------------
import urllib2, cookielib

# create a cookie container
cj = cookielib.CookieJar()
# create an opener that handles cookies
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# install the opener into urllib2
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
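urllib2 and cookielib exist only in Python 2. A rough Python 3 equivalent of the three variants above, using urllib.request and http.cookiejar:

from urllib import request
from http import cookiejar

# 1. plain request
response = request.urlopen('http://www.baidu.com')
print(response.getcode())          # 200 means success
cont = response.read()

# 2. request with data and headers (adding data makes it a POST)
req = request.Request('http://www.baidu.com', data=b'a')
req.add_header('User-Agent', 'Mozilla/5.0')
response = request.urlopen(req)

# 3. request that keeps cookies
cj = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cj))
request.install_opener(opener)
response = request.urlopen('http://www.baidu.com')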
--------------------------------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re

html_doc = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# build a BeautifulSoup object from the HTML
soup = BeautifulSoup(html_doc,            # the document string
                     'html.parser',       # the parser
                     from_encoding='utf-8')

links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

link_node = soup.find('a', href="http://example.com/lacie")
print link_node.name, link_node['href'], link_node.get_text()

link_node = soup.find('a', href=re.compile(r'ill'))   # fuzzy match on the href with a regular expression
print link_node.name, link_node['href'], link_node.get_text()

p_node = soup.find('p', class_='title')   # class is a Python keyword, so BeautifulSoup uses class_ instead
print p_node.name, p_node.get_text()
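find() returns only the first match; find_all() applies the same filters and returns every match. A short sketch using the same html_doc, filtering by class:

# all <a> nodes whose class is "sister"
sisters = soup.find_all('a', class_='sister')
[node['id'] for node in sisters]   # -> ['link1', 'link2', 'link3']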
----------------------------------------------------------------------------------------
Final version
Crawler_main.py
import URLManager, HTMLDownloader, HTMLParser, HTMLOutputer

class CrawlerMain(object):
    def __init__(self):
        self.urls = URLManager.UrlManager()                # URL manager
        self.downloader = HTMLDownloader.HtmlDownloader()  # HTML downloader
        self.parser = HTMLParser.HtmlParser()              # HTML parser
        self.outputer = HTMLOutputer.HtmlOutputer()        # HTML outputer

    def crawl(self, root_url):
        count = 1                                          # number of pages crawled
        self.urls.add_new_url(root_url)                    # seed the manager with the entry URL
        while self.urls.has_new_url():                     # keep crawling while the URL pool is not empty
            try:
                new_url = self.urls.get_new_url()          # get the next URL to download
                print('crawl %d: %s' % (count, new_url))   # report which page is being crawled
                html_cont = self.downloader.download(new_url)                 # download the page
                new_urls, new_data = self.parser.hparse(new_url, html_cont)   # extract new URLs and page data
                self.urls.add_new_urls(new_urls)           # add the new URLs to the manager
                self.outputer.collect_data(new_data)       # collect the data
                if count == 10:
                    break
                count = count + 1
            except:
                print('Crawl Failed')
        self.outputer.output_html()                        # write the collected data out as an HTML file

if __name__ == '__main__':
    root_url = "http://baike.baidu.com/item/Python"        # entry URL
    obj_crawler = CrawlerMain()                            # create a crawler instance
    obj_crawler.crawl(root_url)                            # start crawling
HTMLDownloader.py
from urllib import request
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # open the page
        response = request.urlopen(url)
        if response.getcode() != 200:
            # return None on failure
            return None
        else:
            # return the page content on success
            return response.read().decode("utf-8")
URLManager.py
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            # only add the URL if it has not been added or visited before
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # add the whole list to the to-visit set
            self.new_urls.add(url)

    def has_new_url(self):
        # whether the URL pool still contains unvisited URLs
        return len(self.new_urls) != 0

    def get_new_url(self):
        # take one unvisited URL, mark it as visited, and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
HTMLParser.py
from bs4 import BeautifulSoup
import re
from urllib import parse

class HtmlParser(object):
    # page_url is the page's URL, html_cont is the downloaded page content
    def hparse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        # parse the page content with BeautifulSoup
        soup = BeautifulSoup(html_cont, 'html.parser')
        # URLs contained in the page
        new_urls = self._get_new_urls(page_url, soup)
        # the data we want to scrape from the page
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # fuzzy match with a regular expression
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # join into a full URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

HTMLOutputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
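As an optional variation (not part of the course code), the downloader could also be written with requests, which these notes already use in section 1. A sketch; the User-Agent value is only illustrative:

import requests

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # illustrative browser-like header; adjust or drop as needed
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if res.status_code != 200:
            return None
        res.encoding = 'utf-8'
        return res.text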