1. Install Python 3.3.
2. Install pip.
3. Install the bs4 and lxml packages.
Install bs4: pip install bs4 (or run the bs4 installer executable).
Install lxml: http://blog.youkuaiyun.com/qq_23438131/article/details/52222489
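Both packages can also be installed together with pip install beautifulsoup4 lxml. A quick import check to confirm the installation worked (a minimal sketch, not part of the original steps):

# check that bs4 and lxml are installed and importable
import bs4
import lxml.etree

print(bs4.__version__)          # BeautifulSoup version
print(lxml.etree.__version__)   # lxml version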
4. Set the encoding:
#coding:utf-8
Note: the following lines work only under Python 2; in Python 3, strings are Unicode by default, sys.setdefaultencoding no longer exists, and the coding declaration above is sufficient.
# Python 2 only:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
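Under Python 3 the practical concern is decoding the fetched bytes explicitly, as the spider below does in _get_html. A minimal sketch (the URL is just a placeholder):

import urllib.request

res = urllib.request.urlopen("http://news.baidu.com")  # placeholder URL
html = res.read().decode("utf-8")                      # bytes -> str; Python 3 makes this explicit
print(type(html))                                      # <class 'str'>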
5. Import bs4:
import bs4
from bs4 import BeautifulSoup as bs
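A minimal example of parsing an HTML fragment with the lxml backend and a CSS selector, the two bs4 features the spider below relies on (the HTML string is made up for illustration):

from bs4 import BeautifulSoup as bs

html = '<div id="page"><a href="/ns?word=test">next</a></div>'
soup = bs(html, "lxml")            # parse with the lxml backend
link = soup.select("#page")[0].a   # CSS select, then take the first <a>
print(link["href"])                # -> /ns?word=test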
6. Search Baidu News by keyword:
#coding:utf-8
import bs4
from bs4 import BeautifulSoup as bs
import urllib.parse
import urllib.request
import functools
import re
import time
from time import sleep
#import socket
#socket.setdefaulttimeout(3)
class BaiduSpider(object):
    def __init__(self, word, max_link):
        self._word = word
        self._max_link = max_link
        p = {"word": word}
        self._start_url = "http://www.news.baidu.com/ns?" + urllib.parse.urlencode(p)
    def _get_links(self):
        links = []
        links.append(self._start_url)
        try:
            soup = bs(self._get_html(self._start_url), "lxml")
            links_tag = soup.select("#page")
        except AttributeError as e_Att:
            print(e_Att)
            time.sleep(10)
            return self._get_links()
        if 0 != len(links_tag):
            links_tag = links_tag[0]
            # get the second page link
            for child in links_tag.children:
                attr = child.attrs
                if attr:
                    links.append("http://www.news.baidu.com" + attr["href"])
                    break
            # get 20~800 news links
            for i in range(20, 810, 10):
                link_temp = links[1].__str__()
                PatternObj = re.compile('&pn=(\\d)+?&')
                newLink = PatternObj.subn('&pn=' + str(i) + '&', link_temp)
                links.append(str(newLink[0]))
        end = self._max_link if self._max_link < len(links) else len(links)
        return links[:end]
    def _rightTime(self, summary):
        '''
        Check whether the time in the summary falls between 2016-06-01 and now.
        Example summaries:
            中国基金网 14小时前               (source + "14 hours ago")
            网易新闻 2016年08月12日 16:35    (source + absolute date)
        '''
        # convert 2016-06-01 to a timestamp
        try:
            startDate_str = '2016-06-01'
            startTime = time.mktime(time.strptime(startDate_str, '%Y-%m-%d'))
            a = summary.split()
            time_in_text = a[1]
            if '年' in time_in_text:
                time_in_text = time_in_text.split(" ")[0]
                time_in_text = time_in_text.replace("年", '-').replace("月", '-').replace("日", '')
                textTime = time.mktime(time.strptime(time_in_text, '%Y-%m-%d'))
                if (float(textTime)) <= (float(startTime)):
                    return False
            return True
        except ValueError:
            print(time_in_text)
    def _get_html(self, link):
        res = urllib.request.urlopen(link)
        return res.read().decode("utf-8")
    def _get_html_Content_post(self, link, f_error, retries):
        print(link, 'open the link using the post method:', time.time())
        html_content = ''
        try:
            request = urllib.request.Request(link)
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:  # if the spider hangs or another exception occurs, retry; up to 3 attempts
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries - 1)
        print('close:', time.time())
        return html_content
    def _get_html_Content(self, link, f_error, retries=2):
        print(link, '\n', 'open the link:', time.time())
        html_content = ''
        try:
            user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
            headers = {'User-Agent': user_agent}
            request = urllib.request.Request(link)
            request.add_header('User-Agent', user_agent)
            #timeout=2
            res = urllib.request.urlopen(request, timeout=3)
            html_content = res.read()
        except Exception as e:  # if the spider hangs or another exception occurs, retry by opening the page with the "post" helper
            print(link + '\n')
            print(e)
            f_error.write(link + '\n')
            if retries:
                return self._get_html_Content_post(link, f_error, retries=3)
        print('close:', time.time())
        return html_content
    def _get_content(self, content):
        # convert the bs4.element.NavigableString items to plain strings first,
        # strip the <em> highlight tags, then concatenate everything
        return functools.reduce(lambda x, y: x + y,
                                map(lambda x: x.replace("<em>", "").replace("</em>", ""),
                                    map(lambda x: x.string, content)))
    def _spiderDetail(self, link, f_error, Verbdic):
        '''
        input:  link, f_error
        output: paragraphs containing any of the target verbs in Verbdic
        The URLs collected in the first step point to news pages hosted on many different sites,
        so there is no single page structure to rely on. Most news sites put the article text
        inside <p> tags, so the content is extracted as follows.
        '''
        html_content = self._get_html_Content(link, f_error, retries=2)
        contents = ''
        if html_content != '':
            soup = bs(html_content, "lxml")
            #reg=u".+?带领"
            #Res = re.compile(reg)
            #contents = soup.findAll(name="p", text=Res)
            contents = '<p>'
            iter = []
            nodes_p = soup.find_all(name='p')
            for n in nodes_p:
                p_cont = n.get_text(strip=True)
                for ver in Verbdic:
                    if ver in p_cont:
                        iter.append(p_cont)
                        break
            contents = contents.join(iter)
        return contents
    def _spider(self, f, f_error, Verbdic):
        '''
        On the Baidu News result pages, search for the keyword and collect
        each item's title, source and time, link, and the text of the linked page.
        '''
        total_links = self._get_links()
        print(total_links)
        for i, l in enumerate(total_links):
            print("Page {0}".format(i + 1))
            soup = bs(self._get_html(l), "lxml")
            # find the root node of the left-hand content area
            left_div = soup.select("#content_left")[0]
            # each qualifying child div is one news item in the result list
            for child_div in left_div.children:
                if isinstance(child_div, bs4.element.Tag) and child_div.div and child_div.div.get('class') and 'result' in child_div.div['class']:
                    base_div = child_div
                    childs = base_div.children
                    for child in childs:
                        title = child.select(".c-title")[0]
                        summary = ""
                        summary = summary.join(self._get_content(child.select(".c-summary")[0].p.contents))
                        a_link = title.a["href"]
                        titlename = ""
                        titlename = titlename.join(self._get_content(title.a.contents))
                        # crawl the news content page
                        content = ''
                        if self._rightTime(summary):
                            content = self._spiderDetail(a_link, f_error, Verbdic)
                        # field labels: 标题=title, 来源及时间=source and time, 链接=link, 新闻内容=article text
                        f.write('标题:' + titlename + '\t来源及时间:' + summary +
                                '\t链接:' + a_link +
                                '\t新闻内容:' + content + "\n")

    def start(self, f, f_error, Verbdic):
        self._spider(f, f_error, Verbdic)
if '__main__' == __name__:
    '''
    f       stores the crawl results
    f_error stores the links whose content pages failed to load
    '''
    Verbdic = [
        '协同', '协助'   # target verbs: '协同' (cooperate), '协助' (assist)
    ]
    with open("links2.txt", 'wt', encoding='utf-8') as f, open("logError2.txt", 'wt') as f_error, open("overVerb.txt", 'wt') as f_over:
        for keyword in Verbdic:
            baidu_spider = BaiduSpider(keyword, 800)
            baidu_spider.start(f, f_error, Verbdic)
            f_over.write(keyword + '\n')
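The trickiest step in _get_links is rewriting the &pn= offset of the second-page URL to generate the remaining page links. A minimal sketch of just that step, using the same regex as above (the sample URL is made up):

import re

link_temp = 'http://www.news.baidu.com/ns?word=test&pn=10&cl=2'   # hypothetical second-page link
PatternObj = re.compile('&pn=(\\d)+?&')
for i in range(20, 50, 10):
    newLink = PatternObj.subn('&pn=' + str(i) + '&', link_temp)   # subn returns (new_string, count)
    print(newLink[0])
# prints the same URL with &pn=20&, &pn=30&, &pn=40&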
7. Crawler problems:
1. The Python program hangs: possible causes include anti-crawler measures on the target site, using the wrong GET/POST method, network problems, and so on.
Solution 1: pretend to be a browser:
user_agent='Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
headers={'User-Agent':user_agent}
request = urllib.request.Request(link)
request.add_header('User-Agent', user_agent)
Solution 2: set a timeout and retry:
try:
    request = urllib.request.Request(link)
    res = urllib.request.urlopen(request, timeout=3)
    html_content = res.read()
except Exception as e:  # if the spider hangs or another exception occurs, retry by opening the page with the "post" helper
    print(link + '\n')
    print(e)
    f_error.write(link + '\n')
    if retries:
        return self._get_html_Content_post(link, f_error, retries=3)
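The fragment above depends on the surrounding class (retries, f_error, self). A self-contained sketch of the same timeout-and-retry idea, with a short sleep between attempts (the function name and the 3-attempt limit are my own choices, not from the original code):

import time
import urllib.request

def fetch_with_retries(link, retries=3, timeout=3):
    # try to open the link up to `retries` times, sleeping briefly between attempts
    for attempt in range(retries):
        try:
            request = urllib.request.Request(link)
            request.add_header('User-Agent', 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)')
            res = urllib.request.urlopen(request, timeout=timeout)
            return res.read()
        except Exception as e:
            print(link, 'attempt', attempt + 1, 'failed:', e)
            time.sleep(2)
    return b''  # give up after the last attempt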
Solution 3: if the simulated-browser (GET) request still cannot open the page, fall back to the "post" helper:
def _get_html_Content_post(self, link, f_error, retries):
    print(link, 'open the link using the post method:', time.time())
    html_content = ''
    try:
        request = urllib.request.Request(link)
        res = urllib.request.urlopen(request, timeout=3)
        html_content = res.read()
    except Exception as e:  # if the spider hangs or another exception occurs, retry; up to 3 attempts
        print(link + '\n')
        print(e)
        f_error.write(link + '\n')
        if retries:
            return self._get_html_Content_post(link, f_error, retries - 1)
    print('close:', time.time())
    return html_content
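Note that _get_html_Content_post above still issues a GET request: urllib.request.urlopen only switches to POST when a data payload is supplied. If a page really requires POST, a minimal sketch looks like this (the form field name 'word' is a placeholder; the actual fields depend on the target site):

import urllib.parse
import urllib.request

def open_with_post(link, timeout=3):
    # passing a `data` body makes urlopen send a POST instead of a GET
    data = urllib.parse.urlencode({'word': 'placeholder'}).encode('utf-8')
    request = urllib.request.Request(link, data=data)
    res = urllib.request.urlopen(request, timeout=timeout)
    return res.read()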