1. Check the robots.txt file
Some websites define a robots.txt file so that crawlers can learn what restrictions the site places on crawling.
Enter http://www.youkuaiyun.com/robots.txt in a browser to view the contents of robots.txt:
User-agent: *
Disallow: /scripts
Disallow: /public
Disallow: /css/
Disallow: /images/
Disallow: /content/
Disallow: /ui/
Disallow: /js/
Disallow: /scripts/
Disallow: /article_preview.html*
Sitemap: http://www.youkuaiyun.com/article/sitemap.txt
This robots.txt tells every user agent not to crawl the paths listed above; if you crawl them anyway, your IP may be banned for a few minutes or so.
The last line defines the Sitemap file.
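Beyond viewing it in a browser, a minimal sketch for fetching the same robots.txt from Python (the URL is the one above; nothing else is assumed):

from urllib import request

# Fetch and print the raw robots.txt contents
robots_url = "http://www.youkuaiyun.com/robots.txt"
print(request.urlopen(robots_url).read().decode('utf-8'))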
2. Check the Sitemap file
Enter the sitemap address above in a browser to view the site map. It contains nothing but URLs, and this file may be incomplete or not kept up to date.
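A quick sketch for pulling the URL list out of that sitemap file, assuming it is a plain text file with one URL per line:

from urllib import request

sitemap_url = "http://www.youkuaiyun.com/article/sitemap.txt"
# Assumed format: plain text, one URL per line
body = request.urlopen(sitemap_url).read().decode('utf-8')
links = [line.strip() for line in body.splitlines() if line.strip()]
print(len(links), links[:5])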
3. Identify the technology used by the website
builtwith is a tool for checking what kind of technology a website is built with.
Installation:
pip install builtwith
>>> import builtwith
>>> builtwith.parse("http://www.youkuaiyun.com/")
{'programming-languages': ['Lua'], 'web-servers': ['OpenResty', 'Nginx'], 'javascript-frameworks': ['jQuery']}
>>> builtwith.parse("http://news.baidu.com/")
{'javascript-frameworks': ['RequireJS', 'jQuery UI', 'jQuery'], 'javascript-graphics': ['D3'], 'web-servers': ['Apache']}
>>> builtwith.parse("https://www.baidu.com/")
{}
4. Find the owner of the website
The WHOIS tool
Installation:
pip install python-whois
>>> import whois
>>> print(whois.whois("youkuaiyun.com"))
{
"status": "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
"registrar": "NETWORK SOLUTIONS, LLC.",
"address": "B3-2-1 ZHaowei Industry Park",
"emails": [
"abuse@web.com",
"Jiangtao@youkuaiyun.com"
],
"state": "Beijing",
"whois_server": "whois.networksolutions.com",
"domain_name": "youkuaiyun.com",
"org": "Beijing Chuangxin Lezhi Co.ltd",
"updated_date": [
"2017-03-10 00:00:00",
"2017-03-10 00:53:11"
],
"country": "CN",
"city": "Beijng",
"name": "Beijing Chuangxin Lezhi Co.ltd",
"expiration_date": [
"2020-03-11 00:00:00",
"2020-03-11 04:00:00"
],
"zipcode": "100016",
"creation_date": [
"1999-03-11 00:00:00",
"1999-03-11 05:00:00"
],
"dnssec": "Unsigned",
"name_servers": [
"NS3.DNSV3.COM",
"NS4.DNSV3.COM"
],
"referral_url": "http://networksolutions.com"
}
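The result can also be used programmatically; a small sketch, assuming the python-whois result object exposes the fields above as attributes:

import whois

# Look up the domain and read a few individual fields
# (attribute access on the result object is assumed here)
w = whois.whois("youkuaiyun.com")
print(w.registrar)
print(w.emails)
print(w.expiration_date)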
5. Download web pages
# coding=utf-8
import urllib
from urllib import request

def download(url):
    print("Downloading: ", url)
    try:
        html = request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
    return html

url1 = "https://www.baidu1111.com/"
print(download(url1))
url2 = "https://www.baidu.com/"
print(download(url2))
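Note that urlopen(...).read() returns bytes; to work with the page as text you usually need to decode it first. A minimal sketch, assuming the page is UTF-8 encoded:

page = download(url2)
# Decode the raw bytes into a string (UTF-8 is an assumption)
text = page.decode('utf-8') if page is not None else ''
print(text[:200])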
Retrying downloads
5xx errors are server-side problems, so the download should be retried; by default we retry twice.
# coding=utf-8
import urllib
from urllib import request

def download(url, num_retries=2):
    print("Downloading: ", url)
    try:
        html = request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry on 5xx HTTP errors
                return download(url, num_retries-1)
    return html

print(download("http://httpstat.us/500"))
Setting a user agent
# coding=utf-8
import urllib
from urllib import request

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

print(download("http://httpstat.us/500"))
6. Link crawler
# coding=utf-8
import urllib
from urllib import request
import re

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

### print(download("http://httpstat.us/500"))

def link_crawler(seed_url, link_regex):
    '''Crawl links whose URL matches the regular expression'''
    crawl_queue = [seed_url]
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            print("link", link)
            if re.search(link_regex, link):  # re.match only matches from the start of the string, so it would return None here
                crawl_queue.append(link)
                print("append", link)

def get_links(html):
    '''Return a list of links found in the page'''
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html.decode('utf-8'))

link_crawler('http://example.webscraping.com', '/index')
# 1. TypeError: cannot use a string pattern on a bytes-like object
# In Python 3 the fix is to decode the response first: html = html.decode('utf-8')
Converting to absolute paths
# coding=utf-8
import urllib
from urllib import request
import re

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

### print(download("http://httpstat.us/500"))

from urllib import parse

def link_crawler(seed_url, link_regex):
    '''Crawl links whose URL matches the regular expression'''
    crawl_queue = [seed_url]
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):  # re.match only matches from the start of the string, so it would return None here
                link = parse.urljoin(seed_url, link)
                crawl_queue.append(link)
                print("append", link)

def get_links(html):
    '''Return a list of links found in the page'''
    if html is None:
        return []
    else:
        webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        return webpage_regex.findall(html.decode('utf-8'))

link_crawler('http://example.webscraping.com', '/index')
# 1. TypeError: cannot use a string pattern on a bytes-like object
# In Python 3 the fix is to decode the response first: html = html.decode('utf-8')
This version uses urllib.parse.urljoin(seed_url, link) to build absolute URLs.
Deduplication
def link_crawler(seed_url, link_regex):
    '''Crawl links whose URL matches the regular expression'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):  # re.match only matches from the start of the string, so it would return None here
                link = parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
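Called the same way as the earlier versions:

link_crawler('http://example.webscraping.com', '/index')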
Blocking URLs disallowed by robots.txt
import urllib.parse
from urllib import robotparser

parser = robotparser.RobotFileParser()
url = "http://www.youkuaiyun.com/"
parser.set_url(urllib.parse.urljoin(url, "robots.txt"))
parser.read()

PATHS = {
    '/',
    '/scripts/',
    'content/',
    '/js',
    '/lock',
}
for path in PATHS:
    url1 = urllib.parse.urljoin(url, path)
    print("%6s: %s" % (parser.can_fetch('PyMOTW', url1), url1))
robotparser parses the robots.txt file; the parser.can_fetch() method tells you whether the given User-agent is allowed to fetch that URL.
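The same check can be wired into the link crawler so disallowed URLs are skipped before downloading. A minimal sketch, assuming the download() and get_links() functions defined above:

from urllib import parse, robotparser
import re

def link_crawler(seed_url, link_regex, user_agent='wswp'):
    '''Crawl matching links, but skip URLs that robots.txt disallows'''
    rp = robotparser.RobotFileParser()
    rp.set_url(parse.urljoin(seed_url, 'robots.txt'))
    rp.read()
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print("Blocked by robots.txt:", url)
            continue
        # download() and get_links() are the helpers defined earlier
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):
                link = parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)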
Setting a proxy
This code has not been verified; check it when you actually need it.
url = "https://www.baidu.com"
headers = {'User-agent': "user_agent"}
reqst = urllib.request.Request(url, headers=headers)
proxy = ''  # fill in the proxy address, e.g. 'host:port'
opener = urllib.request.build_opener()
proxy_names = {urllib.parse.urlparse(url).scheme: proxy}
opener.add_handler(urllib.request.ProxyHandler(proxy_names))
response = opener.open(reqst)
Download throttling
import datetime
import time
import urllib.parse

class Throttle:
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()

Usage:
throttle = Throttle(delay)
......
throttle.wait(url)
html = download(url, headers, proxy=proxy, num_retries=num_retries)
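Putting it together, a sketch of where the wait call sits inside the crawl loop, assuming the download() and get_links() helpers from above (the 5-second delay is only an illustrative value):

throttle = Throttle(5)  # at most one request per domain every 5 seconds (illustrative)

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)  # sleep if this domain was hit too recently
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):
                link = parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)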
Some attributes of urllib.parse.urlparse(url):
>>> from urllib.parse import urlparse
>>> o = urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
>>> o
ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
>>> o.port
80
>>> o.geturl()
'http://www.cwi.nl:80/%7Eguido/Python.html'
>>> o.netloc
'www.cwi.nl:80'
>>> o.hostname
'www.cwi.nl'
Time:
>>> import datetime
>>> datetime.datetime
<class 'datetime.datetime'>
>>> datetime.datetime.now
<built-in method now of type object at 0x0000000065C3C3E0>
>>> datetime.datetime.now()
datetime.datetime(2017, 6, 30, 16, 12, 16, 559991)
>>> last_accessed = datetime.datetime.now()
>>> datetime.datetime.now() - last_accessed
datetime.timedelta(0, 15, 470000)
>>> (datetime.datetime.now() - last_accessed).seconds
39
Avoiding crawling too deep
To avoid crawling endlessly, set a maximum depth.
seen = {}
...
depth = seen[url]
if depth != max_depth:
    for link in get_links(html):
        if link not in seen:
            seen[link] = depth + 1
            crawl_queue.append(link)
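A fuller sketch of the crawler with a depth limit, assuming the download() and get_links() helpers defined above (max_depth=2 is only an illustrative default):

def link_crawler(seed_url, link_regex, max_depth=2):
    '''Crawl matching links, but stop descending past max_depth'''
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # track the depth at which each URL was found
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.search(link_regex, link):
                    link = parse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

link_crawler('http://example.webscraping.com', '/index', max_depth=2)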