Web Crawlers --- 1. Introduction

1. Check the robots.txt file

Some websites define a robots.txt file to tell crawlers what restrictions apply when crawling the site.

Enter http://www.youkuaiyun.com/robots.txt in a browser to view the contents of robots.txt:

User-agent: * 
Disallow: /scripts 
Disallow: /public 
Disallow: /css/ 
Disallow: /images/ 
Disallow: /content/ 
Disallow: /ui/ 
Disallow: /js/ 
Disallow: /scripts/ 
Disallow: /article_preview.html* 

Sitemap: http://www.youkuaiyun.com/article/sitemap.txt 

This robots.txt tells all user agents (User-agent: *) not to crawl the paths listed above; a crawler that ignores this may, for example, have its IP blocked for a few minutes.

The last line points to the site's Sitemap file.
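
For reference, the same file can also be fetched programmatically; a minimal sketch using urllib (same URL as above):

from urllib import request

# Fetch and print a site's robots.txt
robots_url = "http://www.youkuaiyun.com/robots.txt"
print(request.urlopen(robots_url).read().decode('utf-8'))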


2. Check the Sitemap (site map) file

Open the sitemap address above in a browser to view the site map; it is simply a list of URLs. Note that this file may be incomplete or not kept up to date.
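
A minimal sketch for extracting those URLs, assuming the sitemap stays a plain-text file with one URL per line as described above (an XML sitemap would need its <loc> tags parsed instead):

from urllib import request

# Download the plain-text sitemap and collect its URLs
sitemap_url = "http://www.youkuaiyun.com/article/sitemap.txt"
sitemap = request.urlopen(sitemap_url).read().decode('utf-8')
urls = [line.strip() for line in sitemap.splitlines() if line.strip()]
print(len(urls), "URLs in the sitemap")
print(urls[:5])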


3. Identify the technologies a website uses

builtwith is a tool for checking which technologies a website is built with.

Installation:

pip install builtwith


Inspect the technologies of a few sites:

>>> import builtwith
>>> builtwith.parse("http://www.youkuaiyun.com/")
{'programming-languages': ['Lua'], 'web-servers': ['OpenResty', 'Nginx'], 'javascript-frameworks': ['jQuery']}
>>> builtwith.parse("http://news.baidu.com/")
{'javascript-frameworks': ['RequireJS', 'jQuery UI', 'jQuery'], 'javascript-graphics': ['D3'], 'web-servers': ['Apache']}
>>> builtwith.parse("https://www.baidu.com/")
{}

4. Find the owner of a website

The WHOIS tool

Installation:

pip install python-whois

>>> import whois
>>> print(whois.whois("youkuaiyun.com"))
{
  "status": "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
  "registrar": "NETWORK SOLUTIONS, LLC.",
  "address": "B3-2-1 ZHaowei Industry Park",
  "emails": [
    "abuse@web.com",
    "Jiangtao@youkuaiyun.com"
  ],
  "state": "Beijing",
  "whois_server": "whois.networksolutions.com",
  "domain_name": "youkuaiyun.com",
  "org": "Beijing Chuangxin Lezhi Co.ltd",
  "updated_date": [
    "2017-03-10 00:00:00",
    "2017-03-10 00:53:11"
  ],
  "country": "CN",
  "city": "Beijng",
  "name": "Beijing Chuangxin Lezhi Co.ltd",
  "expiration_date": [
    "2020-03-11 00:00:00",
    "2020-03-11 04:00:00"
  ],
  "zipcode": "100016",
  "creation_date": [
    "1999-03-11 00:00:00",
    "1999-03-11 05:00:00"
  ],
  "dnssec": "Unsigned",
  "name_servers": [
    "NS3.DNSV3.COM",
    "NS4.DNSV3.COM"
  ],
  "referral_url": "http://networksolutions.com"
}
>>> 

5. Download a web page

# coding=utf-8

import urllib
from urllib import request


def download(url):
    print("Downloading: ", url)
    try:
        html = request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
    return html

url1 = "https://www.baidu1111.com/"
print(download(url1))

url2 = "https://www.baidu.com/"
print(download(url2))



Retrying downloads

5xx status codes indicate a server-side problem, so it is worth retrying the download; by default we retry up to two times.

# coding=utf-8

import urllib
from urllib import request


def download(url, num_retries=2):
    print("Downloading: ", url)
    try:
        html = request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries-1)
    return html

print(download("http://httpstat.us/500"))


Setting a user agent

# coding=utf-8

import urllib
from urllib import request


def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

print(download("http://httpstat.us/500"))


6. Link crawler

# coding=utf-8

import urllib
from urllib import request
import re

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

### print(download("http://httpstat.us/500"))


def link_crawler(seed_url, link_regex):
    '''Crawl link URLs that match the regex'''
    crawl_queue = [seed_url]
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            print("link",link)
            if re.search(link_regex, link):  # re.match only matches at the start of the string, so it would return None here
                crawl_queue.append(link)
                print("append",link)
                
def get_links(html):
    '''Return a list of links found in the page'''
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html.decode('utf-8'))

link_crawler('http://example.webscraping.com', '/index')

# 1. TypeError: cannot use a string pattern on a bytes-like object
# In Python 3, fix this by decoding the response bytes first: html = html.decode('utf-8')

Converting to absolute URLs

# coding=utf-8

import urllib
from urllib import request
import re

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

### print(download("http://httpstat.us/500"))

from urllib import parse

def link_crawler(seed_url, link_regex):
    '''Crawl link URLs that match the regex'''
    crawl_queue = [seed_url]
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):  # re.match only matches at the start of the string, so it would return None here
                link = parse.urljoin(seed_url, link)
                crawl_queue.append(link)
                print("append",link)
                
def get_links(html):
    '''Return a list of links found in the page'''
    if html is None:
        return []
    else:
        webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        return webpage_regex.findall(html.decode('utf-8'))

link_crawler('http://example.webscraping.com', '/index')

Use urllib.parse.urljoin(seed_url, link) to turn relative links into absolute URLs.
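
For example, urljoin resolves a relative link against the seed URL and leaves an already absolute link untouched (the paths are just illustrative):

>>> from urllib.parse import urljoin
>>> urljoin('http://example.webscraping.com', '/index/1')
'http://example.webscraping.com/index/1'
>>> urljoin('http://example.webscraping.com/index/1', '/view/Afghanistan-1')
'http://example.webscraping.com/view/Afghanistan-1'
>>> urljoin('http://example.webscraping.com', 'http://other.com/page')
'http://other.com/page'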

Deduplication

def link_crawler(seed_url, link_regex):
    '''Crawl link URLs that match the regex'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):  # re.match only matches at the start of the string, so it would return None here
                link = parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
                

Respecting the URLs disallowed by robots.txt

from urllib import robotparser
import urllib.parse

parser = robotparser.RobotFileParser()
url = "http://www.youkuaiyun.com/"
parser.set_url(urllib.parse.urljoin(url, "robots.txt"))
parser.read()

PATHS = {
    '/',
    '/scripts/',
    'content/',
    '/js',
    '/lock'
    }

for path in PATHS:
    url1 = urllib.parse.urljoin(url, path)
    print("%6s: %s" %(parser.can_fetch('PyMOTW', url1), url1))


robotparser parses the robots.txt file, and parser.can_fetch() tells you whether the given User-agent is allowed to fetch that URL.
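
A minimal sketch of using this check as a guard before downloading, reusing parser and url from the snippet above and the download() defined earlier; the target path is just illustrative:

user_agent = 'PyMOTW'
target = urllib.parse.urljoin(url, '/scripts/example.js')   # illustrative URL under a disallowed path
if parser.can_fetch(user_agent, target):
    html = download(target, user_agent=user_agent)
else:
    print("Blocked by robots.txt:", target)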


Setting a proxy

This snippet has not been verified; verify it before relying on it.

import urllib.request
import urllib.parse

url = "https://www.baidu.com"
headers = {'User-agent': "user_agent"}
reqst = urllib.request.Request(url, headers=headers)
proxy = ''                            # proxy address, e.g. 'http://127.0.0.1:8080'
opener = urllib.request.build_opener()
# map the URL scheme (http/https) to the proxy address
proxy_names = {urllib.parse.urlparse(url).scheme: proxy}
opener.add_handler(urllib.request.ProxyHandler(proxy_names))
response = opener.open(reqst)

Throttling downloads

import datetime
import time
import urllib.parse

class Throttle:
    '''Add a delay between downloads to the same domain'''
    def __init__(self, delay):
        self.delay = delay            # minimum seconds between requests to the same domain
        self.domains = {}             # domain -> timestamp of the last access
    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()

Usage (a fragment from a fuller crawler whose download() takes headers and a proxy; a sketch using this post's functions follows the fragment):

throttle = Throttle(delay)
......
throttle.wait(url)
html = download(url, headers, proxy=proxy, num_retries=num_retries)
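
A minimal sketch of the same idea using only the Throttle class and the download() defined in this post:

throttle = Throttle(delay=1)       # at least 1 second between requests to the same domain

for url in ["https://www.baidu.com/", "https://www.baidu.com/robots.txt"]:
    throttle.wait(url)             # sleeps if this domain was hit less than `delay` seconds ago
    html = download(url)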

Some attributes returned by urllib.parse.urlparse(url):

>>> from urllib.parse import urlparse
>>> o = urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
>>> o
ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
>>> o.port
80
>>> o.geturl()
'http://www.cwi.nl:80/%7Eguido/Python.html'
>>> o.netloc
'www.cwi.nl:80'
>>> o.hostname
'www.cwi.nl'
Working with datetime:

>>> import datetime
>>> datetime.datetime
<class 'datetime.datetime'>
>>> datetime.datetime.now
<built-in method now of type object at 0x0000000065C3C3E0>
>>> datetime.datetime.now()
datetime.datetime(2017, 6, 30, 16, 12, 16, 559991)
>>> last_accessed = datetime.datetime.now()
>>> datetime.datetime.now() - last_accessed
datetime.timedelta(0, 15, 470000)
>>> (datetime.datetime.now() - last_accessed).seconds
39

Limiting crawl depth

To keep the crawler from following links forever, track each URL's depth and stop adding links once the maximum depth is reached (fragment below; a complete sketch follows it).

seen = {}
...
depth = seen[url]
if depth != max_depth:
    for link in get_links(html):
        if link not in seen:
            seen[link] = depth + 1
            crawl_queue.append(link)
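
Putting it together, a minimal sketch of the link crawler from this post with a depth limit (assuming download() and get_links() as defined above):

from urllib import parse
import re

def link_crawler(seed_url, link_regex, max_depth=2):
    '''Crawl links matching the regex, never going deeper than max_depth from the seed'''
    crawl_queue = [seed_url]
    seen = {seed_url: 0}                  # url -> depth at which it was first found
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.search(link_regex, link):
                    link = parse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)

# link_crawler('http://example.webscraping.com', '/index', max_depth=1)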


