# Download every link of an entire web site with Python; suitable for sites with directory browsing enabled.

# Copyright (C) 2012 xxx(xxx) Co., LTD.
# All rights reserved.
#
# Developed by RD BIOS Team.
#
# Authors: perry <perry.peng@cn.xxx.com>
#
# Date: January 11, 2012
#
# Project Name: WEBDOWN
# Project Version: 1.0.0
#
# Project description:
#
# History:
#    Date        Author      Description
#    -----------------------------------------------------------------
#    2012/01/11  perry       created.
#
# Note:
#  xxx

__version__ = "1.0.0"

import os, sys, io

import sqlite3

try:
  # Python 2.7
  from urlparse import urlparse
  from urllib import (
    unquote,
    url2pathname)

except ImportError:
  # Python 3.x (the original branch forgot unquote/url2pathname,
  # which made the script unusable on Python 3)
  from urllib.parse import urlparse, unquote
  from urllib.request import url2pathname

try:
  # Python 2.7
  from HTMLParser import HTMLParser
except ImportError:
  # Python 3.x
  from html.parser import HTMLParser

try:
  # Python 2.7
  from httplib import HTTPConnection
except ImportError:
  # Python 3.x
  from http.client import HTTPConnection

import time
import threading

class DownloadThread(threading.Thread):
  """Worker thread that downloads every pending file recorded by a
  Webdown instance, polling until the crawl is over and the queue is
  drained."""

  def __init__(self, wd):
    # wd: the Webdown instance acting as the shared work queue.
    self.wd = wd
    threading.Thread.__init__(self)

  def run(self):
    # Bug fix: the original referenced the global `wd` instead of
    # self.wd, coupling the thread to the __main__ script.
    http = HTTPConnection(self.wd.url)

    while True:
      s = self.wd.get1()
      if s is None:
        # No pending file: stop once the producer (Webdown.go) has
        # cleared its `finished` flag, otherwise wait for more work.
        if not self.wd.finished:
          break
        time.sleep(1)
        continue

      # Map the percent-encoded URL path onto a local path below the
      # current working directory.
      # NOTE(review): encode(sys.stdin.encoding) before unquote() is
      # Python-2 specific; on Python 3 unquote() expects str — confirm
      # before porting.
      x = unquote(s.encode(sys.stdin.encoding))
      p = os.getcwd() + url2pathname(x)

      if not os.path.exists(p):
        try:
          http.close()
          http.request('GET', s)
          r = http.getresponse()
          if r.status == 200:
            print('%s %s' % (r.getheader('content-length', 0), s))

            # Bug fix: open the file inside the try block; previously a
            # failed open() raised NameError on `f` in the finally
            # clause.
            f = open(p, 'wb')
            try:
              f.write(r.read())
            finally:
              f.close()
        except Exception:
          # Best effort: report the failure and move on.
          print('FAIL %s' % s)
      else:
        print('EXISTS %s' % s)

      # Mark the file as processed so it is not picked up again.
      self.wd.set1(s, 1)

    print('exit...')

class Webdown(HTMLParser):
  """HTML-parser driven crawler for directory-listing pages.

  Every <a href> found while walking the directory tree is recorded in
  an in-memory SQLite table that is shared (under self.lock) with the
  DownloadThread worker consuming the files.
  """

  # True while go() is still discovering files; the download worker
  # keeps polling as long as this flag is set.
  finished = False

  def __init__(self, url):
    try:
      url_info = urlparse(url, 'http')
      self.url = url_info.netloc
      self.http = HTTPConnection(url_info.netloc)
      # check_same_thread=False: the connection is shared with the
      # worker thread; all access is serialized through self.lock.
      self.dbc = sqlite3.connect(':memory:', check_same_thread = False)
      self.lock = threading.Lock()
      self.path = url_info.path
      self.dbc.execute('''
        create table if not exists download (
          id integer primary key autoincrement,
          name text,
          url text,
          path text,
          local_path text,
          is_dir integer default 0,
          is_searched integer default 0,
          is_queried integer default 0,
          is_download integer default 0)''')

      # Normalize the start path so it ends with exactly one '/'.
      name = self.path
      while name.endswith('/'):
        name = name[:-1]
      self.path = name + '/'

      # Use the last path component as the root entry's display name.
      i = name.rfind('/')
      if i > 0:
        name = name[i + 1:]

      # Seed the queue with the root directory (is_dir=1).
      self.puturl(name, self.url, self.path, os.getcwd(), 1)
    except Exception:
      # Best effort: the object is left partially initialized; callers
      # only see this message, as in the original design.
      print('WebDown initialize failure...')

    HTMLParser.__init__(self)

  def handle_starttag(self, tag, attrs):
    """Record each plain <a href="..."> link found in a listing page."""
    # Only bare anchors with a single href attribute are of interest.
    if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href':
      return

    href = attrs[0][1]
    if href == '../':     # ignore the parent folder.
      return

    if href == './':      # ignore the current folder.
      return

    if href.startswith('?'):   # ignore sort/query links.
      return

    if href.startswith('~'):   # ignore user-home style links.
      return

    is_dir = 0
    name = href
    searched = 1

    # A trailing '/' marks a sub-directory that still needs searching.
    if name.endswith('/'):
      name = name[:-1]
      searched = 0
      is_dir = 1

    self.puturl(name, self.url, self.path + href, '', is_dir, searched)

  def puturl(self, name, url, path, lpath='', isdir=0, searched=0):
    """Insert one discovered entry into the download table."""
    with self.lock:
      self.dbc.execute('insert into download (name,url,path,local_path,is_dir,is_searched) values(?,?,?,?,?,?)', (
        name,url, path, lpath, isdir, searched))

  def set1(self, path, status=0):
    """Mark a file entry as queried (handed to the download worker)."""
    with self.lock:
      self.dbc.execute('update download set is_queried=? where path=?', (status, path))

  def get1(self):
    """Return the path of one not-yet-queried file, or None."""
    with self.lock:
      r = self.dbc.execute('select path from download where is_dir=0 and is_queried=0 limit 1')
      s = r.fetchone()

    if s is not None:
      return s[0]
    return s

  def set2(self, path, status=0):
    """Mark a directory entry as searched (its listing was parsed)."""
    with self.lock:
      self.dbc.execute('update download set is_searched=? where path=?', (status, path))

  def get2(self, url):
    """Return one unsearched directory path (with trailing '/'), or None."""
    with self.lock:
      r = self.dbc.execute('select path from download where url=? and is_searched=0 and is_dir=1 limit 1', (url,))
      s = r.fetchone()

    if s is not None and s[0] is not None:
      s = s[0]
      if not s.endswith('/'):
        s = s + '/'
    return s

  def set3(self, path, status=0):
    """Mark a file entry as downloaded.

    Bug fix: the original acquired self.lock and never released it,
    deadlocking the very next database access.
    """
    with self.lock:
      self.dbc.execute('update download set is_download=? where path=?', (status, path))

  def get3(self):
    """Return the path of one not-yet-downloaded file, or None."""
    with self.lock:
      r = self.dbc.execute('select path from download where is_dir=0 and is_download=0 limit 1')
      s = r.fetchone()

    if s is not None:
      return s[0]
    return s

  def go(self):
    """Walk the remote directory tree breadth-first while a
    DownloadThread consumes the discovered files concurrently."""
    self.finished = True
    q = DownloadThread(self)
    q.start()
    while self.path is not None:
      try:
        # Create the local directory mirroring the remote path.
        # NOTE(review): encode()/unquote() this way is Python-2
        # specific — confirm before porting to Python 3.
        s = unquote(self.path.encode(sys.stdin.encoding))
        p = os.getcwd() + url2pathname(s)
        if not os.path.exists(p):
          os.makedirs(p)
      except Exception:
        pass  # best effort: skip directories we cannot create

      try:
        # Fetch the directory listing and feed it to the HTML parser;
        # handle_starttag() records every link it contains.
        self.http.close()
        self.http.request('GET', self.path)
        r = self.http.getresponse()
        if r.status == 200:
          self.reset()
          self.feed(r.read())
      except Exception:
        pass  # best effort: an unreadable listing is simply skipped

      self.set2(self.path, 1)
      self.path = self.get2(self.url)

    self.finished = False
    q.join()

if __name__ == "__main__":
  if len(sys.argv) > 1:
    url = sys.argv[0]
    url = url.strip()
  else:
    # http://www.20cn.net/share/alalmn
    # http://www.gaby.de/ftp/pub/win3x/archive/
    print('You must provide a valid Url.\n')
    print('Usage:\n  Python %s target' % os.path.basename(sys.argv[0]))
    print('    target   --- specify a URL to donwload.\n')
    url = ''
    while len(url) == 0:
      if sys.version.startswith('3.2'):
        url = input('Please enter a URL:')
      else:
        url = raw_input('Please enter a URL:')
      url = url.strip()
  wd = Webdown(url)
  wd.go()

