Crawling the PostgreSQL Source Code (a Web Crawler)

This article presents a Python web crawler that parses HTML pages, picks out specific links, and downloads the linked files into a local directory tree. A custom HTML parser filters the a tags that point at the target files, and the crawler follows those links to fetch the files. The script also includes error handling and logging.
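Before the full script, here is a minimal sketch of its core technique: subclass html.parser.HTMLParser and override handle_starttag to collect the href values of a tags that pass a filter. The LinkCollector name and the sample predicate are illustrative only; the real script defines two hard-coded parser classes instead.

from html.parser import HTMLParser


class LinkCollector(HTMLParser):
    """Collect href values of <a> tags that satisfy a predicate."""
    def __init__(self, keep):
        super().__init__()
        self.keep = keep      # predicate: str -> bool
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and self.keep(value):
                    self.links.append(value)


# Example: keep only links that look like version directories.
parser = LinkCollector(lambda href: 'v' in href)
parser.feed('<a href="v16.2/">v16.2</a> <a href="/about/">about</a>')
parser.close()
print(parser.links)   # ['v16.2/']

The full script follows.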
#!/usr/bin/python
# -*- coding: utf-8 -*-
from html.parser import HTMLParser
import os
import socket
import sys
import urllib.error
from urllib import request


def out_log(logfile, message):
    """Append a message line to the given log file."""
    with open(logfile, 'a') as log:
        log.write(message + '\n')


class myparser(HTMLParser):
    """Collect the href values of <a> tags that look like version directories."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for variable, value in attrs:
                # Keep version-directory links, but skip the /developer/ entry.
                if variable == 'href' and 'v' in value and value != '/developer/':
                    self.links.append(value)
                    print('links===', self.links)


class myparser1(HTMLParser):
    """Collect the href values of <a> tags that point at tar.gz archives."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for variable, value in attrs:
                # Keep only the versioned source tarball links.
                if variable == 'href' and 'v' in value and 'tar.gz' in value:
                    self.links.append(value)
                    print('links===', self.links)


def callbackfunc(blocknum, blocksize, totalsize):
    """Report hook for urlretrieve: print download progress.
    @blocknum: number of blocks transferred so far
    @blocksize: size of one block in bytes
    @totalsize: size of the remote file (may be -1 if unknown)
    """
    if totalsize > 0:
        percent = min(int(100.0 * blocknum * blocksize / totalsize), 100)
    else:
        # Size unknown: just report completion.
        percent = 100
    sys.stdout.write('\r')
    sys.stdout.write(file_name + percent * '>' + str(percent) + '%')
    sys.stdout.flush()


def create_dir(root_tree, catalog):
    """Create a local folder mirroring the URL's directory structure."""
    os.chdir(root_tree)
    try:
        # out_log('G:\\HK\\simple\\out.log', 'creating directory: ' + catalog)
        os.makedirs(catalog)
    except FileExistsError:
        pass


# Failed downloads are collected here so they can be retried later.
error_download = {}


def download_file(url, down_path):
    """Download a file into the matching directory; record failures in error_download."""
    global file_name
    file_name = url.split('/')[-1]
    socket.setdefaulttimeout(600)
    try:
        if os.path.exists(down_path):
            # Already downloaded: skip it so the script can be re-run safely.
            print("skipping existing file:", down_path)
        else:
            request.urlretrieve(url, down_path, callbackfunc)
    except socket.gaierror:
        error_download[url] = down_path
    except urllib.error.URLError:
        error_download[url] = down_path
    sys.stdout.write('\n')


def get_url_tree(url_tree):
    """Build a dict of {child url: local directory}, creating the folders as it goes."""
    url_tree_dict = {}
    for url in url_tree:
        print('url', url)
        try:
            response = request.urlopen(url)
            page = response.read().decode('utf-8')
            hp = myparser()
            hp.feed(page)
            hp.close()
        except urllib.error.URLError as e:
            print(e)
            continue
        # Deduplicate the collected links before recursing into them.
        for file in set(hp.links):
            print("file1", file)
            create_dir(url_tree[url], file)
            url_tree_dict[url + file] = url_tree[url] + file
            print("url_tree_dict", url_tree_dict)

    return url_tree_dict


def get_download_url_tree(url_tree_dict):
    """Visit each version-directory page and download every tarball it links to."""
    for url in url_tree_dict:
        print('url', url)
        try:
            response = request.urlopen(url)
            page = response.read().decode('utf-8')
            hp = myparser1()
            hp.feed(page)
            hp.close()
        except urllib.error.URLError as e:
            print(e)
            continue
        for file in set(hp.links):
            print("file", file)
            download_file(file, url_tree_dict[url] + file.split('/')[-1])



if __name__ == '__main__':

    url_tree = {"https://www.postgresql.org/ftp/source/": 'G:\\HK\\ftp\\source\\'}
    try:
        os.makedirs('G:\\HK\\ftp\\source\\')
    except FileExistsError as e:
        print(e)

    # Walk the directory tree one level at a time; stop once a pass
    # discovers no new links.
    while url_tree:
        url_tree = get_url_tree(url_tree)
        print('test====', url_tree)
        get_download_url_tree(url_tree)

    # for key in error_download:
    #     download_file(key, error_download[key])
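The retry pass at the end is left commented out. error_download maps each failed url to its target path, so a single extra attempt per entry might look like the sketch below; the retry policy itself is an assumption, not part of the original.

    # Hypothetical retry pass (assumption: one extra attempt per failed url).
    # download_file re-records urls that fail again, so snapshot and clear
    # the dict before iterating over it.
    failed = dict(error_download)
    error_download.clear()
    for key, path in failed.items():
        download_file(key, path)
    if error_download:
        print('still failing after retry:', list(error_download))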

