Crawling the PostgreSQL Source Code (a Web Crawler)

This article presents a Python web crawler that parses HTML pages, picks out specific links, and downloads the linked files into a local directory tree. A custom HTML parser filters the a tags that point at the target files, and the crawler follows those links to fetch the files. The script also includes error handling and logging.
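Before the full script, here is a minimal sketch of its core technique: subclass html.parser.HTMLParser and override handle_starttag to collect the href values of a tags that pass a filter. The LinkCollector name and the sample predicate are illustrative only; the real script defines two hard-coded parser classes instead.

from html.parser import HTMLParser


class LinkCollector(HTMLParser):
    """Collect href values of <a> tags that satisfy a predicate."""
    def __init__(self, keep):
        super().__init__()
        self.keep = keep      # predicate: str -> bool
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and self.keep(value):
                    self.links.append(value)


# Example: keep only links that look like version directories.
parser = LinkCollector(lambda href: 'v' in href)
parser.feed('<a href="v16.2/">v16.2</a> <a href="/about/">about</a>')
parser.close()
print(parser.links)   # ['v16.2/']

The full script follows.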
#!/usr/bin/python
# -*- coding: utf-8 -*-
from html.parser import HTMLParser
import os
import socket
import sys
import urllib.error
from urllib import request


def out_log(logfile, message):
    """Append a message line to the given log file."""
    with open(logfile, 'a') as log:
        log.write(message + '\n')


class myparser(HTMLParser):
    """Collect the href values of <a> tags that look like version directories."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for variable, value in attrs:
                # Keep version-directory links, but skip the /developer/ entry.
                if variable == 'href' and 'v' in value and value != '/developer/':
                    self.links.append(value)
                    print('links===', self.links)


class myparser1(HTMLParser):
    """Collect the href values of <a> tags that point at tar.gz archives."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for variable, value in attrs:
                # Keep only the versioned source tarball links.
                if variable == 'href' and 'v' in value and 'tar.gz' in value:
                    self.links.append(value)
                    print('links===', self.links)


def callbackfunc(blocknum, blocksize, totalsize):
    """Report hook for urlretrieve: print download progress.
    @blocknum: number of blocks transferred so far
    @blocksize: size of one block in bytes
    @totalsize: size of the remote file (may be -1 if unknown)
    """
    if totalsize > 0:
        percent = min(int(100.0 * blocknum * blocksize / totalsize), 100)
    else:
        # Size unknown: just report completion.
        percent = 100
    sys.stdout.write('\r')
    sys.stdout.write(file_name + percent * '>' + str(percent) + '%')
    sys.stdout.flush()


def create_dir(root_tree, catalog):
    """Create a local folder mirroring the URL's directory structure."""
    os.chdir(root_tree)
    try:
        # out_log('G:\\HK\\simple\\out.log', 'creating directory: ' + catalog)
        os.makedirs(catalog)
    except FileExistsError:
        pass


# Failed downloads are collected here so they can be retried later.
error_download = {}


def download_file(url, down_path):
    """Download a file into the matching directory; record failures in error_download."""
    global file_name
    file_name = url.split('/')[-1]
    socket.setdefaulttimeout(600)
    try:
        if os.path.exists(down_path):
            # Already downloaded: skip it so the script can be re-run safely.
            print("skipping existing file:", down_path)
        else:
            request.urlretrieve(url, down_path, callbackfunc)
    except socket.gaierror:
        error_download[url] = down_path
    except urllib.error.URLError:
        error_download[url] = down_path
    sys.stdout.write('\n')


def get_url_tree(url_tree):
    """Build a dict of {child url: local directory}, creating the folders as it goes."""
    url_tree_dict = {}
    for url in url_tree:
        print('url', url)
        try:
            response = request.urlopen(url)
            page = response.read().decode('utf-8')
            hp = myparser()
            hp.feed(page)
            hp.close()
        except urllib.error.URLError as e:
            print(e)
            continue
        # Deduplicate the collected links before recursing into them.
        for file in set(hp.links):
            print("file1", file)
            create_dir(url_tree[url], file)
            url_tree_dict[url + file] = url_tree[url] + file
            print("url_tree_dict", url_tree_dict)

    return url_tree_dict


def get_download_url_tree(url_tree_dict):
    """Visit each version-directory page and download every tarball it links to."""
    for url in url_tree_dict:
        print('url', url)
        try:
            response = request.urlopen(url)
            page = response.read().decode('utf-8')
            hp = myparser1()
            hp.feed(page)
            hp.close()
        except urllib.error.URLError as e:
            print(e)
            continue
        for file in set(hp.links):
            print("file", file)
            download_file(file, url_tree_dict[url] + file.split('/')[-1])



if __name__ == '__main__':

    url_tree = {"https://www.postgresql.org/ftp/source/": 'G:\\HK\\ftp\\source\\'}
    try:
        os.makedirs('G:\\HK\\ftp\\source\\')
    except FileExistsError as e:
        print(e)

    # Walk the directory tree one level at a time; stop once a pass
    # discovers no new links.
    while url_tree:
        url_tree = get_url_tree(url_tree)
        print('test====', url_tree)
        get_download_url_tree(url_tree)

    # for key in error_download:
    #     download_file(key, error_download[key])
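The retry pass at the end is left commented out. error_download maps each failed url to its target path, so a single extra attempt per entry might look like the sketch below; the retry policy itself is an assumption, not part of the original.

    # Hypothetical retry pass (assumption: one extra attempt per failed url).
    # download_file re-records urls that fail again, so snapshot and clear
    # the dict before iterating over it.
    failed = dict(error_download)
    error_download.clear()
    for key, path in failed.items():
        download_file(key, path)
    if error_download:
        print('still failing after retry:', list(error_download))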

