This post shows how to delete files and directories in Azure Data Lake Storage Gen2 using the Python SDK. The `DirectoryClient` class provides `ls_files`, `ls_dirs`, `rm`, and `rmdir` methods for listing and deleting files and directories, plus a `droptable` method for dropping the corresponding Spark SQL table. The example at the end recursively deletes the directories backing several tables.
Deleting blob objects in Azure Data Lake Storage Gen2
# -*- encoding: utf-8 -*-
import os
import time
from retrying import retry
from azure.storage.blob import BlobServiceClient


class DirectoryClient:
    def __init__(self, connection_string, container_name):
        service_client = BlobServiceClient.from_connection_string(connection_string)
        self.client = service_client.get_container_client(container_name)

    def ls_files(self, path, recursive=False):
        """
        List all files under the given path.
        :param path: directory path (blob name prefix)
        :param recursive: whether to recurse into sub-directories
        """
        if not path == '' and not path.endswith('/'):
            path += '/'
        blob_iter = self.client.list_blobs(name_starts_with=path)
        files = []
        for blob in blob_iter:
            relative_path = os.path.relpath(blob.name, path)
            if recursive or not '/' in relative_path:
                files.append(relative_path)
        return files

    def ls_dirs(self, path, recursive=False):
        """
        List all sub-directories under the given path.
        """
        if not path == '' and not path.endswith('/'):
            path += '/'
        blob_iter = self.client.list_blobs(name_starts_with=path)
        dirs = []
        for blob in blob_iter:
            relative_dir = os.path.dirname(os.path.relpath(blob.name, path))
            if relative_dir and (recursive or not '/' in relative_dir) and not relative_dir in dirs:
                dirs.append(relative_dir)
        return dirs

    def rm(self, path, recursive=False):
        """
        Delete the blob at the given path (or the whole path recursively).
        """
        if recursive:
            self.rmdir(path)
        else:
            print(f'Deleting {path}')
            self.client.delete_blob(path)

    def rmdir(self, path):
        """
        Recursively delete everything (sub-directories and files) under the given path.
        """
        blobs = self.ls_files(path, recursive=True)
        if not blobs:
            return
        if not path == '' and not path.endswith('/'):
            path += '/'
        blobs_list = [path + blob for blob in blobs]
        blobs_length = len(blobs_list)
        if blobs_length <= 200:
            self.client.delete_blobs(*blobs_list)
        else:
            # delete_blobs() is a batch operation (at most 256 sub-requests per call),
            # so delete in chunks of 200 blobs
            start = 0
            end = 200
            while end <= blobs_length:
                self.client.delete_blobs(*blobs_list[start:end])
                start = start + 200
                end = end + 200
            if start < blobs_length:
                self.client.delete_blobs(*blobs_list[start:blobs_length])
        print(path + ': blobs deleted')

    def droptable(self, dbName, tableName):
        # 'spark' is the SparkSession provided by the notebook environment (e.g. Databricks/Synapse)
        spark.sql(f'DROP TABLE IF EXISTS {dbName}.{tableName}')
        print('{0}.{1} dropped'.format(dbName, tableName))


@retry(stop_max_attempt_number=20, wait_incrementing_increment=200)
def main():
    blob_connect_string = 'DefaultEndpointsProtocol=https;AccountName=aalsabddev01e2;AccountKey=xxxxxxxxxxxxxxxxxxx==;EndpointSuffix=core.chinacloudapi.cn'
    container_name = 'dwm-storage'
    dbName = 'dwm_dev'
    Client = DirectoryClient(connection_string=blob_connect_string, container_name=container_name)
    del_table_list = ['m02_iap_track_logon_evt', 'm02_iap_track_vist_evt', 'm02_iap_track_page_clos_evt']
    for table in del_table_list:
        Client.rmdir(table)
        # Client.rm(table)
        Client.droptable(dbName, table)
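Note that the script defines main() but never invokes it; in a notebook you would simply call main() after the definitions. For reference, below is a minimal usage sketch of the listing and delete helpers on their own. The connection string, container name, and blob/directory names are placeholders for illustration, not values from the post.

# Minimal usage sketch -- connection string, container and paths are placeholders.
conn_str = 'DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn'
client = DirectoryClient(connection_string=conn_str, container_name='<container>')

# Inspect a table directory before deleting anything.
print(client.ls_dirs('m02_iap_track_logon_evt', recursive=True))
print(client.ls_files('m02_iap_track_logon_evt', recursive=True))

# Delete a single blob (hypothetical file name) ...
client.rm('m02_iap_track_logon_evt/part-00000.parquet')
# ... or remove the whole directory tree.
client.rmdir('m02_iap_track_logon_evt')

Removing the directory and dropping the table are kept as separate steps: DROP TABLE does not delete the underlying data of an external table, so the storage is cleaned up explicitly with rmdir before the table definition is dropped.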