This post shows how to delete files and directories in Azure Data Lake Storage Gen2 using the Python SDK. The `DirectoryClient` class provides `ls_files`, `ls_dirs`, `rm`, and `rmdir` methods for listing and deleting files and directories, plus a `droptable` method for dropping the corresponding Spark SQL table. The example at the end recursively deletes the directories backing several tables.
Deleting blob objects in Azure Data Lake Storage Gen2
# -*- encoding: utf-8 -*-
import os
import time
from retrying import retry
from azure.storage.blob import BlobServiceClient


class DirectoryClient:
    def __init__(self, connection_string, container_name):
        service_client = BlobServiceClient.from_connection_string(connection_string)
        self.client = service_client.get_container_client(container_name)

    def ls_files(self, path, recursive=False):
        """
        List all files under the given path.
        :param path: directory path (blob name prefix)
        :param recursive: whether to recurse into sub-directories
        """
        if not path == '' and not path.endswith('/'):
            path += '/'
        blob_iter = self.client.list_blobs(name_starts_with=path)
        files = []
        for blob in blob_iter:
            relative_path = os.path.relpath(blob.name, path)
            if recursive or not '/' in relative_path:
                files.append(relative_path)
        return files

    def ls_dirs(self, path, recursive=False):
        """
        List all sub-directories under the given path.
        """
        if not path == '' and not path.endswith('/'):
            path += '/'
        blob_iter = self.client.list_blobs(name_starts_with=path)
        dirs = []
        for blob in blob_iter:
            relative_dir = os.path.dirname(os.path.relpath(blob.name, path))
            if relative_dir and (recursive or not '/' in relative_dir) and not relative_dir in dirs:
                dirs.append(relative_dir)
        return dirs

    def rm(self, path, recursive=False):
        """
        Delete the blob at the given path (or the whole path recursively).
        """
        if recursive:
            self.rmdir(path)
        else:
            print(f'Deleting {path}')
            self.client.delete_blob(path)

    def rmdir(self, path):
        """
        Recursively delete everything (sub-directories and files) under the given path.
        """
        blobs = self.ls_files(path, recursive=True)
        if not blobs:
            return
        if not path == '' and not path.endswith('/'):
            path += '/'
        blobs_list = [path + blob for blob in blobs]
        blobs_length = len(blobs_list)
        if blobs_length <= 200:
            self.client.delete_blobs(*blobs_list)
        else:
            # delete_blobs() is a batch operation (at most 256 sub-requests per call),
            # so delete in chunks of 200 blobs
            start = 0
            end = 200
            while end <= blobs_length:
                self.client.delete_blobs(*blobs_list[start:end])
                start = start + 200
                end = end + 200
            if start < blobs_length:
                self.client.delete_blobs(*blobs_list[start:blobs_length])
        print(path + ': blobs deleted')

    def droptable(self, dbName, tableName):
        # 'spark' is the SparkSession provided by the notebook environment (e.g. Databricks/Synapse)
        spark.sql(f'DROP TABLE IF EXISTS {dbName}.{tableName}')
        print('{0}.{1} dropped'.format(dbName, tableName))


@retry(stop_max_attempt_number=20, wait_incrementing_increment=200)
def main():
    blob_connect_string = 'DefaultEndpointsProtocol=https;AccountName=aalsabddev01e2;AccountKey=xxxxxxxxxxxxxxxxxxx==;EndpointSuffix=core.chinacloudapi.cn'
    container_name = 'dwm-storage'
    dbName = 'dwm_dev'
    Client = DirectoryClient(connection_string=blob_connect_string, container_name=container_name)
    del_table_list = ['m02_iap_track_logon_evt', 'm02_iap_track_vist_evt', 'm02_iap_track_page_clos_evt']
    for table in del_table_list:
        Client.rmdir(table)
        # Client.rm(table)
        Client.droptable(dbName, table)
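Note that the script defines main() but never invokes it; in a notebook you would simply call main() after the definitions. For reference, below is a minimal usage sketch of the listing and delete helpers on their own. The connection string, container name, and blob/directory names are placeholders for illustration, not values from the post.

# Minimal usage sketch -- connection string, container and paths are placeholders.
conn_str = 'DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn'
client = DirectoryClient(connection_string=conn_str, container_name='<container>')

# Inspect a table directory before deleting anything.
print(client.ls_dirs('m02_iap_track_logon_evt', recursive=True))
print(client.ls_files('m02_iap_track_logon_evt', recursive=True))

# Delete a single blob (hypothetical file name) ...
client.rm('m02_iap_track_logon_evt/part-00000.parquet')
# ... or remove the whole directory tree.
client.rmdir('m02_iap_track_logon_evt')

Removing the directory and dropping the table are kept as separate steps: DROP TABLE does not delete the underlying data of an external table, so the storage is cleaned up explicitly with rmdir before the table definition is dropped.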