1. 背景
从“机器A”搬迁数据到“机器B”,待搬迁的数据目录为“/data/test/”;
思路:在机器A上将待搬迁目录打包压缩为 test.tar.gz,切分成若干小文件,分批上传至华为云 OBS;然后在机器B上下载并合并这些小文件,最后解压缩即可。
2. 数据搬迁
2.1 打包压缩&切分
1. 打包压缩
nohup tar zcvf test.tar.gz ../test/ &
2. 切分(每个1G)
# Split the archive into 1 GiB parts named test.tar.gz.000, test.tar.gz.001, ...
# split reads the archive directly, so nohup protects the process doing the work.
nohup split -b 1024M -d -a 3 test.tar.gz ./test.tar.gz. &
子文件如下(文件名形如 test.tar.gz.000、test.tar.gz.001、…,后缀为三位数字序号):

2.2 上传&下载
2.2.1 Client 类实现
1. 安装华为云obs包
pip install esdk-obs-python
2. 基于ObsClient实现Client类
import os
import cv2
import base64
import uuid
import tqdm
import numpy as np
from glob import glob
# import pandas as pd
# from pandarallel import pandarallel
from obs import ObsClient, DeleteObjectsRequest
# Connection settings that are unpacked into ObsClient(**CONFIG).
# NOTE(review): the access key pair is embedded in source here; consider
# loading it from the environment or a secrets store instead.
CONFIG = dict(
    server='obs.af-south-1.myhuaweicloud.com',
    access_key_id='SKB...TAJ',
    secret_access_key='gOSCQ0k...AsG',
)

# Local scratch directory for temporary image files created before upload.
TMP_FILE_PATH = '/tmp/label_tool_report/'
class Client:
    """Convenience wrapper around a Huawei Cloud OBS ``ObsClient``.

    The underlying client is built from the module-level ``CONFIG`` mapping.
    Several methods report their outcome as an ``'OK'`` / ``'ERROR'`` status
    string instead of raising.
    """

    def __init__(self):
        """Create the underlying ``ObsClient`` from ``CONFIG``."""
        self.obs_client = ObsClient(**CONFIG)

    def clean_files(self, bucket_name, prefix):
        """Delete the objects listed under ``prefix`` in ``bucket_name``.

        Args:
            bucket_name: Name of the bucket to clean.
            prefix: Key prefix selecting the objects to delete.

        Returns:
            ``'OK'`` when the listing succeeded and either nothing matched or
            the delete request succeeded; ``'ERROR'`` otherwise.
        """
        resp = self.obs_client.listObjects(bucket_name, prefix)
        if resp.status >= 300:
            return 'ERROR'
        contents = resp.body.contents
        if not contents:
            # Nothing matched the prefix: skip the delete request instead of
            # sending one with an empty object list.
            return 'OK'
        # NOTE(review): only the entries returned by this single listObjects
        # call are deleted; if the service pages its listings, a large prefix
        # may need repeated calls -- confirm against the OBS SDK docs.
        delete_resp = self.obs_client.deleteObjects(
            bucket_name, DeleteObjectsRequest(objects=contents)
        )
        # Report a failed delete instead of unconditionally answering 'OK'.
        return 'OK' if delete_resp.status < 300 else 'ERROR'

    def read_img(self, bucket_name, image_key):
        """Fetch an object into memory and decode it as a colour image.

        Args:
            bucket_name: Bucket holding the image.
            image_key: Object key of the image.

        Returns:
            ``('OK', image)`` with the result of ``cv2.imdecode`` on success,
            or ``('ERROR', (errorMessage, errorCode))`` when the request fails.
        """
        resp = self.obs_client.getObject(bucket_name, image_key, loadStreamInMemory=True)
        if resp.status < 300:
            image_array = np.frombuffer(resp.body.buffer, dtype=np.uint8)
            image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
            return 'OK', image
        return 'ERROR', (resp.errorMessage, resp.errorCode)

    def upload_img(self, b64img, prefix):
        """Upload a base64-encoded image under ``prefix`` via a temporary file.

        Args:
            b64img: Base64-encoded image bytes.
            prefix: Key prefix (remote directory) for the uploaded object.

        Returns:
            A ``(bucket_name, object_key)`` tuple for the uploaded image.
        """
        filename = self.generate_tmp_filename()
        localfile = os.path.join(TMP_FILE_PATH, filename)
        target = os.path.join(prefix, filename)
        self.generate_tmp_img(b64img, localfile)
        try:
            self.obs_client.putFile('bd-pai', target, localfile)
        finally:
            # Always drop the scratch file, even if the upload raised, and do
            # it without spawning a shell.
            os.remove(localfile)
        # NOTE(review): the upload above goes to bucket 'bd-pai' but the name
        # returned here is 'opay-pai' -- confirm which one is intended.
        return 'opay-pai', target

    def upload_file(self, bucket_name, target, local):
        """Upload the local file ``local`` to ``bucket_name`` under key ``target``."""
        self.obs_client.putFile(bucket_name, target, local)

    @staticmethod
    def generate_tmp_img(b64img, savepath):
        """Decode ``b64img`` from base64 and write the bytes to ``savepath``."""
        img_data = base64.b64decode(b64img)
        with open(savepath, 'wb') as f:
            f.write(img_data)

    @staticmethod
    def generate_tmp_filename():
        """Return a random ``<hex>.jpg`` file name.

        Also ensures that the ``TMP_FILE_PATH`` scratch directory exists.
        """
        os.makedirs(TMP_FILE_PATH, exist_ok=True)
        # The separator dot was missing, which produced names like '<hex>jpg'.
        return uuid.uuid4().hex + '.jpg'

    def show_file_list(self, bucket_name, prefix):
        """List the objects under ``prefix`` in ``bucket_name``.

        Returns:
            The listing entries (``resp.body.contents``) on success, or
            ``None`` when the listing request fails.
        """
        resp = self.obs_client.listObjects(bucket_name, prefix)
        if resp.status < 300:
            return resp.body.contents
        return None

    def download_files(self, bucket_name, prefix, save_path='.', if_exists='replace'):
        """Download every object under ``prefix`` into ``save_path``.

        The remote key layout is mirrored locally, e.g. the key
        ``shanghai/test.tar.gz.001`` is saved as
        ``<save_path>/shanghai/test.tar.gz.001``.

        Args:
            bucket_name: Bucket to download from.
            prefix: Key prefix selecting the objects to download.
            save_path: Local root directory for the downloaded files.
            if_exists: ``'continue'`` skips objects whose local file already
                exists; any other value downloads them again.

        Raises:
            RuntimeError: If the objects under ``prefix`` cannot be listed.
        """
        files = self.show_file_list(bucket_name, prefix)
        if files is None:
            # Fail with a clear message instead of a TypeError from iterating None.
            raise RuntimeError(
                f'failed to list objects under {bucket_name}/{prefix}'
            )
        for file in tqdm.tqdm(files):
            key = file.key
            if key.endswith('/'):
                # Directory placeholder: it has no file name to download into.
                continue
            dir_path = os.path.join(save_path, os.path.dirname(key))
            os.makedirs(dir_path, exist_ok=True)
            local = os.path.join(dir_path, os.path.basename(key))
            # NOTE(review): 'continue' only checks that the file exists, so a
            # partially downloaded file would also be skipped.
            if if_exists == 'continue' and os.path.exists(local):
                continue
            # downloadFile expects the object key string, not the listing entry.
            self.obs_client.downloadFile(bucket_name, key, local)
if __name__ == '__main__':
    # Script entry point: create the OBS wrapper that the upload and download
    # steps operate on.
    client = Client()
2.2.2 数据上传
在main()中调用
# Build the OBS wrapper.
client = Client()
# Clear the remote directory before every upload run.
client.clean_files('bd-pai', 'mydir')
# ----- Upload data to Huawei Cloud -----
# Push the split parts from /data/split_data on machine A to the 'mydir'
# directory of bucket 'bd-pai'.
files = glob('/data/split_data/test.tar.gz.*')
for part_path in tqdm.tqdm(files):
    remote_key = os.path.join('mydir', os.path.basename(part_path))
    client.upload_file('bd-pai', remote_key, part_path)
2.2.3 数据下载
# ------ Download data from Huawei Cloud ------
# Fetch everything under the 'mydir' prefix of bucket 'bd-pai' into the
# current directory on machine B; files that already exist locally are skipped.
# Pass if_exists='replace' instead to download them again and overwrite.
client.download_files(bucket_name='bd-pai', prefix='mydir', save_path='.', if_exists='continue')
2.2.4 查看数据
# ------ List the objects stored on Huawei Cloud ------
for item in client.show_file_list('bd-pai', 'mydir'):
    print(item)
# ------ Delete the objects from Huawei Cloud ------
client.clean_files('bd-pai', 'mydir')
2.3 合并&解压缩
1. 合并小文件
cat test.tar.gz.* > test.tar.gz  # shell 通配符按文件名排序展开,因此各分片会按数字后缀顺序合并
亦可:
# Truncate the target first so the loop never appends to an existing test.tar.gz;
# adjust the upper bound (106) to the index of the last part.
: > test.tar.gz; for i in {000..106}; do file_name="test.tar.gz.$i"; echo "${file_name}"; cat "${file_name}" >> test.tar.gz; done
2. 解压缩
tar -zxvf test.tar.gz -C .
本文介绍如何使用Python实现数据从机器A到华为云的自动化迁移,包括打包压缩、切分文件、上传下载、合并解压等步骤,并展示了ObsClient类的详细实现。
1075

被折叠的 条评论
为什么被折叠?



