Python script that syncs a collection's schema and data from SOURCE_MILVUS to SINK_MILVUS for a given COLLECTION_NAME. It paginates with batched queries, so it is only suitable for small datasets (see the query_iterator sketch after the script for larger collections).
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, utility
# ========== Configuration ==========
SOURCE_MILVUS_HOST = '1.1.1.1'
SOURCE_MILVUS_PORT = '19530'
SINK_MILVUS_HOST = '2.2.2.2'
SINK_MILVUS_PORT = '19530'
COLLECTION_NAME = 'MILVUS_COLLECTION_1'
BATCH_SIZE = 1000
# ========== Connections ==========
connections.connect("source", host=SOURCE_MILVUS_HOST, port=SOURCE_MILVUS_PORT)
connections.connect("sink", host=SINK_MILVUS_HOST, port=SINK_MILVUS_PORT)
# ========== Fetch the source schema ==========
source_collection = Collection(name=COLLECTION_NAME, using='source')
source_schema = source_collection.schema
print(f"Source schema: {source_schema}")
# ========== Create the same collection on the sink Milvus ==========
if utility.has_collection(COLLECTION_NAME, using='sink'):
    print(f"Sink collection {COLLECTION_NAME} already exists, dropping it first.")
    utility.drop_collection(COLLECTION_NAME, using='sink')
sink_fields = []
for field in source_schema.fields:
    # field.params carries the type params (dim for vectors, max_length for VARCHAR);
    # FieldSchema has no .dim attribute, so copy them via **field.params instead.
    sink_fields.append(FieldSchema(
        name=field.name,
        dtype=field.dtype,
        is_primary=field.is_primary,
        auto_id=field.auto_id,
        description=field.description,
        **(field.params or {})
    ))
sink_schema = CollectionSchema(fields=sink_fields, description=source_schema.description)
sink_collection = Collection(name=COLLECTION_NAME, schema=sink_schema, using='sink')
print("Sink collection created.")
# ========== Batch data migration ==========
offset = 0
total_inserted = 0
output_fields = [f.name for f in source_schema.fields]
# If the primary key uses auto_id, the sink generates new IDs itself,
# so that column must be left out of the insert payload.
insert_fields = [f.name for f in source_schema.fields if not (f.is_primary and f.auto_id)]
source_collection.load()  # Load once up front, not on every batch.
while True:
    # Note: Milvus caps offset + limit for query (16384 by default),
    # which is why this approach only works for small collections.
    results = source_collection.query(
        expr="",  # An empty expr matches all entities.
        offset=offset,
        limit=BATCH_SIZE,
        output_fields=output_fields
    )
    if not results:
        print("No more data to copy.")
        break
    # Convert the rows into the column-based insert format (one list per field).
    column_data = {key: [] for key in insert_fields}
    for row in results:
        for key in insert_fields:
            column_data[key].append(row[key])
    sink_collection.insert([column_data[key] for key in insert_fields])
    total_inserted += len(results)
    print(f"Inserted batch: {len(results)}, total: {total_inserted}")
    offset += BATCH_SIZE
# ========== Final flush (optional) ==========
sink_collection.flush()
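# ========== Copy indexes (optional) ==========
# Added sketch, not part of the original script: the sync above recreates the
# schema and data but not the indexes. This assumes pymilvus exposes the source
# index definitions via Collection.indexes and accepts them in create_index.
for index in source_collection.indexes:
    sink_collection.create_index(field_name=index.field_name, index_params=index.params)
    print(f"Created index on field {index.field_name}.")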
print("All data migrated and flushed.")
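Because Milvus caps offset + limit for query, the pagination above cannot scale past small collections. For larger ones, one alternative is pymilvus's query_iterator (available from 2.3 onward), which streams batches without offset bookkeeping. A minimal sketch reusing the connections and variables defined in the script, assuming next() returns an empty list once the iterator is exhausted:

iterator = source_collection.query_iterator(
    batch_size=BATCH_SIZE,
    expr="",
    output_fields=output_fields
)
total = 0
while True:
    batch = iterator.next()
    if not batch:  # An empty batch means the iterator is exhausted.
        iterator.close()
        break
    sink_collection.insert([[row[key] for row in batch] for key in insert_fields])
    total += len(batch)
    print(f"Inserted batch: {len(batch)}, total: {total}")
sink_collection.flush()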