环境准备
安装python3环境
# centos 安装python3
yum install python3
创建激活venv
python3 -m venv .venv
source .venv/bin/activate
zookeeper
pip install kazoo
递归复制目录
from kazoo.client import KazooClient
def copy_node(zk, source_path, destination_path):
    """
    Recursively copy a znode subtree to a new path.

    :param zk: a started KazooClient
    :param source_path: znode path to copy from
    :param destination_path: znode path to copy to
    """
    # Read the source node's payload (stat is unused here).
    data, _stat = zk.get(source_path)
    # makepath=True also creates any missing intermediate nodes of the
    # destination path, so copying into a not-yet-existing tree works.
    zk.create(destination_path, data, makepath=True)
    # Recurse into every child of the source node.
    for child in zk.get_children(source_path):
        copy_node(zk, f"{source_path}/{child}", f"{destination_path}/{child}")
# Connect to ZooKeeper and copy the whole /source_node subtree.
zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
try:
    copy_node(zk, '/source_node', '/destination_node')
finally:
    # Always release the session, even if the copy fails part-way.
    zk.stop()
设置key的值带有换行符
也可以用zkCli工具，先把值内容写入文件，再执行命令：./zkCli.sh -server 127.0.0.1:2181 set /your/znode/path "$(cat /tmp/path_value)"
from kazoo.client import KazooClient
# 连接到 Zookeeper
# Connect to ZooKeeper.
zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()

# The znode to update.
znode_path = "/your/znode/path"

# The value to set; the embedded newlines are stored verbatim.
value = """parseTreeCache:
initialCapacity: 128
maximumSize: 1024
sqlStatementCache:
initialCapacity: 2000
maximumSize: 65535"""

try:
    # ZooKeeper stores bytes, so encode the text first.
    zk.set(znode_path, value.encode('utf-8'))
finally:
    # Always close the session, even if set() fails.
    zk.stop()
weaviate
数据迁移
import time
import weaviate
import weaviate.classes as wvc
from weaviate.auth import Auth
from weaviate.client import WeaviateClient
from weaviate.collections import Collection
"""
Migrate collections from src weaviate instance to target weaviate instance.
pip install weaviate-client
"""
# Source weaviate instance connection settings (data is read from here).
SRC_HOST = ''
SRC_PORT = 8080
SRC_GRPC_PORT = 50051
SRC_API_KEY = ''
# Target weaviate instance connection settings (data is written here).
TGT_HOST = ''
TGT_PORT = 8080
TGT_GRPC_PORT = 50051
TGT_API_KEY = ''
# Client connected to the source instance.
CLIENT_SRC = weaviate.connect_to_local(
host=SRC_HOST, port=SRC_PORT, grpc_port=SRC_GRPC_PORT,
auth_credentials=Auth.api_key(SRC_API_KEY)
)
# Client connected to the target instance.
CLIENT_TGT = weaviate.connect_to_local(
host=TGT_HOST, port=TGT_PORT, grpc_port=TGT_GRPC_PORT,
auth_credentials=Auth.api_key(TGT_API_KEY)
)
def get_all_collection_names(client: WeaviateClient) -> list[str]:
    """
    Return the name of every collection on the given weaviate instance.
    :param client: connected weaviate client
    :return: list of collection names
    """
    collections = client.collections.list_all(simple=True)
    return [name for name in collections]
def create_collection_schema(
    client: WeaviateClient, collection_name: str, clean_exists_data: bool = False
) -> None:
    """
    Ensure a collection with the given name exists on the instance.

    :param client: target weaviate client
    :param collection_name: name of the collection to create
    :param clean_exists_data: if True, drop and recreate an existing collection
    :return:
    """
    already_exists = client.collections.exists(collection_name)
    if already_exists and not clean_exists_data:
        # Keep the existing collection and its data untouched.
        return
    if already_exists:
        print(f"Collection {collection_name} exists, force create")
        client.collections.delete(collection_name)
    # Add detailed collection configuration here if needed.
    client.collections.create(name=collection_name)
def migrate_collection_data(
    client_src: WeaviateClient, client_tgt: WeaviateClient, collection_name: str, batch_size: int = 100
) -> None:
    """
    Copy every object (including vectors) of a collection from the source
    instance into the same-named collection on the target instance.
    :param client_src: source weaviate client
    :param client_tgt: target weaviate client
    :param collection_name: name of the collection to migrate
    :param batch_size: number of objects per batch write
    :return:
    """
    source = client_src.collections.get(collection_name)
    target = client_tgt.collections.get(collection_name)
    started = time.perf_counter()
    print(f'Migrating collection data for {collection_name}')
    with target.batch.fixed_size(batch_size) as batch:
        for obj in source.iterator(include_vector=True):
            vector = obj.vector
            # A single unnamed vector comes back under the key 'default';
            # unwrap it so the object is inserted with the plain vector form.
            if len(vector) == 1 and 'default' in vector:
                vector = vector['default']
            batch.add_object(uuid=obj.uuid, properties=obj.properties, vector=vector, references=obj.references)
    print(f'Migrated collection data for {collection_name}, time elapsed: {time.perf_counter() - started}')
if __name__ == '__main__':
    try:
        collection_names = get_all_collection_names(CLIENT_SRC)
        print(f'Found {len(collection_names)} collections in src weaviate instance, details: {collection_names}')
        for _collection_name in collection_names:
            # Create the schema first, then stream the objects across.
            create_collection_schema(CLIENT_TGT, _collection_name, clean_exists_data=False)
            migrate_collection_data(CLIENT_SRC, CLIENT_TGT, _collection_name)
    finally:
        # Close both clients even when a migration step fails.
        CLIENT_SRC.close()
        CLIENT_TGT.close()
添加列
import weaviate
from weaviate.auth import Auth
from weaviate.client import WeaviateClient
from weaviate.collections.classes.config import DataType, Property
"""
Add property to the collections which start with ${COLLECTION_PREFIX}.
pip install weaviate-client -i https://pypi.tuna.tsinghua.edu.cn/simple
"""
# Weaviate instance connection settings.
HOST = 'localhost'
PORT = 8080
GRPC_PORT = 50051
API_KEY = '<API_KEY>'
# Only collections whose name starts with this prefix are updated.
COLLECTION_PREFIX = '<replace me with collection name prefix>'
# Properties to add to each matching collection (skipped when already present).
PROPERTIES_TO_ADD = [
Property(name='api_url', data_type=DataType.TEXT),
Property(name='http_method', data_type=DataType.TEXT),
Property(name='api_type', data_type=DataType.TEXT),
Property(name='superclasses', data_type=DataType.TEXT_ARRAY),
Property(name='class_type', data_type=DataType.TEXT),
Property(name='method_name', data_type=DataType.TEXT),
Property(name='class_name', data_type=DataType.TEXT),
]
# Client connected to the weaviate instance.
CLIENT = weaviate.connect_to_local(
host=HOST, port=PORT, grpc_port=GRPC_PORT,
auth_credentials=Auth.api_key(API_KEY)
)
def get_all_collection_names(client: WeaviateClient) -> list[str]:
    """
    List the names of all collections on the weaviate instance.
    :param client: connected weaviate client
    :return: list of collection names
    """
    return list(client.collections.list_all(simple=True))
def add_property_if_not_exists(collect_name: str):
    """
    Add every property in PROPERTIES_TO_ADD to the collection, skipping
    any that the collection schema already defines.

    :param collect_name: name of the collection to update
    """
    collection = CLIENT.collections.get(collect_name)
    current_names = {p.name for p in collection.config.get().properties}
    missing = [p for p in PROPERTIES_TO_ADD if p.name not in current_names]
    for prop in missing:
        collection.config.add_property(prop)
    if missing:
        print(f'Added properties to {collect_name}')
if __name__ == '__main__':
    try:
        collection_names = get_all_collection_names(CLIENT)
        if COLLECTION_PREFIX:
            # Restrict the update to collections matching the prefix.
            collection_names = [name for name in collection_names if name.startswith(COLLECTION_PREFIX)]
        for _collection_name in collection_names:
            add_property_if_not_exists(_collection_name)
    finally:
        # Close the client even when a schema update fails.
        CLIENT.close()
mysql
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
"""
The demonstration of how to use SQLAlchemy to operate a MySQL database.
pip install sqlalchemy pymysql -i https://pypi.tuna.tsinghua.edu.cn/simple
Here is the table creation statement for t_user. Note that the datetime type does not include timezone information.
CREATE TABLE `t_user` (
`id` bigint NOT NULL AUTO_INCREMENT,
`username` varchar(64) DEFAULT NULL,
`password` varchar(128) DEFAULT NULL,
`email` varchar(64) DEFAULT NULL,
`created_at` datetime DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
KEY `idx_user_name` (`username`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='User';
"""
# MySQL connection settings.
DB_HOST = '192.168.137.101'
DB_PORT = '3306'
DB_USER = 'root'
DB_PASS = '123456'
DB_NAME = 'demo_db'
# Create database engine
engine_options = {
"pool_size": 1,
"max_overflow": 2,
"pool_recycle": 600,
# Verify connections before use to avoid stale-connection errors.
"pool_pre_ping": True,
"connect_args": {
# Set the MySQL session timezone to UTC+8 for every new connection.
"init_command": 'SET time_zone="+08:00"'
},
"echo": False,
}
url = f'mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}?charset=utf8mb4'
db = create_engine(url=url, **engine_options)
# Create database session
Session = sessionmaker(bind=db, expire_on_commit=False)
def query_data():
    """Fetch up to 10 (username, email, created_at) rows from t_user."""
    stmt = text('select username, email, created_at from t_user limit 10')
    with Session() as session:
        return session.execute(stmt).fetchall()
if __name__ == '__main__':
    # Print the sampled rows for a quick connectivity / timezone check.
    result = query_data()
    print(f'row count: {len(result)}')
    for record in result:
        username, email, created_at = record
        print(f'username={username}, email={email}, created_at={created_at}')
pg
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
"""
The demonstration of how to use SQLAlchemy to operate a PostgreSQL database.
pip install sqlalchemy psycopg2 -i https://pypi.tuna.tsinghua.edu.cn/simple
Here is the table creation statement for t_user. Note that the TIMESTAMP type does not include timezone information.
CREATE TABLE t_user (
id BIGSERIAL PRIMARY KEY,
username VARCHAR(64),
password VARCHAR(128),
email VARCHAR(64),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_user_name ON t_user (username);
COMMENT ON TABLE t_user IS 'User';
You can use `SET timezone TO 'Asia/Shanghai';` or `SET timezone TO '+08:00';` to change the session timezone
or `SHOW timezone;` to show the current timezone.
"""
# PostgreSQL connection settings.
DB_HOST = '192.168.137.101'
DB_PORT = '5432'
DB_USER = 'postgres'
DB_PASS = '123456'
DB_NAME = 'demo_db'
# Create database engine
engine_options = {
"pool_size": 1,
"max_overflow": 2,
"pool_recycle": 600,
# Verify connections before use to avoid stale-connection errors.
"pool_pre_ping": True,
"connect_args": {
# Set the PostgreSQL session timezone for every new connection.
"options": '-c timezone=Asia/Shanghai'
},
"echo": False,
}
url = f'postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
db = create_engine(url=url, **engine_options)
# Create database session
Session = sessionmaker(bind=db, expire_on_commit=False)
def query_data():
    """Return at most 10 (username, email, created_at) rows from t_user."""
    query = text('select username, email, created_at from t_user limit 10')
    with Session() as session:
        rows = session.execute(query).fetchall()
    return rows
if __name__ == '__main__':
    # Print the sampled rows for a quick connectivity / timezone check.
    rows = query_data()
    print(f'row count: {len(rows)}')
    for username, email, created_at in rows:
        print(f'username={username}, email={email}, created_at={created_at}')