一些有用的官方参考文档:
- 术语表:https://milvus.io/cn/docs/v2.0.0/glossary.md
- 布尔表达式语法规则:https://milvus.io/cn/docs/v2.0.0/boolean.md
- Field Schema (数据)支持类型:https://milvus.io/cn/docs/v2.0.0/field_schema.md
- Build an Index(索引)支持类型(一):https://milvus.io/cn/docs/v2.0.0/build_index.md
- Build an Index(索引)支持类型(二):https://milvus.io/cn/docs/v2.0.0/index_selection.md
- 从 DataFrame 生成一个 collection schema 并创建一个 collection:https://milvus.io/cn/docs/v2.0.0/collection_schema.md
- 混合搜索查询:https://milvus.io/cn/docs/v2.0.0/hybridsearch.md
- 性能优化(注:此链接指向 v1.1.0 版本的文档,其余链接为 v2.0.0):https://milvus.io/cn/docs/v1.1.0/performance_faq.md
存储向量的 server 端服务(创建 collection 与分区、写入数据、建立索引)
import datetime
import pandas as pd
import random
import time
from pymilvus import (
connections,
utility,
FieldSchema, CollectionSchema, DataType,
Collection,
)
# Name of the Milvus collection that stores the item embeddings.
mil_name = "item_dssm_embedding"
# Number of hash partitions; the insert step also uses it as the batch size.
hash_num = 64

# Column names are supplied explicitly rather than taken from the file.
csv_columns = ['item_name', 'item_city', 'item_index', 'item_embedding']
df = pd.read_csv('/home/q/milvus_test/data/item_daily.csv', names=csv_columns)

# Convert every element of each raw embedding value to float.
# NOTE(review): if read_csv yields this column as a delimited string, the loop
# below walks it character by character -- confirm the on-disk format.
df['item_embedding'] = df['item_embedding'].apply(
    lambda raw: [float(part) for part in raw]
)

# Give each distinct city an integer id, numbered in order of first appearance.
item_city_list = df['item_city'].drop_duplicates().tolist()
item_city_dict = {city: idx for idx, city in enumerate(item_city_list)}
df['item_city_cate'] = df['item_city'].apply(lambda city: item_city_dict[city])
# Register the "default" connection alias against the local Milvus server.
connections.connect("default", host="localhost", port="19530")
connections.list_connections()  # return value is not used

# Start from a clean slate: drop any existing collection with the same name.
if utility.has_collection(mil_name):
    utility.drop_collection(mil_name)

# Schema: caller-supplied INT64 primary key, an INT64 city category,
# and a 128-dimensional float vector.
primary_key_field = FieldSchema(
    name="item_index", dtype=DataType.INT64, is_primary=True, auto_id=False
)
city_cate_field = FieldSchema(name="item_city_cate", dtype=DataType.INT64)
embedding_field = FieldSchema(
    name="item_embedding", dtype=DataType.FLOAT_VECTOR, dim=128
)
fields = [primary_key_field, city_cate_field, embedding_field]
schema = CollectionSchema(fields, mil_name)
my_mil = Collection(mil_name, schema)

# Create partitions hash_0 .. hash_{hash_num - 1}; each description equals its name.
for partition_no in range(hash_num):
    partition_label = 'hash_' + str(partition_no)
    my_mil.create_partition(
        partition_name=partition_label, description=partition_label
    )
# Cast the scalar columns to int before handing them to Milvus.
df['item_index'] = df['item_index'].astype(int)
df['item_city_cate'] = df['item_city_cate'].astype(int)

# Column-oriented lists, in the same order as the collection's fields
# (item_index, item_city_cate, item_embedding).
item_index_list = df['item_index'].tolist()
item_city_cate_list = df['item_city_cate'].tolist()
item_embedding_list = df['item_embedding'].tolist()

# Slice the three columns into aligned batches of at most hash_num rows each.
entities_list = [
    [
        item_index_list[start:start + hash_num],
        item_city_cate_list[start:start + hash_num],
        item_embedding_list[start:start + hash_num],
    ]
    for start in range(0, len(item_index_list), hash_num)
]

for i, each_entities in enumerate(entities_list):
    print(i)
    # Pick the target partition round-robin so the index always falls inside
    # hash_0 .. hash_{hash_num - 1}, however many batches there are.
    # NOTE(review): batches are not grouped by city; if partitioning by city was
    # intended, derive this value from item_city_cate instead -- confirm.
    city_hash = i % hash_num
    insert_result = my_mil.insert(
        data=each_entities, partition_name='hash_' + str(city_hash)
    )
# IVF_FLAT index on the embedding field, using L2 distance and nlist=128.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)
my_mil.create_index("item_embedding", index)

# Load the collection once the index has been created.
my_mil.load()
# Disabled alternative (bare string literal, never executed):
# insert all rows with a single insert() call instead of in batches, then load.
'''
# 插入数据
entities = [
df['item_index'].tolist(),
df['item_city_cate'].tolist(),
df['item_embedding'].tolist()
]
# 导入数据
insert_result = my_mil.insert(entities)
my_mil.load()
'''
# Disabled alternative (bare string literal, never executed):
# build the collection and insert directly from the DataFrame. The note inside
# records a CannotInferSchemaException and says the DataFrame columns must first
# be converted from object dtype to list/int types.
'''
# 或者直接从 data frame 中插入
# pymilvus.orm.exceptions.CannotInferSchemaException: <CannotInferSchemaException: (code=0, message=Cannot infer schema from empty dataframe.)>
# 需要把 df 的各个列的格式由 object 转成 list、int 等
my_mil, insert_result = Collection.construct_from_dataframe(name=mil_name, dataframe=df, primary_field='item_index', auto_id=False)
# 导入数据
my_mil.load()
'''
# Disabled cleanup snippet (bare string literal, never executed):
# delete entities matching a boolean expression, then drop the whole collection.
'''
# 按照条件删除对应的 entities
ids = [105, 213]
expr = f"item_city_cate in [{ids[0]}, {ids[1]}]"
my_mil.delete(expr)
# 删除 collection
utility.drop_collection(mil_name)
'''
查询向量的 client 端服务(向量搜索、标量查询、带过滤条件的混合搜索)
import datetime
import pandas as pd
import random
import time
from pymilvus import (
connections,
utility,
FieldSchema, CollectionSchema, DataType,
Collection,
)
mil_name = "item_dssm_embedding"

# The connection alias used here is the collection name itself.
connections.connect(mil_name, host="localhost", port="19530")
connections.list_connections()  # return value is not used
# my_mil is the object returned by connections.get_connection(); every call made
# on it below passes collection_name explicitly.
my_mil = connections.get_connection(mil_name)

# 3000 random 128-dimensional vectors; only the last two are used as queries.
vector_dim = 128
vector_count = 3000
entities = [
    [random.random() for _ in range(vector_dim)]
    for _ in range(vector_count)
]
vectors_to_search = entities[-2:]

# Search-time parameters shared by all searches below.
search_params = dict(
    metric_type="l2",
    params=dict(nprobe=10),
)
# Time a top-3 vector search restricted to partition hash_10.
start_time = time.time()
result = my_mil.search(
    collection_name=mil_name,
    data=vectors_to_search,
    anns_field="item_embedding",
    partition_names=['hash_10'],
    param=search_params,
    limit=3,
    output_fields=["item_city_cate"],
)
cost_time = time.time() - start_time

# result is iterated as groups of hits; print every hit of every group in order.
for hit in (single for group in result for single in group):
    print(f"hit: {hit}, item_city_cate field: {hit.entity.get('item_city_cate')}")
# Time a scalar query: fetch entities whose item_city_cate exceeds 104.
start_time = time.time()
result = my_mil.query(
    collection_name=mil_name,
    expr="item_city_cate > 104",
    output_fields=["item_city_cate", "item_embedding"],
)
cost_time = time.time() - start_time

# Show only the first match; guard against an empty result so that a filter
# matching nothing does not raise IndexError.
if result:
    print(f"query result:\n-{result[0]}")
else:
    print("query result: <empty>")
# Time a top-3 vector search combined with a scalar filter (item_city_cate > 504).
start_time = time.time()
result = my_mil.search(
    collection_name=mil_name,
    data=vectors_to_search,
    anns_field="item_embedding",
    param=search_params,
    limit=3,
    expression="item_city_cate > 504",
    output_fields=["item_city_cate"],
)
# Stop the clock right after the call, as the other timings in this script do,
# so the printing below is not counted in the measured search time.
cost_time = time.time() - start_time

for hits in result:
    for hit in hits:
        print(f"hit: {hit}, item_city_cate field: {hit.entity.get('item_city_cate')}")
# Filtered top-3 search for the first query vector only.
single_query = [vectors_to_search[0]]
result = my_mil.search(
    collection_name=mil_name,
    data=single_query,
    anns_field="item_embedding",
    param=search_params,
    limit=3,
    expression="item_city_cate > 504",
    output_fields=["item_city_cate"],
)

# Flatten every hit into a plain dict holding its id, distance and city category.
result_list = [
    {
        '=index': hit.id,
        'distance': hit.distance,
        'item_city_cate': hit.entity.get('item_city_cate'),
    }
    for hits in result
    for hit in hits
]

# Sample contents of result_list from a previous run (bare string, never executed).
'''
[{'=index': 4674, 'distance': 47.85670471191406, 'item_city_cate': 544},
{'=index': 1936, 'distance': 48.49840545654297, 'item_city_cate': 774},
{'=index': 5095, 'distance': 48.49840545654297, 'item_city_cate': 774}]
'''