理论来源:
(2条消息) 灰色聚类评价模型_卖山楂啦prss的博客-CSDN博客_灰色聚类
帖子里的理论已经很完整了
我的代码就是根据该理论完成的,代码结果与帖子里也一样,只不过我本人又添加了几条测试数据
Python环境:Python 3.6.6
IDE:pycharm 2020.2.1社区版
代码:
import argparse
import numpy as np
import pandas as pd
"""
灰色聚类模型 GrayClusteringEvaluation
参考:https://blog.csdn.net/qq_42374697/article/details/106598075
算法目的:用于检测观测对象属于何类
算法入口: gray_clustering_evaluation.py
因素: n条记录,m个指标,k个分类
输入数据数量:2
输入数据1:nxm矩阵,原始数据,每条记录的各指标值
input_data.csv
输入数据2:
各个指标的白化权函数矩阵:mxk的矩阵
规则:csv格式,每行是某个指标的分类临界值数据,每列是某个分类的各指标针对该分类的临界值
某指标在每个分类的各临界值以英文冒号分割,通过判断不同位置是否为空来决定选取哪个白化权函数
如 a:b::c 选取三角形函数
a:b:: 选取右值梯形
::c:d选取左值梯形
a:b:c:d选取梯形
算法输出:nxk矩阵: 聚类系数矩阵
第i行即为第i个记录的聚类系数向量,向量里的最大值所在的分类即为第i个记录的归属分类
输出样例文件:result.csv
"""
def data_save(data, save_file='result.csv'):
    """Write ``data`` to ``save_file`` as a CSV file.

    The file is written with the ``utf_8_sig`` encoding (UTF-8 with a
    byte-order mark) and includes the column header row.

    Args:
        data: Object exposing a pandas-style ``to_csv`` method.
        save_file: Destination path; defaults to ``result.csv``.

    Returns:
        None.
    """
    data.to_csv(save_file, header=True, encoding='utf_8_sig')
def toFloat(x):
    """Convert ``x`` to ``float``, or return ``None`` if it cannot be converted.

    Only the exceptions ``float()`` raises for unconvertible input are
    caught, so interrupts such as ``KeyboardInterrupt`` are no longer
    swallowed.

    Args:
        x: Value to convert (typically a string segment such as ``"30"``).

    Returns:
        The converted ``float``, or ``None`` for empty strings, non-numeric
        text, ``None`` and other unconvertible values.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None
class WeightFunction:  # whitenization weight function (possibility function)
    """Piecewise-linear whitenization weight function parsed from ``a:b:c:d``.

    The four colon-separated positions are the turning points of the
    function.  Which positions are left empty selects the shape:

    * ``a:b:c:d`` -> type 1, trapezoid
    * ``::c:d``   -> type 2, left trapezoid (1 up to ``c``, falling to 0 at ``d``)
    * ``a:b::``   -> type 3, right trapezoid (0 up to ``a``, rising to 1 at ``b``)
    * ``a:b::d``  -> type 4, triangle peaking at ``b``

    Attributes:
        str: The raw specification passed to the constructor.
        level: The four turning points as floats; empty positions are
            ``None``.  Left empty when the specification cannot be parsed.
        type: Shape code 1-4 as listed above, or 0 when not valid.
        valid: ``True`` only when the specification parsed and its turning
            points are in non-decreasing order.

    Errors are reported by printing a message; evaluation methods return
    ``-1`` when the object is invalid or of the wrong type.
    """

    # Maps "which of the four positions are filled" to
    # (type code, indices of the turning points that must be non-decreasing).
    _SHAPES = {
        (True, True, True, True): (1, (0, 1, 2, 3)),
        (False, False, True, True): (2, (2, 3)),
        (True, True, False, False): (3, (0, 1)),
        (True, True, False, True): (4, (0, 1, 3)),
    }

    def __init__(self, str):
        # NOTE: the parameter keeps its original name for caller
        # compatibility; it shadows the builtin only inside this method.
        self.str = str  # raw "a:b:c:d" specification
        self.level = []  # turning points
        self.type = 0  # 1 trapezoid, 2 left trapezoid, 3 right trapezoid, 4 triangle
        self.valid = False  # set by parse() once format and ordering are checked
        self.parse()

    def parse(self):
        """Parse ``self.str`` and fill ``level``, ``type`` and ``valid``.

        On any format or ordering problem a message is printed and the
        object is left with ``valid == False``.
        """
        if self.str is None:
            print("WeightFunction str cannot ne None!")
            return
        if not isinstance(self.str, str):
            # Non-string input (e.g. the float NaN pandas yields for an
            # empty CSV cell) has no split(); reject it instead of crashing.
            print("WeightFunction format error:", self.str)
            return
        segs = [seg.strip() for seg in self.str.split(":")]  # ASCII colon
        if len(segs) < 4:
            print("WeightFunction format error:", self.str)
            return
        segs = segs[:4]

        levels = []
        for seg in segs:
            if not seg:
                levels.append(None)
                continue
            try:
                levels.append(float(seg))
            except ValueError:
                # A filled but non-numeric position would otherwise become
                # None and break the ordering comparisons below.
                print("WeightFunction format error:", self.str)
                return
        self.level = levels

        shape = self._SHAPES.get(tuple(bool(seg) for seg in segs))
        if shape is None:
            print("WeightFunction str content error:", self.str)
            return
        kind, used = shape
        points = [levels[i] for i in used]
        if any(lo > hi for lo, hi in zip(points, points[1:])):
            print("WeightFunction value effor:后面的值不能小于前面的!", self.str)
            return
        self.type = kind
        self.valid = True

    def calucate_trapezium(self, x):  # trapezoid, type 1
        """Evaluate the trapezoid at ``x``; returns ``-1`` if type is not 1."""
        if self.type != 1:
            print("calucate_trapezium,type error:白化权函数的临界值需要是梯形类型")
            return -1
        a, b, c, d = self.level
        if x <= a or x >= d:
            return 0
        if x <= b:
            # here a < x <= b, so b - a is non-zero
            return (x - a) / (b - a)
        if x <= c:
            return 1
        # here c < x < d, so d - c is non-zero
        return (d - x) / (d - c)

    def calucate_trapezium_left(self, x):  # left trapezoid, type 2
        """Evaluate the left trapezoid at ``x``; returns ``-1`` if type is not 2."""
        if self.type != 2:
            print("calucate_trapezium_left,type error:白化权函数的临界值需要是左值梯形类型")
            return -1
        c, d = self.level[2], self.level[3]
        if x <= c:
            return 1
        if x >= d:
            return 0
        return (d - x) / (d - c)

    def calucate_trapezium_right(self, x):  # right trapezoid, type 3
        """Evaluate the right trapezoid at ``x``; returns ``-1`` if type is not 3."""
        if self.type != 3:
            print("calucate_trapezium_right,type error:白化权函数的临界值需要是右值梯形类型")
            return -1
        a, b = self.level[0], self.level[1]
        if x <= a:
            return 0
        if x >= b:
            return 1
        return (x - a) / (b - a)

    def calucate_triangle(self, x):  # triangle, type 4
        """Evaluate the triangle at ``x``; returns ``-1`` if type is not 4."""
        if self.type != 4:
            print("calucate_triangle,type error:白化权函数的临界值需要是三角形类型")
            return -1
        a, b, d = self.level[0], self.level[1], self.level[3]
        if x <= a or x >= d:
            return 0
        if x <= b:
            return (x - a) / (b - a)
        return (d - x) / (d - b)

    def calucate(self, x):
        """Evaluate the function at ``x`` using the shape chosen by ``parse``.

        Returns:
            The function value (0, 1, or a value in between), or ``-1``
            when the object is invalid or its type is unknown.
        """
        if self.valid is not True:
            print("calucate,data error:白化权函数矩阵不合法!")
            return -1
        if self.type == 1:
            return self.calucate_trapezium(x)
        if self.type == 2:
            return self.calucate_trapezium_left(x)
        if self.type == 3:
            return self.calucate_trapezium_right(x)
        if self.type == 4:
            return self.calucate_triangle(x)
        print("calucate,data error:临界值类型未知:", self.type)
        return -1

    def get_criticle_value(self):  # critical value of the function
        """Return the critical value used to derive the clustering weights.

        Type 1 yields the midpoint of the plateau ``(b + c) / 2``, type 2
        yields ``c``, types 3 and 4 yield ``b``.  Returns ``-1`` when the
        object is invalid or its type is unknown.
        """
        if self.valid is not True:
            print("get_criticle_value,data error:白化权函数矩阵不合法!")
            return -1
        if self.type == 1:
            return (self.level[1] + self.level[2]) / 2
        if self.type == 2:
            return self.level[2]
        if self.type == 3:
            return self.level[1]
        if self.type == 4:
            return self.level[1]
        print("get_criticle_value,data error:临界值类型未知:", self.type)
        return -1
if __name__ == '__main__':
    # Script entry point: read the records and the weight-function matrix,
    # compute the grey clustering coefficient of every record for every
    # class, and save the n x k coefficient matrix as CSV.
    import sys

    parser = argparse.ArgumentParser()
    # The former hard-coded "just for test" paths are now the defaults, so
    # running without arguments behaves as before while the command-line
    # options actually take effect.
    parser.add_argument('--input_data', help='level_of_each_rule_data', default='./input_data.csv', type=str)
    parser.add_argument('--function_matrix', help='rule_data_of_each_schema', default='./function_matrix.csv', type=str)
    parser.add_argument('--output_file', help='output data', default='result.csv', type=str)
    args = parser.parse_args()
    input_data = args.input_data
    function_matrix = args.function_matrix
    output_file = args.output_file
    print("input_data=", input_data)
    print("function_matrix=", function_matrix)
    print("output_file=", output_file)
    if not input_data or not function_matrix:
        print("error:input_data or function_matrix can not be empty!")
        sys.exit(1)

    print("=================== schema_data ===============================")
    # n records, m indicators, k classes
    schema_data = pd.read_csv(input_data)  # record data, n x m
    schema_count = schema_data.shape[0]  # number of records n
    rules_count = schema_data.shape[1]  # number of indicators m
    input_data_matrix = np.array(schema_data)
    print("schema_data_rows_count==", schema_count)
    print("rules_count==", rules_count)
    print("schema_data==\n", schema_data)
    print("input_data_matrix==\n", input_data_matrix)

    print("=================== function_matirx ===============================")
    function_matrix_data = pd.read_csv(function_matrix)  # m x k
    function_data_rows_count = function_matrix_data.shape[0]  # m
    kind_count = function_matrix_data.shape[1]  # k
    print("function_data_rows_count==", function_data_rows_count)
    print("function_data_cols_count==", kind_count)
    print("function_matrix_data==", function_matrix_data)
    if rules_count != function_data_rows_count:
        print("error:数据集与白化权函数矩阵里的指标数量不一致!(input_data,function_matrix)=", rules_count, function_data_rows_count)
        sys.exit(1)

    # Weight function F_jk of indicator j for class k, plus its critical value.
    function_obj_matrix = []
    criticle_value = []
    for j in range(rules_count):
        cur_function_obj_list = []
        criticle_value_j = []
        for k in range(kind_count):
            func_obj = WeightFunction(function_matrix_data.iloc[j, k])
            if not func_obj.valid:
                # An invalid function would otherwise feed the -1 sentinel
                # into the weights and coefficients without any failure.
                print("error:invalid weight function at (row,column)=", j, k)
                sys.exit(1)
            cur_function_obj_list.append(func_obj)
            criticle_value_j.append(func_obj.get_criticle_value())
        function_obj_matrix.append(cur_function_obj_list)
        criticle_value.append(criticle_value_j)
    criticle_value_matrix = np.array(criticle_value)  # critical values, m x k
    print("criticle_value_matrix==\n", criticle_value_matrix)

    # Weight eta_jk of indicator j for class k: each critical value divided
    # by the sum of its column (broadcast over all classes at once).
    yita_matirx = criticle_value_matrix / criticle_value_matrix.sum(axis=0)
    print("X==\n", yita_matirx)

    # Clustering coefficient delta_ik of record i for class k:
    # sum over indicators j of F_jk(x_ij) * eta_jk.
    delta_ik = []
    for i in range(schema_count):
        res_i = []
        for k in range(kind_count):
            sum_v = 0
            for j in range(rules_count):
                f_jk = function_obj_matrix[j][k]
                sum_v = sum_v + f_jk.calucate(input_data_matrix[i][j]) * yita_matirx[j][k]
            res_i.append(sum_v)
        delta_ik.append(res_i)
    print("delta_ik=", delta_ik)

    w = pd.DataFrame(np.array(delta_ik))
    w.columns = function_matrix_data.columns
    print("w==", w)
    data_save(w, save_file=output_file)
输入参数文件1:function_matrix.csv
(如果要问这个矩阵怎么来的,是专家定的,专家也可以是你自己,哈)
k1,k2,k3
30:80::,10:40::70,::10:30
30:90::,20:50::90,::20:40
40:100::,30:60::90,::30:50
输入参数文件2:input_data.csv
r1,r2,r3
80,20,100
40,30,30
10,90,60
50,80,70
50,80,10
10,10,100
200,0,0
0,0,0
结果文件:result.csv
,k1,k2,k3
0,0.6666666666666666,0.0,0.3333333333333333
1,0.05925925925925926,0.37777777777777777,0.6666666666666666
2,0.4567901234567901,0.4,0.16666666666666666
3,0.5814814814814815,0.5277777777777777,0.0
4,0.3962962962962963,0.26111111111111107,0.5
5,0.37037037037037035,0.0,0.5
6,0.2962962962962963,0.0,0.8333333333333333
7,0.0,0.0,1.0