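# NOTE(review): stray header text, originally "啥也不是" ("nothing at all"); kept as a comment so the module can be imported.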
import numpy as np
import pandas as pd
import xlwt
from pathlib import Path
from copy import deepcopy
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,mean_absolute_error,f1_score,recall_score
class LogistA():
    """Pool-based active learner driven by an A-optimality style criterion.

    For every pair of adjacent classes ``(tar, tar + 1)`` a binary
    ``LogisticRegression`` is fitted on the labeled samples of those two
    classes; its coefficient vector becomes column ``tar`` of ``beta_mat``.
    ``getWH`` accumulates a ``Kp x Kp`` matrix ``W`` over the labeled samples
    and ``A_optimal_ord`` queries the unlabeled sample that minimises
    ``trace(inv(W + contribution_of_that_sample))``.

    Label values are used directly as column indices into ``beta_mat``, so
    the labels are assumed to be the consecutive integers
    ``0 .. nClass - 1`` -- TODO confirm against the data loaders.

    NOTE(review): the many ``print`` calls are debugging traces kept verbatim;
    consider demoting them to ``logging`` at DEBUG level.
    """

    def __init__(self, X_pool, y_pool, labeled, budget, X_test, y_test):
        """Store the data, fit the initial pairwise models and set up state.

        Args:
            X_pool: 2-D feature array of the pool (indexed by sample id).
            y_pool: label array aligned with ``X_pool``.
            labeled: iterable of pool indices that are labeled initially.
            budget: number of queries ``select`` is allowed to make.
            X_test: feature array used by ``train_test``.
            y_test: label array used by ``train_test``.
        """
        self.X_pool = X_pool
        self.y_pool = y_pool
        self.X_test = X_test
        self.y_test = y_test
        self.labeled = list(deepcopy(labeled))
        self.labels = np.unique(y_pool)
        # One target per adjacent class pair: labels[0] .. labels[-1] - 1.
        self.target = np.array([int(_) for _ in np.arange(self.labels[0], self.labels[-1], 1)])
        self.nClass = len(self.labels)
        self.nTar = self.nClass - 1  # the K in the reference R code
        self.nAtt = self.X_pool.shape[1]  # the p in the reference R code
        self.newY = 0
        self.ocModel = self.init_logist_model()
        self.unlabeled = self.initialization()
        self.beta_mat = self.init_beta_mat()
        self.Kp = self.nAtt * (self.nClass - 1)
        self.budgetLeft = deepcopy(budget)
        self.budget = deepcopy(budget)
        self.tar_idx = None  # pool index chosen by the latest query, if any
        # NOTE(review): ``LogisticAT`` is not imported in the visible part of
        # this file (presumably ``mord.LogisticAT``) -- confirm the import.
        self.LATmodel = LogisticAT()
        # Evaluation-metric histories (one entry per ``train_test`` call).
        self.AccList = []
        self.MAEList = []
        self.RecallList = []
        self.FscoreList = []
        # Not populated by any method visible in this class.
        self.ALC_ACC = []
        self.ALC_MAE = []
        self.ALC_F1 = []
        self.ALC_Recall = []

    def initialization(self):
        """Return the pool indices that are not in ``self.labeled``, ascending."""
        # A set lookup avoids one O(n) ``list.remove`` per labeled index.
        taken = set(self.labeled)
        return [i for i in range(len(self.y_pool)) if i not in taken]

    def init_logist_model(self):
        """Create one unfitted ``LogisticRegression`` per entry of ``self.target``."""
        model_dict = OrderedDict()
        for tar in self.target:
            model_dict[tar] = LogisticRegression(solver='newton-cg', penalty='l2')
        return model_dict

    def train_test(self):
        """Fit ``LATmodel`` on the labeled set and append test-set metrics."""
        self.LATmodel.fit(X=self.X_pool[self.labeled], y=self.y_pool[self.labeled])
        y_pred = self.LATmodel.predict(X=self.X_test)
        self.AccList.append(accuracy_score(y_pred=y_pred, y_true=self.y_test))
        self.MAEList.append(mean_absolute_error(y_pred=y_pred, y_true=self.y_test))
        self.RecallList.append(recall_score(y_pred=y_pred, y_true=self.y_test, average='macro'))
        self.FscoreList.append(f1_score(y_pred=y_pred, y_true=self.y_test, average='macro'))

    def _fit_pair_coef(self, low, model=None):
        """Fit a binary model on labeled samples of class ``low`` / ``low + 1``.

        Args:
            low: the lower label of the adjacent class pair.
            model: estimator to fit; a fresh ``LogisticRegression`` with the
                same settings as ``init_logist_model`` is used when omitted.

        Returns:
            ``coef_[0]`` of the fitted model.
        """
        pair_ids = [
            idx for idx in self.labeled
            if self.y_pool[idx] == low or self.y_pool[idx] == low + 1
        ]
        if model is None:
            model = LogisticRegression(solver='newton-cg', penalty='l2')
        model.fit(X=self.X_pool[pair_ids], y=self.y_pool[pair_ids])
        return model.coef_[0]

    def init_beta_mat(self):
        """Fit every model in ``self.ocModel`` and return the coefficient matrix.

        Returns:
            Array of shape ``(nAtt, nClass - 1)`` whose column ``tar`` holds
            the coefficients of the ``(tar, tar + 1)`` model.
        """
        beta_mat = np.zeros((self.nAtt, self.nClass - 1), dtype=float)
        for tar in self.target:
            coef = self._fit_pair_coef(tar, model=self.ocModel[tar])
            print("tar=",tar)
            beta_mat[:, tar] = coef
        return beta_mat

    def update_beta_mat0(self, newY):
        """Legacy refit keyed on ``newY``; not called by any method visible here.

        Refits column ``newY`` only when ``newY`` is 0 or the last target,
        otherwise columns ``newY`` and ``newY + 1``.

        NOTE(review): this column choice differs from ``update_beta_mat``
        (which refits ``y - 1`` and ``y``), and for ``newY`` equal to the
        largest label the index would fall outside ``beta_mat`` if labels are
        ``0 .. nClass - 1`` -- confirm before reviving this method.
        """
        if newY == 0 or newY == self.target[-1]:
            pair_lows = [newY]
        else:
            pair_lows = [newY, newY + 1]
        for tar in pair_lows:
            self.beta_mat[:, tar] = self._fit_pair_coef(tar)

    def update_beta_mat(self):
        """Refit the pairwise models affected by the most recently queried sample.

        Does nothing before the first query (``tar_idx is None``). Otherwise
        the label ``y`` of ``tar_idx`` decides which columns are refreshed:
        column 0 for ``y == 0``, the last column for the largest label, and
        columns ``y - 1`` and ``y`` for every other label.
        """
        if self.tar_idx is None:
            return
        new_label = self.y_pool[self.tar_idx]
        if new_label == 0:
            self.beta_mat[:, 0] = self._fit_pair_coef(0)
        elif new_label == self.labels[-1]:
            # Only the (largest - 1, largest) pair contains the top label.
            self.beta_mat[:, self.target[-1]] = self._fit_pair_coef(self.labels[-1] - 1)
        else:
            for tar in [new_label - 1, new_label]:
                coef = self._fit_pair_coef(tar)
                print("tar::",tar)
                # int(): the label may be a NumPy float, which cannot index.
                self.beta_mat[:, int(tar)] = coef

    def getWH(self):
        """Accumulate the ``Kp x Kp`` matrix ``W`` over all labeled samples.

        For each labeled sample, with ``p = x @ beta_mat``:
        ``theta = exp(p) / (1 + exp(p))`` and
        ``phi = exp(cumsum(p)) / (1 + sum(exp(cumsum(p))))``.
        A ``(nClass-1) x (nClass-1)`` block ``w`` is built with diagonal
        ``phi[k] * (1 - theta[k])`` and first off-diagonals
        ``-phi[k] * (1 - theta[k]) * theta[k + 1]``; ``kron(w, x x^T)`` is
        added to ``W``.

        Returns:
            The accumulated matrix ``W``.
        """
        n_tar = self.nClass - 1
        X = self.X_pool[self.labeled]
        W = np.zeros((self.Kp, self.Kp))
        # NOTE(review): H is accumulated but never returned; earlier
        # commented-out code formed H @ inv(W) @ H from it.
        H = np.zeros((self.Kp, self.Kp))
        predictor = X @ self.beta_mat
        print("predictor=", predictor.shape)
        # Numerator of theta.
        tmp = np.exp(predictor)
        print("tmp=",tmp.shape)
        theta = tmp / (1 + tmp)
        print("theta=",theta.shape)
        # Row-wise cumulative sums feed numerator and denominator of phi.
        accu_tmp = np.exp(np.cumsum(predictor, 1))
        phi = accu_tmp / (1 + np.sum(accu_tmp, 1)).reshape(-1, 1)
        for i, idx in enumerate(self.labeled):
            x = self.X_pool[idx].reshape(-1, 1)
            gram = x @ x.T
            print('gram::',gram.shape)
            w = np.zeros((n_tar, n_tar))
            print("w::",w.shape)
            for k in range(n_tar - 1):
                print("k==",k)
                print("i==",i)
                w[k, k + 1] = -phi[i, k] * (1 - theta[i, k]) * theta[i, k + 1]
            w = w.T + w
            # Elementwise product. The original used `%` (float modulo),
            # which zeroes or distorts the diagonal and can make W singular.
            # NOTE(review): confirm against the reference R code.
            h = np.diag(phi[i] * (1 - theta[i]))
            print("h==",h.shape)
            np.fill_diagonal(w, np.diag(h))
            print("w==",w.shape)
            W += np.kron(w, gram)
            H += np.kron(h, gram)
        print("W::",W.shape)
        print("H::",H.shape)
        return W

    def A_optimal_ord(self, W):
        """Pick the unlabeled sample minimising ``trace(inv(W + kron(w, x x^T)))``.

        ``w`` is the per-sample block described in ``getWH``, evaluated for
        the candidate ``x``. The winning pool index is stored in
        ``self.tar_idx``; ties go to the first candidate in ``self.unlabeled``.

        Args:
            W: matrix returned by ``getWH``.

        Raises:
            ValueError: if ``self.unlabeled`` is empty.
            numpy.linalg.LinAlgError: if a candidate matrix is singular.
        """
        n_tar = self.nClass - 1
        trace_value = OrderedDict()
        for idx in self.unlabeled:
            x = self.X_pool[idx]
            predictor = x @ self.beta_mat
            print("predictor::",predictor.shape)
            print("predictor==",predictor)
            tmp = np.exp(predictor)
            print("tmp::",tmp.shape)
            print("tmp==",tmp)
            theta = tmp / (1 + tmp)
            print("theta::",theta.shape)
            print("theta==",theta)
            accu_tmp = np.exp(np.cumsum(predictor))
            print("accu_tmp::",accu_tmp.shape)
            print("accu_tmp==",accu_tmp)
            phi = accu_tmp / (1 + np.sum(accu_tmp))
            print("phi::",phi.shape)
            print("phi==",phi)
            print("x=",x)
            x = self.X_pool[idx].reshape(-1, 1)
            gram = x @ x.T
            print("gram==",gram)
            print(x.shape)
            print(self.beta_mat.shape)
            w = np.zeros((n_tar, n_tar))
            for k in range(n_tar - 1):
                w[k, k + 1] = -phi[k] * (1 - theta[k]) * theta[k + 1]
            w = w.T + w
            # Product, not modulo -- same correction as in ``getWH``.
            np.fill_diagonal(w, phi * (1 - theta))
            print("w==",w)
            trace_value[idx] = np.trace(np.linalg.inv(W + np.kron(w, gram)))
            print("trace_value[idx]===",trace_value[idx])
        self.tar_idx = min(trace_value, key=trace_value.get)
        print("tar_idx==",self.tar_idx)

    def select(self):
        """Query samples until the budget is spent or the pool is exhausted.

        Each round refits the affected pairwise models, rebuilds ``W``,
        chooses ``tar_idx`` and moves it from ``unlabeled`` to ``labeled``.
        """
        # The pool check avoids ``min()`` on an empty candidate set when the
        # budget is larger than the number of unlabeled samples.
        while self.budgetLeft > 0 and self.unlabeled:
            self.update_beta_mat()
            W = self.getWH()
            self.A_optimal_ord(W=W)
            print("判断是否在已标记样本::",self.tar_idx in self.labeled)
            self.labeled.append(self.tar_idx)
            self.unlabeled.remove(self.tar_idx)
            self.budgetLeft -= 1