Python：LogitA-CSDN博客

本文链接：https://blog.csdn.net/DeniuHe/article/details/114920583
啥也不是

import numpy as np
import pandas as pd
import xlwt
from pathlib import Path
from copy import deepcopy

from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,mean_absolute_error,f1_score,recall_score

class LogistA():
    def __init__(self, X_pool, y_pool, labeled, budget, X_test, y_test):
        """Store the data, fit the initial pairwise models and reset the logs.

        Args:
            X_pool: Feature matrix of the sample pool; ``shape[1]`` is read
                as the number of attributes.
            y_pool: Labels of the pool samples.
            labeled: Pool indices that start out labeled (deep-copied into a
                list).
            budget: Number of queries allowed (deep-copied).
            X_test: Test features, kept for ``train_test``.
            y_test: Test labels, kept for ``train_test``.
        """
        # Pool and hold-out data.
        self.X_pool, self.y_pool = X_pool, y_pool
        self.X_test, self.y_test = X_test, y_test
        self.labeled = list(deepcopy(labeled))

        # Label bookkeeping: ``target`` holds every integer from the smallest
        # label up to, but excluding, the largest one.
        self.labels = np.unique(y_pool)
        first_label, last_label = self.labels[0], self.labels[-1]
        self.target = np.array(
            [int(value) for value in np.arange(first_label, last_label, 1)]
        )
        self.nClass = len(self.labels)
        self.nTar = self.nClass - 1  # the K in the reference R code
        self.nAtt = self.X_pool.shape[1]  # the p in the reference R code
        self.newY = 0

        # These three calls rely on the attributes assigned above, so their
        # position in the constructor matters.
        self.ocModel = self.init_logist_model()
        self.unlabeled = self.initialization()
        self.beta_mat = self.init_beta_mat()

        self.Kp = self.nAtt * self.nTar
        self.budgetLeft = deepcopy(budget)
        self.budget = deepcopy(budget)
        self.tar_idx = None
        # NOTE(review): LogisticAT is not imported in the visible part of the
        # file -- presumably it comes from the ``mord`` package; confirm.
        self.LATmodel = LogisticAT()

        # Evaluation-metric histories.
        for history in (
            "AccList", "MAEList", "RecallList", "FscoreList",
            "ALC_ACC", "ALC_MAE", "ALC_F1", "ALC_Recall",
        ):
            setattr(self, history, [])
        

    def initialization(self):
        unlabeled = [i for i in range(len(self.y_pool))]
        for j in self.labeled:
            unlabeled.remove(j)
        return unlabeled

    def init_logist_model(self):
        model_dict = OrderedDict()
        for tar in self.target:
            model_dict[tar] = LogisticRegression(solver='newton-cg', penalty='l2')
        return model_dict

    # def reconstruct_fit(self):
    #     print(self.target)
    #     for tar in self.target:
    #         tmp_train_ids = []
    #         for idx in self.labeled:
    #             if self.y_pool[idx] == tar or self.y_pool[idx] == tar+1:
    #                 tmp_train_ids.append(idx)
    #         self.ocModel[tar].fit(X=self.X_pool[tmp_train_ids],y=self.y_pool[tmp_train_ids])
    #         self.beta_mat[:,tar] = self.ocModel[tar].coef_[0]
    #     self.newY = 0

    def train_test(self):
        """Refit ``self.LATmodel`` on the labeled samples and log test metrics.

        The model is fitted on the pool rows listed in ``self.labeled``, used
        to predict ``self.X_test``, and the accuracy, mean absolute error,
        macro recall and macro F1 against ``self.y_test`` are appended to
        ``AccList``, ``MAEList``, ``RecallList`` and ``FscoreList``.
        """
        train_ids = self.labeled
        self.LATmodel.fit(X=self.X_pool[train_ids], y=self.y_pool[train_ids])
        predicted = self.LATmodel.predict(X=self.X_test)
        truth = self.y_test

        accuracy = accuracy_score(y_true=truth, y_pred=predicted)
        self.AccList.append(accuracy)
        abs_error = mean_absolute_error(y_true=truth, y_pred=predicted)
        self.MAEList.append(abs_error)
        recall = recall_score(y_true=truth, y_pred=predicted, average='macro')
        self.RecallList.append(recall)
        f_score = f1_score(y_true=truth, y_pred=predicted, average='macro')
        self.FscoreList.append(f_score)

    def init_beta_mat(self):
        beta_mat = np.zeros((self.nAtt,self.nClass-1),dtype=float)
        for tar in self.target:
            tmp_train_ids = []
            for idx in self.labeled:
                if self.y_pool[idx] == tar or self.y_pool[idx] == tar + 1:
                    tmp_train_ids.append(idx)
            self.ocModel[tar].fit(X=self.X_pool[tmp_train_ids], y=self.y_pool[tmp_train_ids])
            print("tar=",tar)
            beta_mat[:, tar] = self.ocModel[tar].coef_[0]
        return beta_mat

    def update_beta_mat0(self,newY):
        if newY == 0 or newY == self.target[-1]:
            # print("newY==",newY)
            tmp_train_ids = []
            for idx in self.labeled:
                if self.y_pool[idx] == newY or self.y_pool[idx] == newY+1:
                    tmp_train_ids.append(idx)
            model = LogisticRegression(solver='newton-cg', penalty='l2')
            model.fit(X=self.X_pool[tmp_train_ids], y=self.y_pool[tmp_train_ids])
            self.beta_mat[:,newY] = model.coef_[0]
            # print(model.coef_[0])

        else:
            # print("newY==",newY)
            for tar in [newY,newY+1]:
                tmp_train_ids = []
                for idx in self.labeled:
                    if self.y_pool[idx] == tar or self.y_pool[idx] == tar + 1:
                        tmp_train_ids.append(idx)
                model = LogisticRegression(solver='newton-cg', penalty='l2')
                model.fit(X=self.X_pool[tmp_train_ids], y=self.y_pool[tmp_train_ids])
                self.beta_mat[:, tar] = model.coef_[0]
                # print(model.coef_[0])

    def update_beta_mat(self):
        if self.tar_idx == None:
            pass
        else:
            if self.y_pool[self.tar_idx] == 0:
                tmp_train_ids = []
                for idx in self.labeled:
                    if self.y_pool[idx] == 0 or self.y_pool[idx] == 1:
                        tmp_train_ids.append(idx)
                model = LogisticRegression(solver='newton-cg', penalty='l2')
                model.fit(X=self.X_pool[tmp_train_ids], y=self.y_pool[tmp_train_ids])
                self.beta_mat[:, 0] = model.coef_[0]
            elif self.y_pool[self.tar_idx] == self.labels[-1]:
                tmp_train_ids = []
                end_l = self.labels[-1]
                end_ll = end_l-1
                for idx in self.labeled:
                    if self.y_pool[idx] == end_l or self.y_pool[idx] == end_ll:
                        tmp_train_ids.append(idx)
                model = LogisticRegression(solver='newton-cg', penalty='l2')
                model.fit(X=self.X_pool[tmp_train_ids], y=self.y_pool[tmp_train_ids])
                self.beta_mat[:, self.target[-1]] = model.coef_[0]
            else:
                for tar in [self.y_pool[self.tar_idx]-1,self.y_pool[self.tar_idx]]:
                    tmp_train_ids = []
                    for idx in self.labeled:
                        if self.y_pool[idx] == tar or self.y_pool[idx] == tar + 1:
                            tmp_train_ids.append(idx)
                    model = LogisticRegression(solver='newton-cg', penalty='l2')
                    model.fit(X=self.X_pool[tmp_train_ids], y=self.y_pool[tmp_train_ids])
                    print("tar::",tar)
                    self.beta_mat[:, int(tar)] = model.coef_[0]

    def getWH(self):
        # N = len(self.labeled)
        X = self.X_pool[self.labeled]
        W = np.zeros((self.Kp,self.Kp))
        H = np.zeros((self.Kp,self.Kp))
        predictor = X @ self.beta_mat
        print("predictor=", predictor.shape)
        # print(predictor)
        ## 计算分子的子过程
        tmp = np.exp(predictor)
        print("tmp=",tmp.shape)
        ## 计算theta
        theta = tmp/(1+tmp)
        print("theta=",theta.shape)
        ## 计算分母的子过程
        accu_tmp = np.exp(np.cumsum(predictor,1))
        phi = accu_tmp / (1+np.sum(accu_tmp,1)).reshape(-1,1)

        for i, idx in enumerate(self.labeled):
            x = self.X_pool[idx].reshape(-1,1)
            xt = x.T
            gram = x @ xt
            print('gram::',gram.shape)
            w = np.zeros((self.nClass-1,self.nClass-1))
            print("w::",w.shape)
            for k in range(self.nClass-2):
                print("k==",k)
                print("i==",i)
                w[k,k+1] = -phi[i,k] * (1 - theta[i,k]) * theta[i,k+1]
            w = w.T + w
            # print("w::")
            # print(w)
            h = np.diag(phi[i] % (1-theta[i]))
            print("h==",h.shape)
            for k in range(self.nClass-1):
                w[k,k] = h[k,k]
            print("w==",w.shape)
            W += np.kron(w,gram)
            H += np.kron(h,gram)
            print("W::",W.shape)
            print("H::",H.shape)
        # inv_sigma = H @ np.linalg.inv(W) @ H
        # print("inv_sigma==",inv_sigma.shape)
        # eigenvalue, featurevector = np.linalg.eig(inv_sigma)
        # print("eigenvalue::",eigenvalue)
        return W

    def A_optimal_ord(self,W):
        trace_value = OrderedDict()
        for idx in self.unlabeled:
            x = self.X_pool[idx]
            predictor = x @ self.beta_mat
            print("predictor::",predictor.shape)
            print("predictor==",predictor)
            tmp = np.exp(predictor)
            print("tmp::",tmp.shape)
            print("tmp==",tmp)
            theta = tmp / (1 + tmp)
            print("theta::",theta.shape)
            print("theta==",theta)
            accu_tmp = np.exp(np.cumsum(predictor))
            print("accu_tmp::",accu_tmp.shape)
            print("accu_tmp==",accu_tmp)
            phi = accu_tmp / (1+ np.sum(accu_tmp))
            print("phi::",phi.shape)
            print("phi==",phi)

            print("x=",x)
            x = self.X_pool[idx].reshape(-1, 1)
            xt = x.T
            gram = x @ xt
            print("gram==",gram)
            print(x.shape)
            print(self.beta_mat.shape)
            w = np.zeros((self.nClass - 1, self.nClass - 1))
            for k in range(self.nClass - 2):
                w[k,k+1] = -phi[k] * (1-theta[k]) * theta[k+1]
            w = w.T + w
            for k in range(self.nClass-1):
                w[k,k] = phi[k] % (1-theta[k])
            print("w==",w)

            trace_value[idx] = np.trace(np.linalg.inv(W + np.kron(w,gram)))
            print("trace_value[idx]===",trace_value[idx])
        self.tar_idx = min(trace_value,key=trace_value.get)
        print("tar_idx==",self.tar_idx)

    def select(self):
        while self.budgetLeft > 0:
            self.update_beta_mat()
            W = self.getWH()
            self.A_optimal_ord(W=W)
            print("判断是否在已标记样本：：",self.tar_idx in self.labeled)
            self.labeled.append(self.tar_idx)
            self.unlabeled.remove(self.tar_idx)
            self.budgetLeft -= 1