Naive Bayes Code (Python)

Naive Bayes

Here we use naive Bayes with discrete features: x1 and x2 are the two components of the feature vector, and Y is the class label.

| i  | 1  | 2  | 3 | 4 | 5  | 6  | 7  | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|----|----|----|---|---|----|----|----|---|---|----|----|----|----|----|----|
| x1 | 1  | 1  | 1 | 1 | 1  | 2  | 2  | 2 | 2 | 2  | 3  | 3  | 3  | 3  | 3  |
| x2 | S  | M  | M | S | S  | S  | M  | M | L | L  | L  | M  | M  | L  | L  |
| Y  | -1 | -1 | 1 | 1 | -1 | -1 | -1 | 1 | 1 | 1  | 1  | 1  | 1  | 1  | -1 |

The step-by-step hand computation can be found in the reference blog at the end of this post.

The idea:

1. Create the dataset.
2. Compute the probability of each class:
   2.1 Compute p(y = -1) and p(y = 1), i.e., the prior probability of each class.
   2.2 For an input feature vector such as x = (2, 'S'), compute p(x | y) = ∏ p(xi | y), multiplying the per-feature conditional probabilities (the naive independence assumption).
3. Predict: for the given feature vector, form the product from step 2 for each class; the class with the larger product wins (worked through below).
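
For example, for x = (2, 'S'), counting rows in the table above gives:

```latex
P(Y{=}1)\,P(x_1{=}2 \mid Y{=}1)\,P(x_2{=}S \mid Y{=}1)
    = \tfrac{9}{15} \cdot \tfrac{3}{9} \cdot \tfrac{1}{9}
    = \tfrac{1}{45} \approx 0.022

P(Y{=}{-}1)\,P(x_1{=}2 \mid Y{=}{-}1)\,P(x_2{=}S \mid Y{=}{-}1)
    = \tfrac{6}{15} \cdot \tfrac{2}{6} \cdot \tfrac{3}{6}
    = \tfrac{1}{15} \approx 0.067
```

Since 1/15 > 1/45, the predicted class is Y = -1.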

Code:

def createDataSet():
    dataSet = [[1, 'S', -1],
               [1, 'M', -1],
               [1, 'M', 1],
               [1, 'S', 1],
               [1, 'S', -1],
               [2, 'S', -1],
               [2, 'M', -1],
               [2, 'M', 1],
               [2, 'L', 1],
               [2, 'L', 1],
               [3, 'L', 1],
               [3, 'M', 1],
               [3, 'M', 1],
               [3, 'L', 1],
               [3, 'L', -1]]
    labels = ['x1', 'x2', 'y']
    return dataSet, labels

# Count how many labels in typeList equal t
def typeCount(typeList, t):
    cnt = 0
    for tL in typeList:
        if tL == t:
            cnt += 1
    return cnt

# Count samples where feature i equals feat, restricted to rows with label y (Y = -1 or 1)
def featCount(dataSet, i, feat, y):
    cnt = 0
    for row in dataSet:
        if row[i] == feat and row[-1] == y:
            cnt += 1
    return cnt

def calcBayes(dataSet):
    # Use x = (2, 'S') as the query example
    X = [2, 'S']

    lenDataSet = len(dataSet)
    typeList = [row[-1] for row in dataSet]
    typeSet = set(typeList)  # set of class labels
    print(typeList, typeSet)
    # Iterate over the classes: t = 1 and t = -1
    pList = []  # estimated score for each class
    for t in typeSet:
        yNum = typeCount(typeList, t)  # number of samples with label t
        print(f'{t} num =', yNum)
        py = yNum / lenDataSet
        print(f'P(Y = {t}) =', py)
        pSum = py  # running product: p(y) * prod over i of p(xi | y)
        # count each feature component conditioned on the class
        for i in range(len(X)):
            xiNum = featCount(dataSet, i, X[i], t)  # count of Xi = X[i] given Y = t
            print(f'feature {X[i]} num =', xiNum)
            # conditional probability P(Xi = xi | Y = t)
            pxy = xiNum / yNum
            print('conditional probability =', pxy)
            pSum *= pxy
        pList.append(pSum)
    return pList, typeSet

# Find the largest probability and report the corresponding class
def predict(pList, typeList):
    for i in range(len(pList)):
        if pList[i] == max(pList):
            print('*' * 50)
            print(f'predicted class = {typeList[i]}')
            
if __name__ == '__main__':
    dataSet, labels = createDataSet()
    pList, typeSet = calcBayes(dataSet)
    # list(typeSet) iterates the same set object in the same order calcBayes did,
    # so the entries of pList line up with the class labels
    predict(pList, list(typeSet))
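
For x = (2, 'S') this prints 1/15 ≈ 0.0667 for class -1 and 1/45 ≈ 0.0222 for class 1, so the predicted class is -1, matching the hand computation above. One caveat: pxy = xiNum / yNum is 0 whenever a feature value never co-occurs with a class in the training set, which zeroes out the entire product. The standard fix is Laplace (add-lambda) smoothing. Here is a minimal sketch reusing featCount from above; the helper and its lam parameter are my own addition, not part of the original code:

```python
def smoothedCondProb(dataSet, i, feat, y, yNum, lam=1):
    """P(Xi = feat | Y = y) with add-lambda (Laplace) smoothing."""
    Si = len(set(row[i] for row in dataSet))  # number of distinct values of feature i
    return (featCount(dataSet, i, feat, y) + lam) / (yNum + lam * Si)
```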

Reference blog

I may follow up with posts on other Bayes variants and on handling continuous feature vectors (a small preview of the continuous case follows); consider this a placeholder for now~
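
Until that follow-up materializes, here is a minimal sketch of the standard way to handle continuous features, Gaussian naive Bayes: each p(xi | y) is modeled as a normal density with the class-conditional mean and variance. All function names here are my own illustration, not from the original post:

```python
import math

def gaussian_pdf(x, mean, var):
    """Density of N(mean, var) evaluated at x."""
    return math.exp(-(x - mean) ** 2 / (2 * var)) / math.sqrt(2 * math.pi * var)

def fitGaussianNB(dataSet):
    """Estimate per-class priors and per-feature (mean, variance).

    dataSet rows look like [x1, x2, ..., label], as in createDataSet above,
    but with numeric features.
    """
    model = {}
    labels = [row[-1] for row in dataSet]
    for c in set(labels):
        rows = [row[:-1] for row in dataSet if row[-1] == c]
        prior = len(rows) / len(dataSet)
        stats = []
        for i in range(len(rows[0])):
            col = [r[i] for r in rows]
            mean = sum(col) / len(col)
            var = sum((v - mean) ** 2 for v in col) / len(col) or 1e-9  # avoid zero variance
            stats.append((mean, var))
        model[c] = (prior, stats)
    return model

def predictGaussianNB(model, x):
    """Pick the class maximizing prior * product of Gaussian densities."""
    best_c, best_p = None, -1.0
    for c, (prior, stats) in model.items():
        p = prior
        for xi, (mean, var) in zip(x, stats):
            p *= gaussian_pdf(xi, mean, var)
        if p > best_p:
            best_c, best_p = c, p
    return best_c
```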

### Implementing Tree-Augmented Naive Bayes

Tree-augmented naive Bayes (TAN) improves on the plain naive Bayes classifier by introducing conditional dependencies between features: in addition to the usual feature-given-class structure, each feature is allowed to depend directly on at most one other feature, and those dependencies form a tree.

Below is a simple Python implementation of a TAN-style algorithm:

```python
import numpy as np
from sklearn.datasets import load_iris
from scipy.stats import chi2_contingency
from itertools import combinations
from collections import defaultdict


class TreeAugmentedNaiveBayes:
    def __init__(self):
        self.tree = None
        self.class_probabilities = {}
        self.feature_given_class_and_parent = {}

    def fit(self, X, y):
        n_features = X.shape[1]

        # Class priors P(C)
        class_counts = np.bincount(y)
        total_count = float(class_counts.sum())
        self.class_probabilities = {c: count / total_count
                                    for c, count in enumerate(class_counts)}

        # Build the maximum weighted spanning tree (MWST) over the features,
        # using the chi-square test as the edge weight
        weights = {}
        for u, v in combinations(range(n_features), 2):
            u_vals = sorted(set(X[:, u]))
            v_vals = sorted(set(X[:, v]))
            table = np.zeros((len(u_vals), len(v_vals)))
            for row in X:
                table[u_vals.index(row[u])][v_vals.index(row[v])] += 1
            _, p_value, _, _ = chi2_contingency(table)
            weights[(u, v)] = -np.log(p_value) if p_value > 0 else float('inf')

        # Kruskal-style selection: take edges in decreasing weight order and
        # accept one only if it joins two different components (union-find)
        comp = list(range(n_features))

        def find(a):
            while comp[a] != a:
                comp[a] = comp[comp[a]]
                a = comp[a]
            return a

        adjacency = defaultdict(list)
        for (u, v), _ in sorted(weights.items(), key=lambda kv: kv[1], reverse=True):
            ru, rv = find(u), find(v)
            if ru != rv:
                comp[ru] = rv
                adjacency[u].append(v)
                adjacency[v].append(u)

        # Orient the tree by a breadth-first walk from feature 0, so that
        # every feature ends up with at most one parent feature
        parent_map = {0: None}
        queue = [0]
        while queue:
            node = queue.pop(0)
            for nbr in adjacency[node]:
                if nbr not in parent_map:
                    parent_map[nbr] = node
                    queue.append(nbr)
        self.tree = [(child, par) for child, par in parent_map.items()
                     if par is not None]

        # Learn the local conditional probability tables (LCPDs)
        for cls in set(y):
            masked_X = X[y == cls]
            local_probs = defaultdict(dict)
            for feat_idx in range(n_features):
                parents = [par for child, par in self.tree if child == feat_idx]
                if not parents:
                    # Root feature: P(X_feat = value | C = cls)
                    values, counts = np.unique(masked_X[:, feat_idx], return_counts=True)
                    for value, count in zip(values, counts):
                        local_probs[(feat_idx,)][(value,)] = count / counts.sum()
                else:
                    # Child feature: P(X_feat = value | parent value, C = cls);
                    # keys are stored as (parent_value, child_value)
                    pairs = np.unique(masked_X[:, [parents[0], feat_idx]], axis=0)
                    for parent_val, child_val in pairs:
                        subset = (masked_X[:, parents[0]] == parent_val) & \
                                 (masked_X[:, feat_idx] == child_val)
                        local_probs[(feat_idx, parents[0])][(parent_val, child_val)] = \
                            subset.sum() / (masked_X[:, parents[0]] == parent_val).sum()
            self.feature_given_class_and_parent[cls] = dict(local_probs)

    def predict_proba(self, X_test):
        predictions = []
        for sample in X_test:
            scores = []
            for cls, prior in self.class_probabilities.items():
                score = np.log(prior)
                for feat_idx, value in enumerate(sample):
                    parents = [par for child, par in self.tree if child == feat_idx]
                    if not parents:
                        table, key = (feat_idx,), (value,)
                    else:
                        # key order matches fit(): (parent_value, child_value)
                        table, key = (feat_idx, parents[0]), (sample[parents[0]], value)
                    try:
                        score += np.log(self.feature_given_class_and_parent[cls][table][key])
                    except KeyError:
                        pass  # combination never seen in training; skip it
                scores.append(score)
            predictions.append(scores)
        return np.array(predictions)

    def predict(self, X_test):
        log_likelihoods = self.predict_proba(X_test)
        return np.argmax(log_likelihoods, axis=-1)


if __name__ == "__main__":
    data = load_iris()
    # This TAN variant expects discrete features, so round the continuous
    # iris measurements to integers as a crude discretization
    X = data.data.round().astype(int)
    y = data.target
    # Train on the even-indexed samples and test on the odd-indexed ones,
    # so that every class appears in the training set
    model = TreeAugmentedNaiveBayes()
    model.fit(X[::2], y[::2])
    test_predictions = model.predict(X[1::2])
    print(f"Predictions on held-out samples:\n{test_predictions}")
```

This code learns the dependency structure via a maximum weighted spanning tree and estimates the local conditional probability distributions by maximum likelihood.
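
For reference, the classical TAN construction (Friedman, Geiger & Goldszmidt, 1997) weights each feature pair not by a chi-square statistic but by the conditional mutual information given the class, so the chi-square weighting above should be read as a simplification:

```latex
I_P(X_i; X_j \mid C) = \sum_{x_i,\, x_j,\, c} P(x_i, x_j, c)\,
    \log \frac{P(x_i, x_j \mid c)}{P(x_i \mid c)\, P(x_j \mid c)}
```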