Machine Learning Code from Scratch (1): Bayes Classifiers
- This post shares the code for Bayes classifiers: naive_bayes.py implements a naive Bayes classifier, normal_bayes.py implements a normal (Gaussian) Bayes classifier, and utils.py provides a tool for visualizing the results. A quick reminder of the decision rule the naive Bayes classifier estimates follows this list.
- For the dataset, see part 0 of this series.
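For context, the naive Bayes classifier picks the class with the largest posterior while treating the features as conditionally independent:

$$\hat{y} = \arg\max_{c}\; p(c)\prod_{i\,\in\,\text{discrete}} p(x_i \mid c)\prod_{j\,\in\,\text{continuous}} \mathcal{N}\!\left(x_j \mid \mu_{c,j},\, \sigma_{c,j}^2\right)$$

The fit method below estimates exactly these ingredients: the class priors $p(c)$ from the label frequencies, Laplace-smoothed value counts for each discrete feature (so an unseen value still carries a pseudo-count of 1 and the product never collapses to zero), and a per-class mean $\mu_{c,j}$ and sample standard deviation $\sigma_{c,j}$ for each continuous feature.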
naive_bayes.py
import numpy as np
from datasets.dataset import DataSet
from sklearn.model_selection import train_test_split
from utils import Visualization
class NaiveBayesClassifier:
    def __init__(self, n_class):
        self.pc = None                 # class prior probabilities
        self.dis_i = []                # indices of the discrete features
        self.con_i = []                # indices of the continuous features
        self.n_feats = 0
        self.dis_feat_count = None     # per-class statistics of the discrete features
        self.con_feat_count = None     # per-class statistics of the continuous features
        self.n_class = n_class

    def fit(self, data_all, targets, dis_i=[]):  # dis_i: indices of the discrete features
        cla_count = np.bincount(targets)
        self.pc = cla_count / np.sum(cla_count)  # probability of each class in the training set
        self.dis_i = dis_i
        self.con_i = list(range(data_all.shape[1]))
        for i in dis_i:
            self.con_i.remove(i)  # drop the discrete indices, keeping the continuous ones
        self.n_feats = data_all.shape[1]
        all_dis_feat_count = []  # per-class statistics of the discrete features
        all_con_feat_count = []  # per-class statistics of the continuous features
        max_nv = 0
        for i in dis_i:
            max_nv = max(max_nv, np.max(np.unique(data_all[:, i])))
        max_nv = int(max_nv + 1)  # largest number of distinct values a discrete feature can take
        for cla in np.unique(targets):
            data = data_all[targets == cla]  # data belonging to one class
            dis_feat_count = []
            con_feat_count = []
            for i in range(data.shape[1]):
                data_i = data[:, i]
                if i in dis_i:  # discrete feature: count its values
                    data_i = data_i.astype(np.int32)
                    count = np.bincount(data_i) + 1  # Laplace smoothing
                    dis_feat_count.append(count)     # value counts of every discrete feature for this class
                else:  # continuous feature: estimate mean and standard deviation
                    mu = np.mean(data_i)
                    sigma = np.std(data_i, ddof=1)
                    con_feat_count.append([mu, sigma])
            for i in range(len(dis_feat_count)):
                count_i = dis_feat_count[i]
                if count_i.shape[0] < max_nv:  # pad to a common length so the counts can be stacked into an ndarray
                    dif = max_nv - count_i.shape[0]
                    dis_feat_count[i] = np.concatenate([count_i, np.zeros(dif)])
            all_dis_feat_count.append(np.array(dis_feat_count))
            all_con_feat_count.append(np.array(con_feat_count))
        # (completion inferred from __init__) stack the per-class statistics for use at prediction time
        self.dis_feat_count = np.array(all_dis_feat_count)
        self.con_feat_count = np.array(all_con_feat_count)
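As a minimal sketch of how the statistics collected by fit can be turned into a prediction (illustrative only: the helper name nb_predict, the fall-back for unseen values, and the toy data further down are hypothetical choices, and it assumes dis_i was passed in ascending column order along with the stacked arrays stored at the end of fit):

import numpy as np

def nb_predict(clf, x):
    """Minimal sketch: classify one sample x with a fitted
    NaiveBayesClassifier by accumulating log-probabilities."""
    log_post = np.log(clf.pc)                      # start from the log prior of each class
    for c in range(clf.n_class):
        # discrete features: Laplace-smoothed value counts stored by fit
        for k, i in enumerate(clf.dis_i):
            counts = clf.dis_feat_count[c, k]
            v = int(x[i])
            # padded slots hold 0; fall back to the Laplace pseudo-count of 1
            num = counts[v] if v < len(counts) and counts[v] > 0 else 1.0
            log_post[c] += np.log(num / counts.sum())
        # continuous features: Gaussian log-density with the per-class (mu, sigma)
        for k, i in enumerate(clf.con_i):
            mu, sigma = clf.con_feat_count[c, k]
            log_post[c] += -0.5 * np.log(2.0 * np.pi * sigma ** 2) \
                           - (x[i] - mu) ** 2 / (2.0 * sigma ** 2)
    return int(np.argmax(log_post))

A quick check on a made-up toy set with one discrete and one continuous feature (hypothetical data, not the dataset from part 0):

X = np.array([[0, 1.2],
              [1, 0.7],
              [0, 3.1],
              [1, 2.9]])
y = np.array([0, 0, 1, 1])

clf = NaiveBayesClassifier(n_class=2)
clf.fit(X, y, dis_i=[0])        # column 0 is discrete, column 1 is continuous
print(nb_predict(clf, X[2]))    # -> 1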
