版权声明:本文为博主原创文章,转载请注明作者和出处。https://blog.youkuaiyun.com/xq920831/article/details/83892585
这篇主要为记录帖,整理一下这几天的工作。
进入正题!
一、制作数据集
网上大多数为cifar-10数据集上训练,那么如果我们有自己的数据,怎么将自己的数据做成数据集将是开始的关键一步。
这里给出制作pkl数据的方法
注:pkl单次只能处理4GB以下的数据量,如果数据量很大,则手动分块处理,dump多次。我自己对比了一下pickle和joblib,感觉没什么区别(可能是我的数据集太大的缘故),二者都不能一次处理完。
我自己的数据集:一个测试集,九个训练集(手动分)。
由于我的彩色原图太多,且分辨率比较高,为了考虑整体的训练效果,这里把图片统一reshape为100*100的。
代码如下:
# -*- coding:utf-8 -*-
# Author: Agent Xu
from skimage import io,transform
import numpy as np
import random
import pickle
import os
import joblib
w = 100
h = 100
# Helper: build and pickle the dataset
def initPKL(img_arr, train_or_test):
    """Flatten (image, label) pairs and pickle them to disk.

    Parameters
    ----------
    img_arr : sequence of mutable [image, label] pairs
        Each pair holds an image array whose flattened length is 30000
        (i.e. 100*100*3 as produced by initArr) and its one-hot label.
    train_or_test : str
        'train' writes trainSet_9.pkl; any other value writes testSet.pkl.

    Side effects
    ------------
    Writes a pickle file containing (features, labels) in the current
    directory and mutates each input pair in place (the image entry is
    replaced by its flattened form, as the original code did).
    """
    if train_or_test == 'train':
        set_name = 'trainSet_9.pkl'
    else:
        set_name = 'testSet.pkl'
    labels = []
    # 30000 = w * h * 3 channels; fixed to match initArr's resize.
    img_arr1 = np.zeros((len(img_arr), 30000))
    for k, pair in enumerate(img_arr):
        pair[0] = pair[0].flatten()
        img_arr1[k][:] = pair[0]
        labels.append(pair[1])
    labels = np.array(labels)
    data = (img_arr1, labels)
    # 'with' guarantees the handle is closed even if pickle.dump raises
    # (the original left the file open on error).
    with open(set_name, 'wb') as output:
        pickle.dump(data, output)
def initArr(folders_path, num_classes=5):
    """Read every image under folders_path into (image, one-hot label) pairs.

    Each immediate sub-folder of folders_path is treated as one class; the
    i-th folder (in os.listdir order) maps to index i of the one-hot label.

    Parameters
    ----------
    folders_path : str
        Directory whose sub-folders each hold one class's images.
    num_classes : int, optional
        Length of the one-hot label vector. Defaults to 5, matching the
        original hard-coded five classes, so existing callers are unchanged.

    Returns
    -------
    list of (ndarray, list)
        Images resized to (w, h) and scaled by 1/256; labels are shared
        one-hot lists (one list object per folder, as in the original).
    """
    imgSet = []
    for i, folder in enumerate(os.listdir(folders_path)):
        # One-hot label: position i marks this folder's class.
        label = [0] * num_classes
        label[i] = 1
        folder_path = os.path.join(folders_path, folder)
        for file in os.listdir(folder_path):
            # Read, resize, and normalize each image.
            img = io.imread(os.path.join(folder_path, file))
            img = transform.resize(img, (w, h))
            img_arr = np.array(img, dtype='float64') / 256
            imgSet.append((img_arr, label))
    return imgSet
# Convert the images on disk into (image, one-hot label) pairs.
#train_folders_path = 'D:/GJAI_data/tupian/train_9/'
test_folders_path = 'D:/GJAI_data/tupian/validation/'
#train_imgSet = initArr(train_folders_path)
test_imgSet = initArr(test_folders_path)
# Shuffle so examples of one class are not grouped together.
#random.shuffle(train_imgSet)
random.shuffle(test_imgSet)
#train_set_shuffle = np.array(train_imgSet)
test_set_shuffle = np.array(test_imgSet)
# Generate the pickled dataset (uncomment the 'train' lines above/below to
# build the training set instead of the test set).
#initPKL(train_set_shuffle, 'train')
initPKL(test_set_shuffle, 'test')
# # Sanity check for the generated dataset:
# f = open('./trainSet_1.pkl', 'rb')
# x, y = pickle.load(f)
# f.close()
#
# print(np.shape(x[3]), y[3])
训练集和测试集可分开,注释掉相应代码即可(上面的代码为生成测试集的pkl文件)。
如有疑问,欢迎留言。
二、KNN训练分类
# -*- coding:utf-8 -*-
# Author: Agent Xu
import numpy as np
import os
import pickle
class kNearestNeighbor:
    """k-nearest-neighbor classifier using L1 (Manhattan) distance.

    train() simply memorizes the data; predict() does a brute-force
    distance scan of every stored example for each query row.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Memorize the training data.

        X: (num_train, D) feature matrix; y: (num_train,) label vector.
        """
        self.Xtr = X
        self.ytr = y

    def predict(self, X, k=1):
        """Predict a label for each row of X by majority vote among its
        k nearest training examples under L1 distance.

        Returns an array of num_test predicted labels (dtype of self.ytr).
        """
        num_test = X.shape[0]
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)
        for i in range(num_test):
            # L1 distance from test point i to every training point.
            distances = np.sum(np.abs(self.Xtr - X[i, :]), axis=1)
            # BUG FIX: the original indexed the module-level global
            # `y_train` here instead of the labels stored by train(),
            # so predict() ignored whatever was passed to train().
            closest_y = self.ytr[np.argsort(distances)[:k]]
            # Majority vote; ties resolve toward the label np.unique
            # orders first, matching the original's bincount/argmax.
            u, indices = np.unique(closest_y, return_inverse=True)
            Ypred[i] = u[np.argmax(np.bincount(indices))]
        return Ypred
def load_data_batch(filename):
    """Load one pickled batch and return (images, integer labels).

    The pickle holds a (features, one-hot labels) pair as written by the
    dataset script. Each one-hot row is collapsed to the index of its 1,
    and the flat feature rows are reshaped to (N, 100, 100, 3) floats.
    NOTE(review): the (3, 100, 100) reshape + transpose assumes
    channel-first storage — verify against the dataset writer.
    """
    with open(filename, 'rb') as fh:
        features, onehot = pickle.load(fh, encoding='latin1')
    labels = [row.index(1) for row in onehot.tolist()]
    n = len(labels)
    images = features.reshape(n, 3, 100, 100).transpose(0, 2, 3, 1).astype("float")
    return images, np.array(labels)
def load_data(ROOT):
    """Load all nine training batches plus the test set from ROOT.

    Reads trainSet_1.pkl .. trainSet_9.pkl and testSet.pkl via
    load_data_batch and returns (Xtr, Ytr, Xte, Yte), with the training
    batches concatenated along the first axis into single arrays.
    """
    train_images = []
    train_labels = []
    for batch in range(1, 10):
        batch_file = os.path.join(ROOT, 'trainSet_%d.pkl' % (batch))
        imgs, labs = load_data_batch(batch_file)
        train_images.append(imgs)
        train_labels.append(labs)
    # Stack the per-batch arrays into one training set.
    Xtr = np.concatenate(train_images)
    Ytr = np.concatenate(train_labels)
    Xte, Yte = load_data_batch(os.path.join(ROOT, 'testSet.pkl'))
    return Xtr, Ytr, Xte, Yte
Xtr, Ytr, Xte, Yte = load_data('D:/GJAI_data/tupian')
# Flatten each 100x100x3 image into a single 30000-dim row vector.
Xtr_rows = Xtr.reshape(Xtr.shape[0], 100 * 100 * 3)
Xte_rows = Xte.reshape(Xte.shape[0], 100 * 100 * 3)
# Subset sizes to use — presumably must not exceed the actual
# dataset sizes on disk; TODO confirm against the data.
num_training = 16000
num_test = 1900
x_train = Xtr_rows[:num_training, :]
y_train = Ytr[:num_training]
x_test = Xte_rows[:num_test, :]
y_test = Yte[:num_test]
# "Train" (memorize) the data and classify with a 7-nearest-neighbor vote.
knn = kNearestNeighbor()
knn.train(x_train, y_train)
y_predict = knn.predict(x_test, k=7)
# Accuracy = fraction of test labels predicted exactly.
acc = np.mean(y_predict == y_test)
print('accuracy : %f' %(acc))
代码参考:https://blog.youkuaiyun.com/zhousishuo/article/details/78877466
根据自己的数据情况稍作改动。
原文还有K交叉验证的内容,自行参考。