图像分类实操--分析数据集

本文介绍了如何使用Python进行数据集的划分,包括训练集和测试集的生成,并统计了各分类的数量,最后通过图像展示各类别样本的具体内容。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

分析数据集的好处:能够清楚地认识到自己所要处理的内容是什么,目前存在的问题有哪些,以及采用何种办法来解决。

1)将数据集分成训练集和测试集

刚开始打算用PIL里面的Image.save来保存图像,但没有成功,所以采用了OpenCV来读取图像和保存图像。

Code:

importos
import random
import cv2

def split_dataset(datapath, train_ratio=0.5, output_root='../AID50'):
    """Randomly split every class folder of a dataset into train and test sets.

    ``datapath`` must contain one sub-directory per class, each holding the
    image files of that class.  For every class a random ``train_ratio``
    share of its images is handed to ``generate_file`` with the output
    directory ``<output_root>/train``; the remaining images go to
    ``<output_root>/test``.

    Args:
        datapath: Root directory of the dataset (one sub-directory per class).
        train_ratio: Fraction of each class used for training.  The default
            of 0.5 reproduces the original half/half split.
        output_root: Directory that receives the ``train`` and ``test``
            folders.

    Raises:
        ValueError: If ``train_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be within [0, 1]")

    train_dir = output_root + '/train'
    test_dir = output_root + '/test'

    # Sorted listings make the split reproducible for a fixed random seed;
    # os.listdir() returns entries in arbitrary order.
    for class_name in sorted(os.listdir(datapath)):
        class_path = os.path.join(datapath, class_name)
        if not os.path.isdir(class_path):
            # A stray file next to the class folders is not a class.
            continue

        image_names = sorted(os.listdir(class_path))
        train_number = int(len(image_names) * train_ratio)
        train_names = random.sample(image_names, train_number)

        # Everything not drawn for training becomes test data; filtering the
        # sorted listing keeps a stable order (a set difference does not).
        train_lookup = set(train_names)
        test_names = [name for name in image_names if name not in train_lookup]

        generate_file(
            InputImagePathList=generate_image_path(class_path, train_names),
            OutputImagePath=train_dir,
        )
        generate_file(
            InputImagePathList=generate_image_path(class_path, test_names),
            OutputImagePath=test_dir,
        )


def generate_image_path(RootPath, ImageNameSequence):
    """Join a directory with each file name and return the resulting paths.

    Args:
        RootPath: Directory that contains the images.
        ImageNameSequence: Iterable of image file names inside ``RootPath``.

    Returns:
        A list with ``os.path.join(RootPath, name)`` for every name, in the
        same order as ``ImageNameSequence``.
    """
    return [os.path.join(RootPath, imagename) for imagename in ImageNameSequence]

def generate_file(InputImagePathList, OutputImagePath):
    """Re-save images as numbered JPEG files grouped by class folder.

    Each readable image is written to
    ``<OutputImagePath>/<class name>/<class name>_<index>.jpg``.  The class
    name is the name of the directory that directly contains the input
    image; ``index`` starts at 1 and counts the images saved by this call.

    Args:
        InputImagePathList: Iterable of paths of the images to copy.
        OutputImagePath: Root directory that receives the class folders.
    """
    saved_count = 0  # running index used in the output file names

    for imagepath in InputImagePathList:
        image = cv2.imread(imagepath)
        if image is None:
            # cv2.imread signals an unreadable or non-image file by returning
            # None; passing that on to cv2.imwrite would fail.
            continue
        saved_count += 1

        # Take the class name from the parent directory instead of splitting
        # on '\\', which only works with Windows separators and a root path
        # that is exactly one component deep.
        class_name = os.path.basename(os.path.dirname(imagepath))

        class_dir = os.path.join(OutputImagePath, class_name)
        if not os.path.isdir(class_dir):
            os.makedirs(class_dir)

        target = os.path.join(class_dir, class_name + '_' + str(saved_count) + '.jpg')
        cv2.imwrite(target, image)

def main():
    """Split the dataset stored under ``../AID`` into train and test sets."""
    AID_Path = "../AID"
    split_dataset(datapath=AID_Path)

# Run the dataset split only when this file is executed as a script.
if __name__ == "__main__":
    main()

2)统计训练数据集中每个类别的数量

importos
import matplotlib.pyplot as plt
import copy
import numpy as np

def plot_classes_number(InputPath):
    """Show a bar chart with the number of files in each class folder.

    Args:
        InputPath: Directory containing one sub-directory per class.  The
            entries of each sub-directory are counted as that class's
            samples.
    """
    # Sort for a stable bar order (os.listdir() order is arbitrary) and skip
    # plain files, which cannot be listed as class folders.
    class_names = sorted(
        name for name in os.listdir(InputPath)
        if os.path.isdir(os.path.join(InputPath, name))
    )
    class_counts = [
        len(os.listdir(os.path.join(InputPath, name))) for name in class_names
    ]

    plt.figure(figsize=(11, 4))
    width = 0.7
    x = np.arange(len(class_names))
    y = np.array(class_counts)
    plt.bar(x, y, width, align='center')
    plt.ylabel("Number")

    # Class names are long, so print them vertically under the bars.
    plt.xticks(x, class_names, size='small', rotation=270)

    # Write the exact count just above every bar.
    for a, b in zip(x, y):
        plt.text(a, b + 1, '%.0f' % b, ha='center', va='bottom', fontsize=7)
    plt.show()

# Plot the per-class sample counts of the generated training split.
AID_Train_Path = "../AID50/train"
plot_classes_number(AID_Train_Path)

可以看出每个类别的数目是不一样的,所以存在样本不平衡的问题。

3)人眼观察一下每个类别图像的具体内容

fromPIL import Image
import matplotlib.pyplot as plt
import os
import random

def acquire_sample_picture(PictureSequencePath, Number):
    """Pick random pictures from one class folder and return their paths.

    Args:
        PictureSequencePath: Directory holding all pictures of one class.
        Number: How many pictures to draw (without replacement).

    Returns:
        A list of ``Number`` paths, each being ``PictureSequencePath`` joined
        with one sampled file name.

    Raises:
        ValueError: Raised by ``random.sample`` when the folder holds fewer
            than ``Number`` entries.
    """
    picture_names = os.listdir(PictureSequencePath)
    sampled_names = random.sample(picture_names, Number)
    return [os.path.join(PictureSequencePath, name) for name in sampled_names]

def PlotFigure(AID_Path, ColNumber, EachClassSampleNumber):
    """Show a grid with random sample pictures of every class.

    Pictures are expected at ``AID_Path/<class name>/<picture name>``.  The
    grid is filled row by row, class after class, so the samples of one
    class sit next to each other.

    Args:
        AID_Path: Directory containing one sub-directory per class.
        ColNumber: Number of pictures displayed per grid row.
        EachClassSampleNumber: Number of pictures sampled from each class.
    """
    # Sort for a stable layout (os.listdir() order is arbitrary) and skip
    # plain files, which cannot be sampled as class folders.
    class_names = sorted(
        name for name in os.listdir(AID_Path)
        if os.path.isdir(os.path.join(AID_Path, name))
    )

    # plt.subplot needs an integer row count, rounded up so that a partly
    # filled last row still fits; -(-a // b) is integer ceiling division.
    total_pictures = EachClassSampleNumber * len(class_names)
    row_number = -(-total_pictures // ColNumber)

    picture_index = 0  # 1-based subplot position
    for class_name in class_names:
        class_path = os.path.join(AID_Path, class_name)
        sample_picture_list = acquire_sample_picture(
            PictureSequencePath=class_path, Number=EachClassSampleNumber)

        for picturepath in sample_picture_list:
            picture_index += 1
            plt.subplot(row_number, ColNumber, picture_index)
            # Close the file once it is drawn so file handles do not
            # accumulate across the whole grid.
            with Image.open(picturepath) as image:
                plt.imshow(image)
            plt.axis("off")

    plt.subplots_adjust(wspace=0, hspace=0)  # no gap between nearby images
    plt.show()

def main():
    """Display two random samples per class from ``../AID50/train``, ten per row."""
    AID_Path = "../AID50/train"
    PlotFigure(AID_Path, ColNumber=10, EachClassSampleNumber=2)

# Show the sample grid only when this file is executed as a script.
if __name__ == "__main__":
    main()

按行看,每类图像展示了两张,可以大致看出相同类别存在差异性,比如第一行的第五张和第六张图像属于相同的类别,但肉眼很难判断出是同一类;还有不同类别间的相似性,比如第二行的第六张和第七张,分属不同类别但具有很强的相似性。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值