原理
数据集
http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
在使用时我直接放入到了txt中并且加上了标题栏,要不要都无所谓,但是程序需要稍作修改不然就会少一条数据。
ID Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_mean symmetry_mean fractal_mean radius_sd texture_sd perimeter_sd area_sd smoothness_sd compactness_sd concavity_sd concave_sd symmetry_sd fractal_sd radius_max texture_max perimeter_max area_max smoothness_max compactness_max concavity_max concave_max symmetry_max fractal_max
程序
写的不好,欢迎指正
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author:albert time:2018/11/28
import random
def read_dataset(address, val_list): # 将txt中的数据读到list中
f = open(address)
colum = f.readline()
while len(colum) != 0:
val_list.append(colum.split('\t'))
colum = f.readline()
return val_list
def split_dataset(dataset, percent): # 输入想分割的数据集和分割比例,返回percent比例的训练集和剩下的训练集
dataset.remove(dataset[0]) # 去表头
n = int(len(dataset) * percent)
test_data =[]
while n >0:
number = random.randint(0, len(dataset)-1)
test_data.append(dataset[number])
dataset.remove(dataset[number])
n = n-1
return<