chap1
从csv中加载数据集
加载csv文件
# Load a CSV file
def load_csv(filename):
    """Read every record of a CSV file into a list.

    This is the naive variant: each record produced by ``csv.reader`` is
    kept as-is, so a blank line in the file shows up as an empty list.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, where each row is a list of strings.
    """
    # The context manager closes the file even if parsing raises;
    # newline='' lets the csv module handle line endings itself.
    with open(filename, "r", newline="") as file:
        return list(reader(file))
其中,len(dataset) 表示行数,len(dataset[0]) 表示列数。
上述方法有局限性:文件中的空行会被当作空记录(空列表)读入数据集。下面改为逐行读取并跳过空行来解决:
from csv import reader
# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping blank records.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, where each row is a list of strings. Records that
        the CSV reader yields as empty (e.g. blank lines) are left out.
    """
    # newline='' is the csv module's documented way to open files so that
    # line endings inside quoted fields are interpreted correctly.
    with open(filename, 'r', newline='') as file:
        # An empty record is an empty list, which is falsy.
        return [row for row in reader(file) if row]
# Read the diabetes CSV and report its dimensions (rows x columns)
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print(
    'Loaded data file {0} with {1} rows and {2} columns'.format(
        filename,
        len(dataset),
        len(dataset[0]),
    )
)
将字符型数值转化成浮点型
def str_column_to_float(dataset, column):
    """Convert one column of string values to floats, in place.

    Args:
        dataset: List of rows (lists); each row is modified in place.
        column: Index of the column to convert.

    Raises:
        ValueError: If a value in the column cannot be parsed as a float.
    """
    for row in dataset:
        # Strip surrounding whitespace before parsing the number.
        row[column] = float(row[column].strip())
eg:
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
转化成
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
将字符转换成整型
# Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace each distinct value in a column with an integer code, in place.

    Args:
        dataset: List of rows (lists); each row is modified in place.
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to the integer that replaced it.
        Codes are assigned in set iteration order, so which integer a given
        value receives is not guaranteed to be the same on every run.
    """
    # Collect the distinct class values found in the column.
    unique = {row[column] for row in dataset}
    # Build the value -> integer lookup table.
    lookup = {value: i for i, value in enumerate(unique)}
    # Overwrite every entry in the column with its integer code.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
总结
# Example of integer encoding string class values
from csv import reader
# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping blank records.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, where each row is a list of strings. Records that
        the CSV reader yields as empty (e.g. blank lines) are left out.
    """
    dataset = []
    # newline='' is the csv module's documented way to open files so that
    # line endings inside quoted fields are interpreted correctly.
    with open(filename, 'r', newline='') as file:
        for row in reader(file):
            # Keep only non-empty records.
            if row:
                dataset.append(row)
    return dataset
# Convert string column to float
def str_column_to_float(dataset, column):
    """Parse the string values of one column as floats, updating rows in place.

    Args:
        dataset: List of rows (lists); each row is modified in place.
        column: Index of the column to convert.

    Raises:
        ValueError: If a value in the column cannot be parsed as a float.
    """
    for row in dataset:
        # Surrounding whitespace is removed before the value is parsed.
        row[column] = float(row[column].strip())
# Convert string column to integer
def str_column_to_int(dataset, column):
    """Integer-encode the values of one column, updating rows in place.

    Args:
        dataset: List of rows (lists); each row is modified in place.
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to the integer that replaced it.
        Codes follow set iteration order, so the specific integer assigned
        to a value is not guaranteed to be the same on every run.
    """
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = {}
    for i, value in enumerate(unique):
        lookup[value] = i
    # Swap each original value for its integer code.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
# Load iris dataset
filename = 'iris.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
print(dataset[0])
# Convert the first four columns (indices 0-3) from strings to floats
for i in range(4):
    str_column_to_float(dataset, i)
# Integer-encode the class labels held in column 4
lookup = str_column_to_int(dataset, 4)
print(dataset[0])
print(lookup)
结果如下(类别到整数的映射取决于集合的遍历顺序,每次运行得到的编号可能不同):
Loaded data file iris.csv with 150 rows and 5 columns
['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
[5.1, 3.5, 1.4, 0.2, 1]
{'Iris-virginica': 0, 'Iris-setosa': 1, 'Iris-versicolor': 2}