数据文件如下:
代码如下:
import os
# Append a hard-coded local Graphviz install directory to PATH for this
# process. Presumably this lets the Graphviz executables be found when the
# decision tree is rendered later in the script -- TODO confirm and adjust
# the path for the local machine.
os.environ["PATH"] += os.pathsep + 'F:/graphviz-2.38/release/bin'
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import tree
# Load the data file. The context manager guarantees the handle is closed
# even if parsing fails, and newline='' is what the csv module requires so
# that newlines embedded in quoted fields are handled correctly.
with open('film.csv', 'rt', newline='') as film_data:
    reader = csv.reader(film_data)
    headers = next(reader)
    # Print the header row.
    print(headers)
    # Feature set (one dict per data row) and result set (one label per row).
    feature_list = []
    result_list = []
    # For every data row, pair headers[1:-1] with row[1:-1] to build the
    # feature dict (first and last columns are excluded), and take row[-1]
    # as the label.
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[-1] would raise
            # IndexError on them, so skip.
            continue
        feature_list.append(dict(zip(headers[1:-1], row[1:-1])))
        result_list.append(row[-1])
print(feature_list)
print(result_list)
# Turn the list of feature dicts into a numeric NumPy array.
vec = DictVectorizer()
# One-hot encode the categorical feature values; the resulting columns are
# ordered alphabetically by feature name:
# country gross type
encoded_features = vec.fit_transform(feature_list)
dummyX = encoded_features.toarray()
# Binarize the class labels held in result_list.
label_binarizer = preprocessing.LabelBinarizer()
dummyY = label_binarizer.fit_transform(result_list)
print(dummyX)
print(dummyY)
clf = tree.DecisionTreeCla