# Machine Learning hands-on practice 1 (no theory)
import jieba
import pandas as pd
from scipy.stats import pearsonr
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def datasets_demo():
    # Load the iris dataset and split it into train/test sets
    iris = load_iris()
    print("Dataset:\n", iris)
    print("Dataset description:\n", iris.DESCR)
    print("Feature names:\n", iris.feature_names)
    print("Feature values:\n", iris.data, iris.data.shape)
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print(x_train, x_train.shape)
    return None
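# A minimal sketch (hypothetical helper, not in the original script): passing
# stratify=iris.target to train_test_split keeps the class proportions equal in
# the train and test splits, which matters on small datasets like iris.
def datasets_stratified_demo():
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22, stratify=iris.target)
    print(pd.Series(y_test).value_counts())  # roughly equal count per class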
def dict_demo():
    # Vectorize dicts: string values are one-hot encoded, numeric values pass through
    data = [{'1': 'h', 't': 100}, {'1': 's', 't': 120}, {'1': 's', 't': 130}, {'1': 'h', 't': 110}]
    transfer = DictVectorizer(sparse=False)
    data_new = transfer.fit_transform(data)
    print(data_new)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    print(transfer.get_feature_names_out())
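# A minimal sketch (hypothetical helper): with the default sparse=True,
# DictVectorizer returns a scipy CSR matrix instead of a dense array, which is
# much more memory-friendly when one-hot encoding produces many columns.
def dict_sparse_demo():
    data = [{'1': 'h', 't': 100}, {'1': 's', 't': 120}]
    transfer = DictVectorizer()    # sparse=True is the default
    data_new = transfer.fit_transform(data)
    print(data_new)                # (row, col) -> value triples of a CSR matrix
    print(data_new.toarray())      # densify only for inspection
    print(transfer.vocabulary_)    # feature name -> column index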
def count_demo():
    # Bag-of-words counts on short English strings
    data = ['like day one two', 'yes one day good,one']
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    print(data_new)            # sparse matrix of (row, col) -> count
    print(data_new.toarray())  # dense count matrix
    print(transfer.get_feature_names_out())
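# A minimal sketch (hypothetical helper): labeling the count matrix with its
# vocabulary via pandas makes the output far easier to read than a bare array.
def count_df_demo():
    data = ['like day one two', 'yes one day good,one']
    transfer = CountVectorizer()
    counts = transfer.fit_transform(data)
    df = pd.DataFrame(counts.toarray(), columns=transfer.get_feature_names_out())
    print(df)  # one row per document, one column per vocabulary word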
def count_chinese_demo():
    # Bag-of-words counts on Chinese text; jieba segments each sentence into
    # space-separated words first, since CountVectorizer splits on whitespace
    data = ['该生学习认真努力,注重专业知识学习,能将学习的知识与实践相结合,积极参加社会实践;',
            '积极参与学校和院系组织的活动,为班级建设做出了积极的贡献;',
            '团结同学,尊敬师长,遵守校规校纪,乐于助人。']
    data_new = []
    # quick sanity check of jieba segmentation on an arbitrary string
    print(list(jieba.cut('找打的骄傲空当接龙')))
    for i in data:
        data_new.append(' '.join(jieba.cut(i)))
    print(data_new)
    transfer = CountVectorizer(stop_words=['该'])
    new_data = transfer.fit_transform(data_new)
    print(new_data)
    print(new_data.toarray())
    print(transfer.get_feature_names_out())
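# A minimal sketch (hypothetical helper): CountVectorizer's default token_pattern
# r"(?u)\b\w\w+\b" silently drops single-character tokens, which loses many
# common Chinese words; widening the pattern to r"(?u)\b\w+\b" keeps them.
def count_chinese_single_char_demo():
    doc = ' '.join(jieba.cut('能将学习的知识与实践相结合'))
    default_vec = CountVectorizer().fit([doc])
    keep_single = CountVectorizer(token_pattern=r'(?u)\b\w+\b').fit([doc])
    # tokens the default pattern threw away
    print(sorted(set(keep_single.get_feature_names_out()) - set(default_vec.get_feature_names_out())))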
def minmax():
    # Scale the first three columns of base.csv into the range [1, 2]
    data = pd.read_csv("base.csv", encoding='gbk')
    data = data.iloc[:, :3]
    transfer = MinMaxScaler(feature_range=(1, 2))
    data_new = transfer.fit_transform(data)
    data.iloc[:, :3] = data_new
    print(data)
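# A minimal sketch (hypothetical helper): min-max scaling is
# X' = (X - min) / (max - min) * (mx - mi) + mi. Recomputing it by hand with
# pandas is a quick check on MinMaxScaler; assumes the same base.csv as above.
def minmax_by_hand():
    data = pd.read_csv("base.csv", encoding='gbk').iloc[:, :3]
    mi, mx = 1, 2
    scaled = (data - data.min()) / (data.max() - data.min()) * (mx - mi) + mi
    print(scaled)  # should match MinMaxScaler(feature_range=(1, 2))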
def stand_demo():
    # Standardize the first three columns of base.csv to zero mean and unit variance
    data = pd.read_csv("base.csv", encoding='gbk')
    data = data.iloc[:, :3]
    transfer = StandardScaler()
    data_new = transfer.fit_transform(data)
    data.iloc[:, :3] = data_new
    print(data)
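# A minimal sketch (hypothetical helper): standardization is X' = (X - mean) / std.
# StandardScaler uses the population std (ddof=0) while pandas' .std() defaults
# to ddof=1, so the ddof must be matched explicitly for the results to agree.
def stand_by_hand():
    data = pd.read_csv("base.csv", encoding='gbk').iloc[:, :3]
    scaled = (data - data.mean()) / data.std(ddof=0)
    print(scaled)  # should match StandardScaler's output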
def variance_demo():
    # Drop low-variance features, then check how strongly two columns correlate
    data = pd.read_csv("base.csv", encoding='gbk')
    data_new = data.iloc[:, 0:3]
    transfer = VarianceThreshold(threshold=5)
    data_new = transfer.fit_transform(data_new)
    print(data_new, data_new.shape)
    data = data.dropna()
    # '注册时间' = registration time, '注册资本' = registered capital
    r1 = pearsonr(data['注册时间'], data['注册资本'])
    print(r1)  # (Pearson correlation coefficient, p-value)
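# A minimal sketch (hypothetical helper): VarianceThreshold returns an unnamed
# array, so it is easy to lose track of which features survived; get_support()
# maps the result back to the original column names.
def variance_support_demo():
    data = pd.read_csv("base.csv", encoding='gbk').iloc[:, 0:3]
    transfer = VarianceThreshold(threshold=5)
    transfer.fit(data)
    print(transfer.variances_)                   # per-column variances
    print(data.columns[transfer.get_support()])  # names of the columns kept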
def pca_demo():
    # n_components as a float keeps enough components to explain that fraction of
    # the variance; as an int it keeps exactly that many components
    data = [[1, 2], [55, 3], [1, 2]]
    transfer = PCA(n_components=0.95)
    transfer2 = PCA(n_components=2)
    data_new = transfer.fit_transform(data)
    print(data_new)
    data_new = transfer2.fit_transform(data)
    print(data_new)
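# A minimal sketch (hypothetical helper): explained_variance_ratio_ on a fitted
# PCA shows the share of variance each component captures; the float form of
# n_components keeps components until their cumulative share passes the threshold.
def pca_variance_demo():
    data = [[1, 2], [55, 3], [1, 2]]
    pca = PCA(n_components=2).fit(data)
    print(pca.explained_variance_ratio_)        # per-component share
    print(pca.explained_variance_ratio_.sum())  # cumulative share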
def knn_iris_demo():
    # Classify iris with KNN; fit the scaler on the training set only
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)  # reuse training mean/std to avoid leakage
    estimator = KNeighborsClassifier(n_neighbors=5)
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print('y_predict:\n', y_predict)
    print("Predictions vs. ground truth:\n", y_predict == y_test)
    score = estimator.score(x_test, y_test)
    print("Accuracy:", score)
def knn_iris_wg_jc_demo():
    # KNN on iris with a 10-fold cross-validated grid search over n_neighbors
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=12)
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    estimator = KNeighborsClassifier()
    param_dict = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
    estimator = GridSearchCV(estimator, param_grid=param_dict, cv=10)
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print('y_predict:\n', y_predict)
    print("Predictions vs. ground truth:\n", y_predict == y_test)
    score = estimator.score(x_test, y_test)
    print("Accuracy:", score)
    print("Best parameters:", estimator.best_params_)
    print("Best CV score:", estimator.best_score_)
    print("Best estimator:", estimator.best_estimator_)
if __name__ == '__main__':
    # year = pd.read_csv('year_report.csv', encoding='gbk')
    # print(year)
    # crosstab tabulates how the values of two columns co-occur
    # year = pd.crosstab(year['ID'], year['year'])
    # print(year)
    knn_iris_wg_jc_demo()