import requests
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection instead
from sklearn.model_selection import train_test_split

# Fetch data from the GitHub API with Requests
# r = requests.get(r"https://api.github.com/users/acombs/starred")
# print(r.json())

# Pandas
# PATH = r"/"
# r = requests.get(r"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
# with open(PATH + 'iris.data', 'w') as f:
#     f.write(r.text)
# os.chdir(PATH)
# df = pd.read_csv(PATH + 'iris.data', names=['sepal length', 'sepal width',
#                                             'petal length', 'petal width', 'class'])
# print(df.head())
# print(df['sepal length'])
# .ix was removed in pandas 1.0; use iloc/loc instead
# print(df.iloc[:3, :2])
# print(df.loc[:2, [x for x in df.columns if 'width' in x]])
# print(df['class'].unique())
# print(df[df['class'] == 'Iris-virginica'])
# print(df.count())
# print(df[df['class'] == 'Iris-virginica'].count())
# virginica = df[df['class'] == 'Iris-virginica'].reset_index(drop=True)
# print(virginica)
# print(df[(df['class'] == 'Iris-virginica') & (df['petal width'] > 2.2)])
# print(df.describe())
# print(df.describe(percentiles=[.20, .40, .80, .90, .95]))
# print(df.corr())

# Matplotlib
# Histograms
# fig, ax = plt.subplots(figsize=(6, 4))
# ax.hist(df['petal width'], color='black')
# ax.set_ylabel('Count', fontsize=12)
# ax.set_xlabel('Width', fontsize=12)
# plt.title('Iris Petal Width', fontsize=14, y=1.01)
# plt.show()
# fig, ax = plt.subplots(2, 2, figsize=(8, 4))
# ax[0][0].hist(df['petal width'], color='black')
# ax[0][0].set_ylabel('Count', fontsize=12)
# ax[0][0].set_xlabel('Width', fontsize=12)
# ax[0][0].set_title('Iris Petal Width', fontsize=14, y=1.01)
# ax[0][1].hist(df['petal length'], color='black')
# ax[0][1].set_ylabel('Count', fontsize=12)
# ax[0][1].set_xlabel('Length', fontsize=12)
# ax[0][1].set_title('Iris Petal Length', fontsize=14, y=1.01)
# ax[1][0].hist(df['sepal width'], color='black')
# ax[1][0].set_ylabel('Count', fontsize=12)
# ax[1][0].set_xlabel('Width', fontsize=12)
# ax[1][0].set_title('Iris Sepal Width', fontsize=14, y=1.01)
# ax[1][1].hist(df['sepal length'], color='black')
# ax[1][1].set_ylabel('Count', fontsize=12)
# ax[1][1].set_xlabel('Length', fontsize=12)
# ax[1][1].set_title('Iris Sepal Length', fontsize=14, y=1.01)
# plt.show()

# Scatter plot
# fig, ax = plt.subplots(figsize=(6, 6))
# ax.scatter(df['petal width'], df['petal length'], color='red')
# ax.set_xlabel('Petal Width')
# ax.set_ylabel('Petal Length')
# ax.set_title('Petal Scatterplot')
# plt.show()

# Line plot
# fig, ax = plt.subplots(figsize=(6, 6))
# ax.plot(df['petal length'], color='blue')
# ax.set_xlabel('Specimen Number')
# ax.set_ylabel('Petal Length')
# ax.set_title('Petal Length Plot')
# plt.show()

# Bar chart
# fig, ax = plt.subplots(figsize=(6, 6))
# bar_width = .8
# labels = [x for x in df.columns if 'length' in x or 'width' in x]
# ver_y = [df[df['class'] == 'Iris-versicolor'][x].mean() for x in labels]
# vir_y = [df[df['class'] == 'Iris-virginica'][x].mean() for x in labels]
# set_y = [df[df['class'] == 'Iris-setosa'][x].mean() for x in labels]
# x = np.arange(len(labels))
# ax.bar(x, vir_y, bar_width, bottom=set_y, color='darkgrey')
# ax.bar(x, set_y, bar_width, bottom=ver_y, color='white')
# ax.bar(x, ver_y, bar_width, color='black')
# ax.set_xticks(x)  # set the tick positions before replacing the tick labels
# ax.set_xticklabels(labels, rotation=-70, fontsize=12)
# ax.set_title('Mean Feature Measurement By Class', y=1.01)
# ax.legend(['Virginica', 'Setosa', 'Versicolor'])
# plt.show()
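# An alternative sketch (not from the book): the same per-class means can be computed with
# groupby and drawn as grouped side-by-side bars via DataFrame.plot.bar, which avoids the
# manual bottom offsets above. This assumes df is loaded as in the Pandas section.
# feature_cols = [x for x in df.columns if 'length' in x or 'width' in x]
# class_means = df.groupby('class')[feature_cols].mean()  # one row per class, one column per feature
# fig, ax = plt.subplots(figsize=(6, 6))
# class_means.T.plot.bar(ax=ax, rot=-70)  # transpose so features sit on the x-axis, one bar per class
# ax.set_title('Mean Feature Measurement By Class', y=1.01)
# plt.show()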
# Seaborn
# sns.pairplot(df, hue='class')
# plt.show()

# Violin plots
# fig, ax = plt.subplots(2, 2, figsize=(7, 7))
# sns.set(style='white', palette='muted')
# sns.violinplot(x=df['class'], y=df['sepal length'], ax=ax[0, 0])
# sns.violinplot(x=df['class'], y=df['sepal width'], ax=ax[0, 1])
# sns.violinplot(x=df['class'], y=df['petal length'], ax=ax[1, 0])
# sns.violinplot(x=df['class'], y=df['petal width'], ax=ax[1, 1])
# fig.suptitle('Violin Plots', fontsize=16, y=1.03)
# for i in ax.flat:
#     plt.setp(i.get_xticklabels(), rotation=-90)
# fig.tight_layout()
# plt.show()

# Map
# df['class'] = df['class'].map({'Iris-setosa': 'SET', 'Iris-virginica': 'VIR', 'Iris-versicolor': 'VER'})
# print(df['class'])

# Apply
# df['wide petal'] = df['petal width'].apply(lambda v: 1 if v >= 1.3 else 0)
# print(df)
# df['petal area'] = df.apply(lambda r: r['petal length'] * r['petal width'], axis=1)
# print(df)

# Applymap
# print(df.applymap(lambda v: np.log(v) if isinstance(v, float) else v))

# Groupby
# print(df.groupby('class').mean())
# print(df.groupby('class').describe())
# print(df.groupby('petal width')['class'].unique())
# dict-based renaming in .agg() was removed in pandas 1.0; use named aggregation instead
# print(df.groupby('class')['petal width']
#       .agg(delta=lambda x: x.max() - x.min(), max=np.max, min=np.min))

# Statsmodels
# fig, ax = plt.subplots(figsize=(7, 7))
# ax.scatter(df['sepal width'][:50], df['sepal length'][:50])
# ax.set_ylabel('Sepal Length')
# ax.set_xlabel('Sepal Width')
# ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
# plt.show()
# y = df['sepal length'][:50]
# x = df['sepal width'][:50]
# X = sm.add_constant(x)
# results = sm.OLS(y, X).fit()
# print(results.summary())
#
# fig, ax = plt.subplots(figsize=(7, 7))
# ax.plot(x, results.fittedvalues, label='regression line')
# ax.scatter(x, y, label='data point', color='r')
# ax.set_ylabel('Sepal Length')
# ax.set_xlabel('Sepal Width')
# ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
# ax.legend(loc=2)
# plt.show()

# scikit-learn (random forest)
# clf = RandomForestClassifier(max_depth=5, n_estimators=10)
# X = df.iloc[:, :4]
# y = df.iloc[:, 4]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
# rf['correct'] = rf.apply(lambda r: 1 if r['predicted'] == r['actual'] else 0, axis=1)
# print(rf)
# print(rf['correct'].sum() / rf['correct'].count())

# SVM
# clf = OneVsOneClassifier(SVC(kernel='linear'))
# X = df.iloc[:, :4]
# y = np.array(df.iloc[:, 4]).astype(str)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
# rf['correct'] = rf.apply(lambda r: 1 if r['predicted'] == r['actual'] else 0, axis=1)
# print(rf)
# print(rf['correct'].sum() / rf['correct'].count())
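# Sanity check (not from the book): scikit-learn's own accuracy helpers should agree with
# the manual rf['correct'] ratio above; this assumes clf, X_test, and y_test from either
# classifier section.
# from sklearn.metrics import accuracy_score
# print(accuracy_score(y_test, clf.predict(X_test)))  # fraction of correct predictions
# print(clf.score(X_test, y_test))                    # classifiers expose the same metric via .score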
Python Machine Learning Blueprints