import requests
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection instead
from sklearn.model_selection import train_test_split

# Fetch data from the GitHub API with Requests
# r = requests.get(r"https://api.github.com/users/acombs/starred")
# print(r.json())

# Pandas
# PATH = r"/"
# r = requests.get(r"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
# with open(PATH + 'iris.data', 'w') as f:
#     f.write(r.text)
# os.chdir(PATH)
# df = pd.read_csv(PATH + 'iris.data', names=['sepal length', 'sepal width',
#                                             'petal length', 'petal width', 'class'])
# print(df.head())
# print(df['sepal length'])
# .ix was removed in pandas 1.0; use iloc/loc instead
# print(df.iloc[:3, :2])
# print(df.loc[:2, [x for x in df.columns if 'width' in x]])
# print(df['class'].unique())
# print(df[df['class'] == 'Iris-virginica'])
# print(df.count())
# print(df[df['class'] == 'Iris-virginica'].count())
# virginica = df[df['class'] == 'Iris-virginica'].reset_index(drop=True)
# print(virginica)
# print(df[(df['class'] == 'Iris-virginica') & (df['petal width'] > 2.2)])
# print(df.describe())
# print(df.describe(percentiles=[.20, .40, .80, .90, .95]))
# print(df.corr())

# Matplotlib
# Histograms
# fig, ax = plt.subplots(figsize=(6, 4))
# ax.hist(df['petal width'], color='black')
# ax.set_ylabel('Count', fontsize=12)
# ax.set_xlabel('Width', fontsize=12)
# plt.title('Iris Petal Width', fontsize=14, y=1.01)
# plt.show()
# fig, ax = plt.subplots(2, 2, figsize=(8, 4))
# ax[0][0].hist(df['petal width'], color='black')
# ax[0][0].set_ylabel('Count', fontsize=12)
# ax[0][0].set_xlabel('Width', fontsize=12)
# ax[0][0].set_title('Iris Petal Width', fontsize=14, y=1.01)
# ax[0][1].hist(df['petal length'], color='black')
# ax[0][1].set_ylabel('Count', fontsize=12)
# ax[0][1].set_xlabel('Length', fontsize=12)
# ax[0][1].set_title('Iris Petal Length', fontsize=14, y=1.01)
# ax[1][0].hist(df['sepal width'], color='black')
# ax[1][0].set_ylabel('Count', fontsize=12)
# ax[1][0].set_xlabel('Width', fontsize=12)
# ax[1][0].set_title('Iris Sepal Width', fontsize=14, y=1.01)
# ax[1][1].hist(df['sepal length'], color='black')
# ax[1][1].set_ylabel('Count', fontsize=12)
# ax[1][1].set_xlabel('Length', fontsize=12)
# ax[1][1].set_title('Iris Sepal Length', fontsize=14, y=1.01)
# plt.show()

# Scatter plot
# fig, ax = plt.subplots(figsize=(6, 6))
# ax.scatter(df['petal width'], df['petal length'], color='red')
# ax.set_xlabel('Petal Width')
# ax.set_ylabel('Petal Length')
# ax.set_title('Petal Scatterplot')
# plt.show()

# Line plot
# fig, ax = plt.subplots(figsize=(6, 6))
# ax.plot(df['petal length'], color='blue')
# ax.set_xlabel('Specimen Number')
# ax.set_ylabel('Petal Length')
# ax.set_title('Petal Length Plot')
# plt.show()

# Bar chart
# fig, ax = plt.subplots(figsize=(6, 6))
# bar_width = .8
# labels = [x for x in df.columns if 'length' in x or 'width' in x]
# ver_y = [df[df['class'] == 'Iris-versicolor'][x].mean() for x in labels]
# vir_y = [df[df['class'] == 'Iris-virginica'][x].mean() for x in labels]
# set_y = [df[df['class'] == 'Iris-setosa'][x].mean() for x in labels]
# x = np.arange(len(labels))
# ax.bar(x, vir_y, bar_width, bottom=set_y, color='darkgrey')
# ax.bar(x, set_y, bar_width, bottom=ver_y, color='white')
# ax.bar(x, ver_y, bar_width, color='black')
# ax.set_xticks(x)  # set the tick positions before replacing the tick labels
# ax.set_xticklabels(labels, rotation=-70, fontsize=12)
# ax.set_title('Mean Feature Measurement By Class', y=1.01)
# ax.legend(['Virginica', 'Setosa', 'Versicolor'])
# plt.show()
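# An alternative sketch (not from the book): the same per-class means can be computed with
# groupby and drawn as grouped side-by-side bars via DataFrame.plot.bar, which avoids the
# manual bottom offsets above. This assumes df is loaded as in the Pandas section.
# feature_cols = [x for x in df.columns if 'length' in x or 'width' in x]
# class_means = df.groupby('class')[feature_cols].mean()  # one row per class, one column per feature
# fig, ax = plt.subplots(figsize=(6, 6))
# class_means.T.plot.bar(ax=ax, rot=-70)  # transpose so features sit on the x-axis, one bar per class
# ax.set_title('Mean Feature Measurement By Class', y=1.01)
# plt.show()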
# Seaborn
# sns.pairplot(df, hue='class')
# plt.show()

# Violin plots
# fig, ax = plt.subplots(2, 2, figsize=(7, 7))
# sns.set(style='white', palette='muted')
# sns.violinplot(x=df['class'], y=df['sepal length'], ax=ax[0, 0])
# sns.violinplot(x=df['class'], y=df['sepal width'], ax=ax[0, 1])
# sns.violinplot(x=df['class'], y=df['petal length'], ax=ax[1, 0])
# sns.violinplot(x=df['class'], y=df['petal width'], ax=ax[1, 1])
# fig.suptitle('Violin Plots', fontsize=16, y=1.03)
# for i in ax.flat:
#     plt.setp(i.get_xticklabels(), rotation=-90)
# fig.tight_layout()
# plt.show()

# Map
# df['class'] = df['class'].map({'Iris-setosa': 'SET', 'Iris-virginica': 'VIR', 'Iris-versicolor': 'VER'})
# print(df['class'])

# Apply
# df['wide petal'] = df['petal width'].apply(lambda v: 1 if v >= 1.3 else 0)
# print(df)
# df['petal area'] = df.apply(lambda r: r['petal length'] * r['petal width'], axis=1)
# print(df)

# Applymap
# print(df.applymap(lambda v: np.log(v) if isinstance(v, float) else v))

# Groupby
# print(df.groupby('class').mean())
# print(df.groupby('class').describe())
# print(df.groupby('petal width')['class'].unique())
# dict-based renaming in .agg() was removed in pandas 1.0; use named aggregation instead
# print(df.groupby('class')['petal width']
#       .agg(delta=lambda x: x.max() - x.min(), max=np.max, min=np.min))

# Statsmodels
# fig, ax = plt.subplots(figsize=(7, 7))
# ax.scatter(df['sepal width'][:50], df['sepal length'][:50])
# ax.set_ylabel('Sepal Length')
# ax.set_xlabel('Sepal Width')
# ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
# plt.show()
# y = df['sepal length'][:50]
# x = df['sepal width'][:50]
# X = sm.add_constant(x)
# results = sm.OLS(y, X).fit()
# print(results.summary())
#
# fig, ax = plt.subplots(figsize=(7, 7))
# ax.plot(x, results.fittedvalues, label='regression line')
# ax.scatter(x, y, label='data point', color='r')
# ax.set_ylabel('Sepal Length')
# ax.set_xlabel('Sepal Width')
# ax.set_title('Setosa Sepal Width vs. Sepal Length', fontsize=14, y=1.02)
# ax.legend(loc=2)
# plt.show()

# scikit-learn (random forest)
# clf = RandomForestClassifier(max_depth=5, n_estimators=10)
# X = df.iloc[:, :4]
# y = df.iloc[:, 4]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
# rf['correct'] = rf.apply(lambda r: 1 if r['predicted'] == r['actual'] else 0, axis=1)
# print(rf)
# print(rf['correct'].sum() / rf['correct'].count())

# SVM
# clf = OneVsOneClassifier(SVC(kernel='linear'))
# X = df.iloc[:, :4]
# y = np.array(df.iloc[:, 4]).astype(str)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
# rf['correct'] = rf.apply(lambda r: 1 if r['predicted'] == r['actual'] else 0, axis=1)
# print(rf)
# print(rf['correct'].sum() / rf['correct'].count())
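# Sanity check (not from the book): scikit-learn's own accuracy helpers should agree with
# the manual rf['correct'] ratio above; this assumes clf, X_test, and y_test from either
# classifier section.
# from sklearn.metrics import accuracy_score
# print(accuracy_score(y_test, clf.predict(X_test)))  # fraction of correct predictions
# print(clf.score(X_test, y_test))                    # classifiers expose the same metric via .score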
Python Machine Learning Blueprints