1.Python机器学习kaggle案例
NumPy:Python科学计算库;pandas:Python数据分析处理库;scikit-learn:Python机器学习库。
2.泰坦尼克号数据介绍
乘客编号、是否幸存、船舱等级(Pclass)、姓名、性别(Sex)、年龄(Age)、同行兄弟姐妹及配偶个数(SibSp)、同行父母及子女个数(Parch)、船票、船票价格(Fare)、上船地点(Embarked)。
3.数据预处理
import pandas  # IPython notebook

# Load the Titanic training data into a DataFrame.
titanic = pandas.read_csv("titanic_train.csv")
# titanic.head(3)  # prints the first 3 rows
# Summary statistics: count, mean, std, min, 25%, 50%, 75%, max.
print(titanic.describe())

# Fill the missing values in the Age column with the column's median.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print(titanic.describe())

# Encode Sex numerically: male -> 0, female -> 1.
print(titanic["Sex"].unique())
# Replace all the occurrences of male with the number 0.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

# Encode Embarked numerically: S -> 0, C -> 1, Q -> 2.
print(titanic["Embarked"].unique())
# Missing values are filled with "S", noted by the author as the most frequent value.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
4.回归模型
# Import the linear regression class
from sklearn.linear_model import LinearRegression  # linear regression
# Sklearn also has a helper that makes it easy to do cross validation
# (cross-validate on the training set and average the results).
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# Its replacement, sklearn.model_selection.KFold, has a different constructor
# signature, so switching the import also requires updating every KFold call.
from sklearn.cross_validation import KFold
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]