比赛结果预测_决策树_随机森林(通用 数据挖掘入门与实践-实验5)

# Data import & cleaning: load the season's match results.
import pandas as pd

data_filename = "datasets.csv"
# Read the CSV once, parsing "Date" as datetime up front (the original
# read the file twice — first without parse_dates, then again with it;
# a single parsing read is equivalent and avoids the wasted I/O).
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
# dataset.loc[:5]  # preview the first rows

# Method 1: previous-game features — did each team win its last game?

# Home-win label extraction.
from collections import defaultdict

# True when the home team outscored the visitors.
dataset["HomeWin"] = dataset["VisitorPTS"] < dataset["HomePTS"]
# Class labels reused by every cross-validation run below.
y_true = dataset["HomeWin"].values

# won_last[team] -> 1 if the team won its most recent game, else 0;
# defaultdict(int) yields 0 the first time a team appears.
won_last = defaultdict(int)

# Fill HomeLastWin / VisitorLastWin in chronological order so each row
# only sees results from games played before it.
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0
for index, row in dataset.sort_values("Date").iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Write the scalars directly with .at — faster than mutating the
    # row copy and assigning it back with dataset.loc[index] = row,
    # and it cannot upcast the other columns' dtypes.
    dataset.at[index, "HomeLastWin"] = won_last[home_team]
    dataset.at[index, "VisitorLastWin"] = won_last[visitor_team]
    # Record this game's outcome for each side's next appearance.
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])

# Cross-validate a decision tree on the previous-win features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Method 2: previous-game results plus the teams' relative ranking.

# Last season's standings: each team has a rank "Rk" (1 = best).
standings = pd.read_csv("standings.csv")
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]

    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    # BUG FIX: the original assigned to a misspelled key
    # ("HomeTeamRankingHigher") on the iterrows() row COPY and never
    # wrote it back, so the HomeTeamRanksHigher column stayed all
    # zeros. Write the scalar straight into the DataFrame instead.
    dataset.at[index, "HomeTeamRanksHigher"] = int(home_rank > visitor_rank)

# Cross-validate a decision tree on previous wins + relative ranking.
feature_columns = ["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]
X_homehigher = dataset[feature_columns].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Method 3: ignore home/away — remember which of the two teams won
# their most recent head-to-head meeting.

# last_match_winner[(teamA, teamB)] -> name of the last winner of that
# pairing; defaultdict(int) yields 0 until the pair has met once, which
# never equals a team name, so the feature defaults to 0.
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Sort the pair so the key is identical whichever side is at home.
    teams = tuple(sorted([home_team, visitor_team]))
    # Write the scalar with .at instead of mutating the row copy and
    # assigning the whole row back — faster and dtype-safe.
    dataset.at[index, "HomeTeamWonLast"] = int(last_match_winner[teams] == home_team)
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

# Cross-validate a decision tree on the head-to-head feature.
clf = DecisionTreeClassifier(random_state=14)
# BUG FIX: the original reused the method-1 columns here
# (["HomeLastWin", "VisitorLastWin"]), so the HomeTeamWonLast feature
# computed just above was never evaluated. Use the ranking and
# head-to-head features as the experiment intends.
X_lastwinner = dataset[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values

scores = cross_val_score(clf, X_lastwinner, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Method 4: turn team names into numeric features.

from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()

# Fit on every name occurring in EITHER column so transform() cannot
# hit an unseen label (the original fit only on "Home Team"). Over a
# full season both columns contain the same teams, so when that holds
# the learned mapping is identical to the original's.
encoding.fit(np.concatenate([dataset["Home Team"].values,
                             dataset["Visitor Team"].values]))

# Integer id for each side of every game.
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)

# Two columns per game: [home_id, visitor_id].
X_teams = np.vstack([home_teams, visitor_teams]).T

# print(X_teams)

# One-hot encode the two id columns so the tree does not treat the
# arbitrary integer ids as ordered values.
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
# .toarray() returns a plain ndarray; the original's .todense() yields
# np.matrix, which newer scikit-learn/NumPy versions reject.
X_teams_expanded = onehot.fit_transform(X_teams).toarray()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams_expanded, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))


# Random forest experiments.

# First: team ids alone.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Second: team ids combined with the previous-win features.
X_all = np.hstack([X_previouswins, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

# Search the random-forest hyper-parameter space with GridSearchCV.
from sklearn.model_selection import GridSearchCV

parameter_space = {
    # 'sqrt' replaces the original 'auto', which was deprecated and
    # then removed in scikit-learn 1.3; for classifiers the two
    # settings were identical.
    "max_features": [2, 10, 'sqrt'],
    "n_estimators": [100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
# print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

# Random forest configured with the best parameters found above.
clf = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    max_depth=None,
    max_features=2,
    max_leaf_nodes=None,
    min_samples_leaf=6,
    min_samples_split=2,
    n_estimators=100,
    n_jobs=1,
    oob_score=False,
    random_state=14,
    verbose=0,
)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值