# Data import
import pandas as pd
data_filename = "datasets.csv"
dataset = pd.read_csv(data_filename)
# dataset.loc[:5]  # preview the first few rows
# Data cleaning: re-read the file with the Date column parsed as datetime
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
# dataset.loc[:5]
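# NOTE (assumption): the code below expects columns named "Date", "Visitor Team",
# "VisitorPTS", "Home Team" and "HomePTS". If the raw CSV export still carries
# different headers, rename them first -- a minimal sketch (adjust the names to
# the actual number and order of columns in your file):
# dataset.columns = ["Date", "Score Type", "Visitor Team", "VisitorPTS",
#                    "Home Team", "HomePTS", "OT?", "Notes"]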
# Method 1: use each team's previous-game result as features for prediction
# Did the home team win this game?
from collections import defaultdict
dataset["HomeWin"] = dataset["VisitorPTS"] < dataset["HomePTS"]
y_true = dataset["HomeWin"].values
# Track each team's most recent result
won_last = defaultdict(int)
# Fill in HomeLastWin and VisitorLastWin
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0
for index, row in dataset.sort_values("Date").iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Record whether each side won its previous game, then write the row back
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    dataset.loc[index] = row
    # Update each team's most recent result for the next iteration
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])
# Decision tree with cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
clf = DecisionTreeClassifier(random_state=14)
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
scores = cross_val_score(clf, X_previouswins, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
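# A quick look at what the tree actually learns from these two binary features
# (sketch: export_text requires scikit-learn >= 0.21)
from sklearn.tree import export_text
clf.fit(X_previouswins, y_true)
# print(export_text(clf, feature_names=["HomeLastWin", "VisitorLastWin"]))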
# Method 2: previous-game results plus relative team ranking
# Extract the relative ranking of the two sides
standings = pd.read_csv("standings.csv")
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    # Assuming a lower Rk value means a better ranking (Rk = 1 is the top team),
    # the home team ranks higher when its Rk number is smaller
    dataset.loc[index, "HomeTeamRanksHigher"] = int(home_rank < visitor_rank)
# Decision tree with cross-validation
X_homehigher = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# Method 3: ignore home/away and record which team won the last head-to-head meeting
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # Sort the pair so the key is the same regardless of which side plays at home
    teams = tuple(sorted([home_team, visitor_team]))
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == home_team else 0
    dataset.loc[index] = row
    # Record the winner of this game for the pair's next meeting
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner
# Decision tree with cross-validation
clf = DecisionTreeClassifier(random_state=14)
# Include the new head-to-head feature alongside the previous-game features
X_lastwinner = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamWonLast"]].values
scores = cross_val_score(clf, X_lastwinner, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
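# A quick check of which features the fitted tree actually relies on
# (feature_importances_ is available on any fitted DecisionTreeClassifier)
clf.fit(X_lastwinner, y_true)
# print(dict(zip(["HomeLastWin", "VisitorLastWin", "HomeTeamWonLast"], clf.feature_importances_)))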
# Method 4: encode team names as numbers
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
# Learn the mapping from team name to integer
encoding.fit(dataset["Home Team"].values)
# Apply the same mapping to both columns so each team gets one consistent id
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
# Stack into a two-column (home id, visitor id) feature matrix
X_teams = np.vstack([home_teams, visitor_teams]).T
# print(X_teams)
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
# toarray() (rather than todense()) gives a plain ndarray that scikit-learn accepts
X_teams_expanded = onehot.fit_transform(X_teams).toarray()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams_expanded, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
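# Decision trees in scikit-learn also accept scipy sparse matrices directly,
# so the one-hot matrix can be left sparse to save memory (sketch):
# X_teams_sparse = onehot.fit_transform(X_teams)
# scores = cross_val_score(clf, X_teams_sparse, y_true, scoring="accuracy")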
# Random forest
# Team names only
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# Team names plus each side's previous-game result (X_previouswins)
X_all = np.hstack([X_previouswins, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# GridSearchCV: search for the best parameters
from sklearn.model_selection import GridSearchCV
parameter_space = {
    # X_all has only 4 columns here, so max_features must stay within that range
    # ("auto" was removed in recent scikit-learn versions; "sqrt" is the usual default)
    "max_features": [2, 4, "sqrt"],
    "n_estimators": [100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
# print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
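# After the search, grid.best_params_ holds the winning parameter combination
# and grid.best_estimator_ is the model refitted with those parameters
# print(grid.best_params_)
# print(grid.best_estimator_)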
# Random forest with the highest-accuracy parameters found above
clf = RandomForestClassifier(bootstrap=True, criterion="entropy", max_depth=None,
                             max_features=2, max_leaf_nodes=None, min_samples_leaf=6,
                             min_samples_split=2, n_estimators=100, n_jobs=1,
                             oob_score=False, random_state=14, verbose=0)
scores = cross_val_score(clf, X_all, y_true, scoring="accuracy")
# print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
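# To predict a single upcoming game, fit the model on all the data and build a
# feature row in the same column order as X_all: [HomeLastWin, VisitorLastWin,
# home team id, visitor team id]. The team names below are hypothetical
# placeholders -- replace them with names that actually appear in the dataset.
# clf.fit(X_all, y_true)
# home, visitor = "Chicago Bulls", "New York Knicks"  # hypothetical example teams
# new_game = [[1, 0,
#              encoding.transform([home])[0],
#              encoding.transform([visitor])[0]]]
# print("Home win predicted" if clf.predict(new_game)[0] else "Visitor win predicted")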