research/machine_learning/scripts/ml/decisiontree.ipynb
2024-01-31 21:41:29 +01:00

239 lines
7.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "4d2a8b6c",
"metadata": {},
"source": [
"#### Database"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7be9eeff",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"# Root of the Django project checkout. Set LEAGUES_PROJECT_PATH to run the\n",
"# notebook on another machine without editing this cell.\n",
"PROJECT_PATH = os.environ.get('LEAGUES_PROJECT_PATH', '/home/md/Work/ligalytics/leagues_stable/')\n",
"sys.path.insert(0, PROJECT_PATH)\n",
"os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"leagues.settings\")\n",
"\n",
"# Override the default database NAME with the SQLite file under PROJECT_PATH\n",
"# before Django is initialised.\n",
"from leagues import settings\n",
"settings.DATABASES['default']['NAME'] = os.path.join(PROJECT_PATH, 'db.sqlite3')\n",
"\n",
"import django\n",
"django.setup()\n",
"\n",
"# Model and helper imports must come after django.setup().\n",
"import numpy as np\n",
"import pandas as pd\n",
"from django.db.models import Count, F, Value\n",
"\n",
"from common.functions import distanceInKmByGPS\n",
"from scheduler.models import Game, Season\n",
"\n",
"SEASON_NICENAME = \"Imported: Benchmark Season\"\n",
"season = Season.objects.filter(nicename=SEASON_NICENAME).first()\n",
"if season is None:\n",
"    # .first() returns None when nothing matches; fail here instead of\n",
"    # silently filtering games on a NULL season below.\n",
"    raise LookupError(f\"Season {SEASON_NICENAME!r} not found in {settings.DATABASES['default']['NAME']}\")\n",
"\n",
"# One record per game of the season, with both teams' short name, GPS\n",
"# coordinates and attractivity pulled in through the team foreign keys.\n",
"games = Game.objects.filter(season=season).annotate(\n",
"    home=F('homeTeam__shortname'),\n",
"    away=F('awayTeam__shortname'),\n",
"    home_lat=F('homeTeam__latitude'),\n",
"    home_lon=F('homeTeam__longitude'),\n",
"    home_attr=F('homeTeam__attractivity'),\n",
"    away_lat=F('awayTeam__latitude'),\n",
"    away_lon=F('awayTeam__longitude'),\n",
"    away_attr=F('awayTeam__attractivity')\n",
").values()\n"
]
},
{
"cell_type": "markdown",
"id": "bc191792",
"metadata": {},
"source": [
"#### Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1e404cf8",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# create dataset: one row per record of the `games` queryset\n",
"df = pd.DataFrame.from_records(games.values())\n",
"\n",
"# pivots: mean / max attendance per home team, indexed by homeTeam_id\n",
"# NOTE(review): both are computed from the label over ALL rows, i.e. before the\n",
"# train/test split, so 'home_base' and 'stadium_size' carry target information\n",
"# from rows that later land in the test set - confirm this is intended.\n",
"pivot_homeTeam_mean = df.pivot_table('attendance','homeTeam_id',aggfunc='mean')\n",
"pivot_homeTeam_max = df.pivot_table('attendance','homeTeam_id',aggfunc='max')\n",
"\n",
"# add more features\n",
"df['weekday'] = df['date'].map(lambda d: d.weekday())  # 0 = Monday ... 6 = Sunday\n",
"df['day'] = df['date'].map(lambda d: d.day)\n",
"df['month'] = df['date'].map(lambda d: d.month)\n",
"df['year'] = df['date'].map(lambda d: d.year)\n",
"# distance between the two teams' coordinates (presumably km, per the helper's name)\n",
"df['distance'] = df.apply(lambda r: distanceInKmByGPS(r['home_lon'],r['home_lat'],r['away_lon'],r['away_lat']), axis=1)\n",
"# weekday() never returns 7: Saturday and Sunday are 5 and 6\n",
"df['weekend'] = df['weekday'].isin([5, 6]).astype(int)\n",
"df['winter_season'] = df['month'].isin([1, 2, 3, 10, 11, 12]).astype(int)\n",
"# per-home-team attendance statistics looked up from the pivots above\n",
"df['home_base'] = df['homeTeam_id'].map(pivot_homeTeam_mean['attendance'])\n",
"df['stadium_size'] = df['homeTeam_id'].map(pivot_homeTeam_max['attendance'])\n",
"\n",
"# one hot encoding: adds one 0/1 column per distinct value, named after that value\n",
"ohe_fields = ['time', 'historic_season']\n",
"\n",
"for field in ohe_fields:\n",
"    ohe = OneHotEncoder()\n",
"    transformed = ohe.fit_transform(df[[field]])\n",
"    df[ohe.categories_[0]] = transformed.toarray()\n",
"\n",
"# sort label to last index\n",
"cols = [col for col in df.columns if col != 'attendance'] + ['attendance']\n",
"df = df[cols]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e69d24dc",
"metadata": {},
"outputs": [],
"source": [
"#Importing Libraries\n",
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing\n",
"import matplotlib.pyplot as plt # plotting library\n",
"from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict\n",
"from sklearn import metrics\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "markdown",
"id": "e2ea08e5",
"metadata": {},
"source": [
"#### Train/Test Data - Feature Selection and Split"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "74e12f87",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"label = 'attendance'\n",
"\n",
"# Columns excluded from the feature set. 'date', 'time' and 'historic_season'\n",
"# are represented by the derived / one-hot columns built in the Dataframe cell.\n",
"remove_columns = ['season_id', 'resultEntered', 'reversible', 'reschedule', 'homeGoals', 'awayGoals',\n",
"                  'homeGoals2', 'awayGoals2', 'homeGoals3', 'awayGoals3', 'home', 'away', 'date', 'time',\n",
"                  'historic_season', 'id', 'homeTeam_id', 'awayTeam_id']\n",
"\n",
"# Keep the features in DataFrame column order. Building the list from a set\n",
"# difference made the order depend on per-process string hashing, so the\n",
"# column order of X changed between kernel restarts.\n",
"excluded = set(remove_columns) | {label}\n",
"feature_cols = [col for col in dict.fromkeys(df.columns) if col not in excluded]\n",
"# feature_cols = ['weekday','weekend','home_base','distance','winter_season']\n",
"\n",
"X = df[feature_cols]  # Features\n",
"y = df[label]  # Target variable\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=1)  # 70% training and 30% test"
]
},
{
"cell_type": "markdown",
"id": "94ade4b4",
"metadata": {},
"source": [
"#### Decision Tree"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4c9bdd0d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"FITTING...done\n",
"VISUALIZE\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from io import StringIO\n",
"\n",
"import pydotplus\n",
"from sklearn.tree import export_graphviz\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Create the decision tree regressor (depth capped at 5, fixed seed)\n",
"regr = DecisionTreeRegressor(max_depth=5, random_state=1234)\n",
"\n",
"# Train the regressor on the training split\n",
"print(\"FITTING...\", end=\"\")\n",
"regr = regr.fit(X_train, y_train)\n",
"print(\"done\")\n",
"\n",
"# Predict the response for the test dataset\n",
"y_pred = regr.predict(X_test)\n",
"\n",
"# Export the fitted tree as Graphviz DOT text and render it to a PNG file\n",
"print(\"VISUALIZE\")\n",
"dot_data = StringIO()\n",
"export_graphviz(regr, out_file=dot_data,\n",
"                filled=True, rounded=True,\n",
"                special_characters=True, feature_names=feature_cols)\n",
"graph = pydotplus.graph_from_dot_data(dot_data.getvalue())\n",
"graph.write_png('attendance.png')\n",
"# To show the tree inline instead of writing a file: Image(graph.create_png())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.13 ('leagues')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"vscode": {
"interpreter": {
"hash": "a07b7f3079ca8c056705d3c757c4f3f92f9509f33eeab9ad5420dacec37bc01a"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}