239 lines
7.2 KiB
Plaintext
239 lines
7.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4d2a8b6c",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Database"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "7be9eeff",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"PROJECT_PATH = '/home/md/Work/ligalytics/leagues_stable/'\n",
|
|
"import os, sys\n",
|
|
"sys.path.insert(0, PROJECT_PATH)\n",
|
|
"os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"leagues.settings\")\n",
|
|
"\n",
|
|
"from leagues import settings\n",
|
|
"settings.DATABASES['default']['NAME'] = PROJECT_PATH+'/db.sqlite3'\n",
|
|
"\n",
|
|
"import django\n",
|
|
"django.setup()\n",
|
|
"\n",
|
|
"from scheduler.models import *\n",
|
|
"from common.functions import distanceInKmByGPS\n",
|
|
"season = Season.objects.filter(nicename=\"Imported: Benchmark Season\").first()\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from django.db.models import Count, F, Value\n",
|
|
"games = Game.objects.filter(season=season)\n",
|
|
"df = pd.DataFrame.from_records(games.values())\n",
|
|
"games = Game.objects.filter(season=season).annotate(\n",
|
|
" home=F('homeTeam__shortname'),\n",
|
|
" away=F('awayTeam__shortname'),\n",
|
|
" home_lat=F('homeTeam__latitude'),\n",
|
|
" home_lon=F('homeTeam__longitude'),\n",
|
|
" home_attr=F('homeTeam__attractivity'),\n",
|
|
" away_lat=F('awayTeam__latitude'),\n",
|
|
" away_lon=F('awayTeam__longitude'),\n",
|
|
" away_attr=F('awayTeam__attractivity')\n",
|
|
").values()\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bc191792",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Dataframe"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "1e404cf8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
"\n",
|
|
"# create dataset\n",
|
|
"df = pd.DataFrame.from_records(games.values())\n",
|
|
"\n",
|
|
"# pivots\n",
|
|
"pivot_homeTeam_mean = df.pivot_table('attendance','homeTeam_id',aggfunc='mean')\n",
|
|
"pivot_homeTeam_max = df.pivot_table('attendance','homeTeam_id',aggfunc='max')\n",
|
|
"\n",
|
|
"# add more features\n",
|
|
"df['weekday'] = df.apply(lambda r: r['date'].weekday(), axis=1)\n",
|
|
"df['day'] = df.apply(lambda r: r['date'].day, axis=1)\n",
|
|
"df['month'] = df.apply(lambda r: r['date'].month, axis=1)\n",
|
|
"df['year'] = df.apply(lambda r: r['date'].year, axis=1)\n",
|
|
"df['distance'] = df.apply(lambda r: distanceInKmByGPS(r['home_lon'],r['home_lat'],r['away_lon'],r['away_lat']), axis=1)\n",
|
|
"df['weekend'] = df.apply(lambda r: int(r['weekday'] in [6,7]), axis=1)\n",
|
|
"df['winter_season'] = df.apply(lambda r: int(r['month'] in [1,2,3,10,11,12]), axis=1)\n",
|
|
"df['home_base'] = df.apply(lambda r: pivot_homeTeam_mean.loc[r['homeTeam_id'],'attendance'], axis=1)\n",
|
|
"df['stadium_size'] = df.apply(lambda r: pivot_homeTeam_max.loc[r['homeTeam_id'],'attendance'], axis=1)\n",
|
|
"\n",
|
|
"# one hot encoding\n",
|
|
"ohe_fields = ['time', 'historic_season']\n",
|
|
"\n",
|
|
"for field in ohe_fields:\n",
|
|
" ohe = OneHotEncoder()\n",
|
|
" transformed = ohe.fit_transform(df[[field]])\n",
|
|
" df[ohe.categories_[0]] = transformed.toarray()\n",
|
|
"\n",
|
|
"# sort label to last index\n",
|
|
"cols = list(df.columns)\n",
|
|
"cols.append(cols.pop(cols.index('attendance')))\n",
|
|
"df = df[cols]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "e69d24dc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Importing Libraries\n",
|
|
"import numpy as np # linear algebra\n",
|
|
"import pandas as pd # data processing\n",
|
|
"import matplotlib.pyplot as plt # plotting library\n",
|
|
"from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict\n",
|
|
"from sklearn import metrics\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"from sklearn.preprocessing import PolynomialFeatures\n",
|
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
|
"from sklearn.ensemble import RandomForestRegressor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e2ea08e5",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Train/Test Data - Feature Selection and Split"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "74e12f87",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"\n",
|
|
"remove_columns = ['season_id', 'resultEntered', 'reversible', 'reschedule', 'homeGoals', 'awayGoals',\n",
|
|
" 'homeGoals2', 'awayGoals2', 'homeGoals3', 'awayGoals3', 'home', 'away', 'date', 'time', 'historic_season', 'id', 'homeTeam_id', 'awayTeam_id']\n",
|
|
"feature_cols = list(set(df.columns[:-1]) - set(remove_columns))\n",
|
|
"# feature_cols = ['weekday','weekend','home_base','distance','winter_season']\n",
|
|
"label = 'attendance'\n",
|
|
"\n",
|
|
"\n",
|
|
"X = df[feature_cols] # Features\n",
|
|
"y = df[label] # Target variable\n",
|
|
"\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
|
" X, y, test_size=0.3, random_state=1) # 70% training and 30% test"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "94ade4b4",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Decision Tree"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "4c9bdd0d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"FITTING...done\n",
|
|
"VISUALIZE\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import pydotplus\n",
|
|
"from six import StringIO\n",
|
|
"from sklearn.tree import export_graphviz\n",
|
|
"from sklearn.tree import DecisionTreeRegressor \n",
|
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
"\n",
|
|
"# Create Decision Tree classifer object\n",
|
|
"regr = DecisionTreeRegressor(max_depth=5, random_state=1234)\n",
|
|
"\n",
|
|
"# Train Decision Tree Classifer\n",
|
|
"print(\"FITTING...\", end=\"\")\n",
|
|
"regr = regr.fit(X_train, y_train)\n",
|
|
"print(\"done\")\n",
|
|
"\n",
|
|
"# Predict the response for test dataset\n",
|
|
"y_pred = regr.predict(X_test)\n",
|
|
"\n",
|
|
"print(\"VISUALIZE\")\n",
|
|
"dot_data = StringIO()\n",
|
|
"export_graphviz(regr, out_file=dot_data,\n",
|
|
" filled=True, rounded=True,\n",
|
|
" special_characters=True, feature_names=feature_cols)\n",
|
|
"graph = pydotplus.graph_from_dot_data(dot_data.getvalue())\n",
|
|
"graph.write_png('attendance.png')\n",
|
|
"# Image(graph.create_png())"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3.7.13 ('leagues')",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.13"
|
|
},
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "a07b7f3079ca8c056705d3c757c4f3f92f9509f33eeab9ad5420dacec37bc01a"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|