{ "cells": [
{ "cell_type": "markdown", "id": "4d2a8b6c", "metadata": {}, "source": [ "#### Database" ] },
{ "cell_type": "code", "execution_count": 1, "id": "7be9eeff", "metadata": {}, "outputs": [], "source": [
"import os, sys\n",
"\n",
"# Location of the Django project checkout. Defaults to the original hard-coded\n",
"# path; set LEAGUES_PROJECT_PATH to run the notebook from another checkout.\n",
"PROJECT_PATH = os.environ.get('LEAGUES_PROJECT_PATH', '/home/md/Work/ligalytics/leagues_stable/')\n",
"sys.path.insert(0, PROJECT_PATH)\n",
"os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"leagues.settings\")\n",
"\n",
"from leagues import settings\n",
"# Override the default database file before django.setup() is called.\n",
"settings.DATABASES['default']['NAME'] = os.path.join(PROJECT_PATH, 'db.sqlite3')\n",
"\n",
"import django\n",
"django.setup()\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from django.db.models import F\n",
"\n",
"from scheduler.models import Game, Season\n",
"from common.functions import distanceInKmByGPS\n",
"\n",
"SEASON_NAME = 'Imported: Benchmark Season'\n",
"season = Season.objects.filter(nicename=SEASON_NAME).first()\n",
"if season is None:\n",
"    # .first() returns None when nothing matches; fail here instead of\n",
"    # silently filtering games on season=None below.\n",
"    raise LookupError('Season %r not found in the configured database' % SEASON_NAME)\n",
"\n",
"# Games of the season as dicts, annotated with home/away team attributes\n",
"# reached through the homeTeam / awayTeam foreign keys.\n",
"games = Game.objects.filter(season=season).annotate(\n",
"    home=F('homeTeam__shortname'),\n",
"    away=F('awayTeam__shortname'),\n",
"    home_lat=F('homeTeam__latitude'),\n",
"    home_lon=F('homeTeam__longitude'),\n",
"    home_attr=F('homeTeam__attractivity'),\n",
"    away_lat=F('awayTeam__latitude'),\n",
"    away_lon=F('awayTeam__longitude'),\n",
"    away_attr=F('awayTeam__attractivity'),\n",
"    home_country=F('homeTeam__country'),\n",
"    away_country=F('awayTeam__country'),\n",
").values()\n"
] },
{ "cell_type": "markdown", "id": "bc191792", "metadata": {}, "source": [ "#### Dataframe" ] },
{ "cell_type": "code", "execution_count": 2, "id": "1e404cf8", "metadata": {}, "outputs": [], "source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# create dataset\n",
"df = pd.DataFrame.from_records(games.values())\n",
"\n",
"# data cleaning\n",
"df['time'] = df['time'].replace('','0')\n",
"# Drop games with attendance 0. The .copy() makes the filtered frame independent,\n",
"# so the column assignments below do not raise SettingWithCopyWarning.\n",
"df = df[df['attendance'] != 0].copy()\n",
"\n",
"\n",
"# pivots: mean / max attendance per home team\n",
"# NOTE(review): both are computed from the label over ALL rows, i.e. before the\n",
"# train/test split made later in the notebook, so home_base and stadium_size\n",
"# leak target information into the test rows.\n",
"pivot_homeTeam_mean = df.pivot_table('attendance','homeTeam_id',aggfunc='mean')\n",
"pivot_homeTeam_max = df.pivot_table('attendance','homeTeam_id',aggfunc='max')\n",
"\n",
"# add more features\n",
"df['weekday'] = df['date'].map(lambda d: d.weekday())\n",
"df['day'] = df['date'].map(lambda d: d.day)\n",
"df['month'] = df['date'].map(lambda d: d.month)\n",
"df['year'] = df['date'].map(lambda d: d.year)\n",
"df['distance'] = df.apply(lambda r: distanceInKmByGPS(r['home_lon'],r['home_lat'],r['away_lon'],r['away_lat']), axis=1)\n",
"# weekday() counts Monday=0 ... Sunday=6, so the weekend is {5, 6};\n",
"# the previous [6, 7] matched Sundays only.\n",
"df['weekend'] = df['weekday'].isin([5, 6]).astype(int)\n",
"df['winter_season'] = df['month'].isin([1,2,3,10,11,12]).astype(int)\n",
"# home_base: mean attendance of the home team; stadium_size: its maximum attendance\n",
"df['home_base'] = df['homeTeam_id'].map(pivot_homeTeam_mean['attendance'])\n",
"df['stadium_size'] = df['homeTeam_id'].map(pivot_homeTeam_max['attendance'])\n",
"# string comparisons on the time with ':' removed and on the text before the first '-'\n",
"df['early'] = df['time'].map(lambda t: t.replace(':','') < '1800')\n",
"df['before2010'] = df['historic_season'].map(lambda s: s.split('-')[0] < '2010')\n",
"\n",
"\n",
"# one hot encoding: one new 0/1 column per category value\n",
"ohe_fields = ['home_country']\n",
"\n",
"for field in ohe_fields:\n",
"    ohe = OneHotEncoder()\n",
"    transformed = ohe.fit_transform(df[[field]])\n",
"    df[ohe.categories_[0]] = transformed.toarray()\n",
"\n",
"# sort label to last index\n",
"cols = list(df.columns)\n",
"cols.append(cols.pop(cols.index('attendance')))\n",
"df = df[cols]"
] },
{ "cell_type": "markdown", "id": "e2ea08e5", "metadata": {}, "source": [ "#### Train/Test Data - Normalization" ] },
{ "cell_type": "code", "execution_count": 3, "id": "74e12f87", "metadata": {}, "outputs": [], "source": [
"import numpy as np \n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split, cross_val_predict\n",
"from sklearn import metrics\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"\n",
"remove_columns = ['season_id', 'resultEntered', 
'reversible', 'reschedule', 'homeGoals', 'awayGoals',\n",
"                  'homeGoals2', 'awayGoals2', 'homeGoals3', 'awayGoals3', 'home', 'away', 'date', 'time',\n",
"                  'id', 'homeTeam_id', 'awayTeam_id', 'historic_season',\n",
"                  'home_country','home_lat','home_lon','away_lat','away_lon','away_country']\n",
"# Every column except the label (last column) and the excluded ones, in the frame's\n",
"# own column order. The previous set difference gave an order that changed between\n",
"# kernel restarts (string hash randomisation), so the models saw the features in a\n",
"# different order on each run. dict.fromkeys de-duplicates while keeping order.\n",
"feature_cols = [col for col in dict.fromkeys(df.columns[:-1]) if col not in remove_columns]\n",
"# alternative: hand-picked subset\n",
"# feature_cols = ['weekday','weekend','home_base','distance','winter_season']\n",
"label = 'attendance'\n",
"\n",
"\n",
"X = df[feature_cols] # Features\n",
"y = df[label] # Target variable\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=1) # 70% training and 30% test"
] },
{ "cell_type": "code", "execution_count": 4, "id": "53545faa", "metadata": {}, "outputs": [], "source": [
"import numpy as np \n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt \n",
"import seaborn as sns \n",
"from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict\n",
"from sklearn import metrics\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor"
] },
{ "cell_type": "code", "execution_count": 5, "id": "45e08026", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mutiple Linear Regression Accuracy: 0.3819963751047786\n", "Cross-Predicted(KFold) Mutiple Linear Regression Accuracy: 0.33440778552391626\n" ] } ], "source": [
"lin_reg = LinearRegression()\n",
"lin_reg.fit(X_train,y_train)\n",
"\n",
"# Predict attendance for the held-out test set\n",
"y_pred_lr = lin_reg.predict(X_test)\n",
"\n",
"# Multiple linear regression: R^2 on the test set (printed as \"Accuracy\")\n",
"accuracy_lf = metrics.r2_score(y_test, y_pred_lr)\n",
"print('Mutiple Linear Regression Accuracy: ', accuracy_lf)\n",
"\n",
"# Out-of-fold attendance predictions from 10-fold cross validation on all rows\n",
"y_pred_kf_lr = cross_val_predict(lin_reg, X, y, cv=10 )\n",
"\n",
"# Multiple linear regression: R^2 of the out-of-fold predictions\n",
"# (overwrites the test-set value stored in accuracy_lf above)\n",
"accuracy_lf = metrics.r2_score(y, y_pred_kf_lr)\n",
"print('Cross-Predicted(KFold) Mutiple Linear Regression Accuracy: ', accuracy_lf)"
] },
{ "cell_type": "code", "execution_count": 6, "id": "0de49b8a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cross-Predicted(KFold) Polynominal Regression Accuracy: -261.39170432313074\n" ] } ], "source": [
"# Degree-2 polynomial expansion of all features, then ordinary least squares\n",
"poly_reg = PolynomialFeatures(degree = 2)\n",
"X_poly = poly_reg.fit_transform(X)\n",
"lin_reg_pl = LinearRegression()\n",
"\n",
"# Out-of-fold attendance predictions from 10-fold cross validation\n",
"y_pred_pl = cross_val_predict(lin_reg_pl, X_poly, y, cv=10 )\n",
"# Polynomial regression: R^2 of the out-of-fold predictions\n",
"# NOTE(review): the recorded run printed a strongly negative R^2 (about -261);\n",
"# presumably the unscaled degree-2 expansion is ill-conditioned or overfits - confirm.\n",
"accuracy_pl = metrics.r2_score(y, y_pred_pl)\n",
"print('Cross-Predicted(KFold) Polynominal Regression Accuracy: ', accuracy_pl)"
] },
{ "cell_type": "code", "execution_count": 7, "id": "470425b6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Decision Tree Regression Accuracy: 0.23642868476932866\n", "Cross-Predicted(KFold) Decision Tree Regression Accuracy: 0.4183541357709245\n" ] } ], "source": [
"dt_regressor = DecisionTreeRegressor(random_state = 0)\n",
"dt_regressor.fit(X_train,y_train)\n",
"\n",
"# Predict attendance for the held-out test set\n",
"y_pred_dt = dt_regressor.predict(X_test)\n",
"\n",
"# Decision tree: R^2 on the test set (regressor.score returns R^2)\n",
"print('Decision Tree Regression Accuracy: ', dt_regressor.score(X_test,y_test))\n",
"\n",
"# Out-of-fold attendance predictions from 10-fold cross validation\n",
"# (replaces the test-set predictions stored in y_pred_dt above)\n",
"y_pred_dt = cross_val_predict(dt_regressor, X, y, cv=10 )\n",
"# Decision tree: R^2 of the out-of-fold predictions\n",
"accuracy_dt = metrics.r2_score(y, y_pred_dt)\n",
"print('Cross-Predicted(KFold) Decision Tree Regression Accuracy: ', accuracy_dt)"
] },
{ "cell_type": 
"code", "execution_count": null, "id": "6629826f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.7.13 ('leagues')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" }, "vscode": { "interpreter": { "hash": "a07b7f3079ca8c056705d3c757c4f3f92f9509f33eeab9ad5420dacec37bc01a" } } }, "nbformat": 4, "nbformat_minor": 5 }