research/machine_learning/scripts/ml/simple_models.ipynb
2024-01-31 21:41:29 +01:00

296 lines
9.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "4d2a8b6c",
"metadata": {},
"source": [
"#### Database"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7be9eeff",
"metadata": {},
"outputs": [],
"source": [
"# Bootstrap Django outside of manage.py so the ORM can be used from this notebook.\n",
"# The checkout location can be overridden with the LEAGUES_PROJECT_PATH environment\n",
"# variable; the default is the original developer path.\n",
"import os, sys\n",
"PROJECT_PATH = os.environ.get('LEAGUES_PROJECT_PATH', '/home/md/Work/ligalytics/leagues_stable/')\n",
"sys.path.insert(0, PROJECT_PATH)\n",
"os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"leagues.settings\")\n",
"\n",
"# Point the default database at the SQLite file inside the checkout; this is set\n",
"# before django.setup() is called.\n",
"from leagues import settings\n",
"settings.DATABASES['default']['NAME'] = os.path.join(PROJECT_PATH, 'db.sqlite3')\n",
"\n",
"import django\n",
"django.setup()\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from django.db.models import F\n",
"\n",
"from common.functions import distanceInKmByGPS\n",
"from scheduler.models import Game, Season\n",
"\n",
"season = Season.objects.filter(nicename=\"Imported: Benchmark Season\").first()\n",
"if season is None:\n",
"    # .first() returns None when nothing matches; fail here with a clear message\n",
"    # instead of continuing with a query that cannot return the benchmark games.\n",
"    raise LookupError('Season \"Imported: Benchmark Season\" not found in the database')\n",
"\n",
"# One dict per game of the season, with team-level columns added through annotations.\n",
"# The DataFrame itself is built from `games` in the next cell.\n",
"games = Game.objects.filter(season=season).annotate(\n",
"    home=F('homeTeam__shortname'),\n",
"    away=F('awayTeam__shortname'),\n",
"    home_lat=F('homeTeam__latitude'),\n",
"    home_lon=F('homeTeam__longitude'),\n",
"    home_attr=F('homeTeam__attractivity'),\n",
"    away_lat=F('awayTeam__latitude'),\n",
"    away_lon=F('awayTeam__longitude'),\n",
"    away_attr=F('awayTeam__attractivity'),\n",
"    home_country=F('homeTeam__country'),\n",
"    away_country=F('awayTeam__country'),\n",
").values()\n"
]
},
{
"cell_type": "markdown",
"id": "bc191792",
"metadata": {},
"source": [
"#### Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1e404cf8",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# create dataset: one row per game, built from the annotated `games` queryset\n",
"df = pd.DataFrame.from_records(games.values())\n",
"\n",
"# data cleaning\n",
"df['time'] = df['time'].replace('', '0')  # empty kick-off time -> placeholder '0'\n",
"# drop games with attendance 0; .copy() so the column assignments below write to an\n",
"# independent frame instead of a slice of the unfiltered one\n",
"df = df[df['attendance'] != 0].copy()\n",
"\n",
"# pivots: mean / max attendance per home team\n",
"# NOTE(review): both are computed over all rows, including rows that later land in the\n",
"# test split / validation folds, so home_base and stadium_size carry target\n",
"# information into the evaluation.\n",
"pivot_homeTeam_mean = df.pivot_table('attendance', 'homeTeam_id', aggfunc='mean')\n",
"pivot_homeTeam_max = df.pivot_table('attendance', 'homeTeam_id', aggfunc='max')\n",
"\n",
"# add more features\n",
"df['weekday'] = df['date'].map(lambda d: d.weekday())  # Monday=0 ... Sunday=6\n",
"df['day'] = df['date'].map(lambda d: d.day)\n",
"df['month'] = df['date'].map(lambda d: d.month)\n",
"df['year'] = df['date'].map(lambda d: d.year)\n",
"df['distance'] = df.apply(\n",
"    lambda r: distanceInKmByGPS(r['home_lon'], r['home_lat'], r['away_lon'], r['away_lat']),\n",
"    axis=1,\n",
")\n",
"# weekday() counts from Monday=0, so Saturday/Sunday are 5 and 6 (7 never occurs)\n",
"df['weekend'] = df['weekday'].isin([5, 6]).astype(int)\n",
"df['winter_season'] = df['month'].isin([1, 2, 3, 10, 11, 12]).astype(int)\n",
"df['home_base'] = df['homeTeam_id'].map(pivot_homeTeam_mean['attendance'])\n",
"df['stadium_size'] = df['homeTeam_id'].map(pivot_homeTeam_max['attendance'])\n",
"# string comparison on HHMM; zero-pad so an unpadded time such as 9:30 sorts before 1800\n",
"df['early'] = df['time'].map(lambda t: t.replace(':', '').zfill(4) < '1800')\n",
"# first part of historic_season (text before the first '-') compared as a string\n",
"df['before2010'] = df['historic_season'].map(lambda s: s.split('-')[0] < '2010')\n",
"\n",
"\n",
"# one hot encoding: one 0/1 column per category value, named after the value\n",
"ohe_fields = ['home_country']\n",
"\n",
"for field in ohe_fields:\n",
"    ohe = OneHotEncoder()\n",
"    transformed = ohe.fit_transform(df[[field]])\n",
"    df[ohe.categories_[0]] = transformed.toarray()\n",
"\n",
"# sort label to last index\n",
"cols = [c for c in df.columns if c != 'attendance'] + ['attendance']\n",
"df = df[cols]"
]
},
{
"cell_type": "markdown",
"id": "e2ea08e5",
"metadata": {},
"source": [
"#### Train/Test Data - Feature Selection and Split"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "74e12f87",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split, cross_val_predict\n",
"from sklearn import metrics\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"\n",
"label = 'attendance'\n",
"\n",
"# Columns that are not used as model inputs.\n",
"remove_columns = ['season_id', 'resultEntered', 'reversible', 'reschedule', 'homeGoals', 'awayGoals',\n",
"                  'homeGoals2', 'awayGoals2', 'homeGoals3', 'awayGoals3', 'home', 'away', 'date', 'time',\n",
"                  'id', 'homeTeam_id', 'awayTeam_id', 'historic_season',\n",
"                  'home_country','home_lat','home_lon','away_lat','away_lon','away_country']\n",
"\n",
"# Keep the DataFrame's own column order. Building the list from a set difference gives\n",
"# an order that can change between kernel restarts (string hashing is randomised),\n",
"# which makes the feature matrix layout, and results that depend on it, non-reproducible.\n",
"excluded = set(remove_columns) | {label}\n",
"feature_cols = [c for c in df.columns if c not in excluded]\n",
"# feature_cols = ['weekday','weekend','home_base','distance','winter_season']\n",
"\n",
"\n",
"X = df[feature_cols]  # Features\n",
"y = df[label]  # Target variable\n",
"\n",
"# 70% training and 30% test; fixed random_state so the split is the same on every run\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "53545faa",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt \n",
"import seaborn as sns \n",
"from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict\n",
"from sklearn import metrics\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "45e08026",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mutiple Linear Regression Accuracy: 0.3819963751047786\n",
"Cross-Predicted(KFold) Mutiple Linear Regression Accuracy: 0.33440778552391626\n"
]
}
],
"source": [
"# Multiple linear regression fitted on the training split.\n",
"lin_reg = LinearRegression()\n",
"lin_reg.fit(X_train, y_train)\n",
"\n",
"# Attendance predictions for the held-out test rows.\n",
"y_pred_lr = lin_reg.predict(X_test)\n",
"\n",
"# The value printed as Accuracy is the R^2 score of the test-set predictions.\n",
"accuracy_lf = metrics.r2_score(y_test, y_pred_lr)\n",
"print('Mutiple Linear Regression Accuracy: ', accuracy_lf)\n",
"\n",
"# Cross-validated predictions (cv=10) for every row of the full dataset,\n",
"# scored with R^2 against the full target vector.\n",
"y_pred_kf_lr = cross_val_predict(lin_reg, X, y, cv=10)\n",
"accuracy_lf = metrics.r2_score(y, y_pred_kf_lr)\n",
"print('Cross-Predicted(KFold) Mutiple Linear Regression Accuracy: ', accuracy_lf)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0de49b8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cross-Predicted(KFold) Polynominal Regression Accuracy: -261.39170432313074\n"
]
}
],
"source": [
"# Expand the feature matrix with polynomial terms up to degree 2, then fit an\n",
"# ordinary linear regression on the expanded matrix.\n",
"poly_reg = PolynomialFeatures(degree=2)\n",
"X_poly = poly_reg.fit_transform(X)\n",
"lin_reg_pl = LinearRegression()\n",
"\n",
"# Cross-validated predictions (cv=10) on the expanded features, scored with R^2;\n",
"# the printed Accuracy is that R^2 value.\n",
"y_pred_pl = cross_val_predict(lin_reg_pl, X_poly, y, cv=10)\n",
"accuracy_pl = metrics.r2_score(y, y_pred_pl)\n",
"print('Cross-Predicted(KFold) Polynominal Regression Accuracy: ', accuracy_pl)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "470425b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Decision Tree Regression Accuracy: 0.23642868476932866\n",
"Cross-Predicted(KFold) Decision Tree Regression Accuracy: 0.4183541357709245\n"
]
}
],
"source": [
"# Decision tree regressor fitted on the training split (random_state fixed at 0).\n",
"dt_regressor = DecisionTreeRegressor(random_state=0)\n",
"dt_regressor.fit(X_train, y_train)\n",
"\n",
"# Attendance predictions for the held-out test rows.\n",
"y_pred_dt = dt_regressor.predict(X_test)\n",
"\n",
"# Test-set score as reported by the estimator's own score() method.\n",
"print('Decision Tree Regression Accuracy: ', dt_regressor.score(X_test,y_test))\n",
"\n",
"# Cross-validated predictions (cv=10) for every row of the full dataset; note that\n",
"# this overwrites the test-set predictions stored in y_pred_dt above.\n",
"y_pred_dt = cross_val_predict(dt_regressor, X, y, cv=10)\n",
"accuracy_dt = metrics.r2_score(y, y_pred_dt)\n",
"print('Cross-Predicted(KFold) Decision Tree Regression Accuracy: ', accuracy_dt)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6629826f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.13 ('leagues')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"vscode": {
"interpreter": {
"hash": "a07b7f3079ca8c056705d3c757c4f3f92f9509f33eeab9ad5420dacec37bc01a"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}