import sqlite3
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
In [4]:
cnx = sqlite3.connect('database.sqlite')
df  = pd.read_sql_query("select * from Player_Attributes", cnx)
In [5]:
df.head()
Out[5]:
idplayer_fifa_api_idplayer_api_iddateoverall_ratingpotentialpreferred_footattacking_work_ratedefensive_work_ratecrossing...visionpenaltiesmarkingstanding_tacklesliding_tacklegk_divinggk_handlinggk_kickinggk_positioninggk_reflexes
012183535059422016-02-18 00:00:0067.071.0rightmediummedium49.0...54.048.065.069.069.06.011.010.08.08.0
122183535059422015-11-19 00:00:0067.071.0rightmediummedium49.0...54.048.065.069.069.06.011.010.08.08.0
232183535059422015-09-21 00:00:0062.066.0rightmediummedium49.0...54.048.065.066.069.06.011.010.08.08.0
342183535059422015-03-20 00:00:0061.065.0rightmediummedium48.0...53.047.062.063.066.05.010.09.07.07.0
452183535059422007-02-22 00:00:0061.065.0rightmediummedium48.0...53.047.062.063.066.05.010.09.07.07.0

5 rows × 42 columns

In [6]:
df.shape
Out[6]:
(183978, 42)
In [7]:
features = [
       'potential', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes']
In [8]:
target = ['overall_rating']
In [9]:
df = df.dropna()
In [10]:
x  = df[features]
x.head()
Out[10]:
potentialcrossingfinishingheading_accuracyshort_passingvolleysdribblingcurvefree_kick_accuracylong_passing...visionpenaltiesmarkingstanding_tacklesliding_tacklegk_divinggk_handlinggk_kickinggk_positioninggk_reflexes
071.049.044.071.061.044.051.045.039.064.0...54.048.065.069.069.06.011.010.08.08.0
171.049.044.071.061.044.051.045.039.064.0...54.048.065.069.069.06.011.010.08.08.0
266.049.044.071.061.044.051.045.039.064.0...54.048.065.066.069.06.011.010.08.08.0
365.048.043.070.060.043.050.044.038.063.0...53.047.062.063.066.05.010.09.07.07.0
465.048.043.070.060.043.050.044.038.063.0...53.047.062.063.066.05.010.09.07.07.0

5 rows × 34 columns

In [11]:
y = df[target]
In [12]:
x.iloc[2]
Out[12]:
potential             66.0
crossing              49.0
finishing             44.0
heading_accuracy      71.0
short_passing         61.0
volleys               44.0
dribbling             51.0
curve                 45.0
free_kick_accuracy    39.0
long_passing          64.0
ball_control          49.0
acceleration          60.0
sprint_speed          64.0
agility               59.0
reactions             47.0
balance               65.0
shot_power            55.0
jumping               58.0
stamina               54.0
strength              76.0
long_shots            35.0
aggression            63.0
interceptions         41.0
positioning           45.0
vision                54.0
penalties             48.0
marking               65.0
standing_tackle       66.0
sliding_tackle        69.0
gk_diving              6.0
gk_handling           11.0
gk_kicking            10.0
gk_positioning         8.0
gk_reflexes            8.0
Name: 2, dtype: float64
In [14]:
y.head()
Out[14]:
overall_rating
067.0
167.0
262.0
361.0
461.0
In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 324)
In [16]:
regressor = LinearRegression()
regressor.fit(x_train, y_train)
Out[16]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
In [18]:
y_prediction = regressor.predict(x_test)
y_prediction[ : 5]
Out[18]:
array([[66.51284879],
       [79.77234615],
       [66.57371825],
       [74.99042163],
       [66.20353346]])
In [22]:
RMSE = sqrt( mean_squared_error( y_true = y_test , y_pred = y_prediction))
RMSE
Out[22]:
2.805303046855209
In [24]:
# Decision Tree Regressor : Fit A New Regression Model To Training Set

regressor = DecisionTreeRegressor(max_depth = 20)
regressor.fit(x_train, y_train)
Out[24]:
DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
####### Script About DecisionTreeRegressor

That would be, let's say, a decision tree regressor. A decision tree regressor builds a model in a top-down manner by splitting data set on an attribute. So the algorithm chooses the attribute which gives maximum reduction in standard deviation.

In [25]:
y_prediction = regressor.predict(x_test)
y_prediction
Out[25]:
array([62.        , 84.        , 62.38666667, ..., 71.        ,
       62.        , 73.        ])
####### Script About RMSE

To get an idea of the RMSE, we know that a Root Mean Square Error of 100, for example, would be too high because our mean is 68. And our RMSE is higher than our mean value.

In [26]:
y_test.describe()
Out[26]:
overall_rating
count59517.000000
mean68.635818
std7.041297
min33.000000
25%64.000000
50%69.000000
75%73.000000
max94.000000
In [27]:
RMSE = sqrt( mean_squared_error( y_true = y_test , y_pred = y_prediction))
RMSE
Out[27]:
1.4592237894003544
In [ ]:


+ Recent posts