import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
In [6]:
# Load the daily weather observations from the CSV next to the notebook.
data = pd.read_csv(filepath_or_buffer='./daily_weather.csv')
In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()
Out[8]:
            number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  rain_accumulation_9am  rain_duration_9am  relative_humidity_9am  relative_humidity_3pm
count  1095.000000       1092.000000   1090.000000             1091.000000         1092.000000             1092.000000         1091.000000            1089.000000        1092.000000            1095.000000            1095.000000
mean    547.000000        918.882551     64.933001              142.235511            5.508284              148.953518            7.019514               0.203079         294.108052              34.241402              35.344727
std     316.243577          3.184161     11.175514               69.137859            4.552813               67.238013            5.598209               1.593952        1598.078779              25.472067              22.524079
min       0.000000        907.990000     36.752000               15.500000            0.693451               28.900000            1.185578               0.000000           0.000000               6.090000               5.300000
25%     273.500000        916.550000     57.281000               65.972506            2.248768               76.553003            3.067477               0.000000           0.000000              15.092243              17.395000
50%     547.000000        918.921045     65.715479              166.000000            3.871333              177.300000            4.943637               0.000000           0.000000              23.179259              24.380000
75%     820.500000        921.160073     73.450974              191.000000            7.337163              201.233153            8.947760               0.000000           0.000000              45.400000              52.060000
max    1094.000000        929.320000     98.906000              343.400000           23.554978              312.200000           29.840780              24.020000       17704.000000              92.620000              92.250000
In [9]:
# Per column: does it contain at least one missing value?
data.isna().any(axis=0)
Out[9]:
number                    False
air_pressure_9am           True
air_temp_9am               True
avg_wind_direction_9am     True
avg_wind_speed_9am         True
max_wind_direction_9am     True
max_wind_speed_9am         True
rain_accumulation_9am      True
rain_duration_9am          True
relative_humidity_9am     False
relative_humidity_3pm     False
dtype: bool
In [24]:
# Keep only the rows that have a missing value in any column, then count the
# non-null entries per column within that subset.
data.loc[data.isna().any(axis='columns')].count()
Out[24]:
number                    31
air_pressure_9am          28
air_temp_9am              26
avg_wind_direction_9am    27
avg_wind_speed_9am        28
max_wind_direction_9am    28
max_wind_speed_9am        27
rain_accumulation_9am     25
rain_duration_9am         28
relative_humidity_9am     31
relative_humidity_3pm     31
dtype: int64
In [25]:
# Remove the 'number' column (it looks like a plain row counter: 0..1094 in the
# describe() output, so presumably not a useful feature).
# drop(..., errors='ignore') replaces `del data['number']` so that re-running
# this cell after the column is already gone is a no-op instead of a KeyError.
data = data.drop(columns=['number'], errors='ignore')
In [26]:
# Remember how many rows the frame has at this point, and display it.
before_rows = len(data)
before_rows
Out[26]:
1095
In [28]:
# Discard every row that has a missing value in at least one column.
data = data.dropna(axis=0, how='any')
In [29]:
# Number of rows removed since before_rows was recorded.
before_rows - len(data)
Out[29]:
31
In [30]:
# Work on an independent (deep) copy so later column additions do not touch `data`.
clean_data = data.copy(deep=True)
In [31]:
# Binary target: 1 when the 3pm relative humidity is above 24.99, else 0.
# The boolean comparison result is converted to integers.
clean_data['high_humidity_label'] = (
    clean_data['relative_humidity_3pm'].gt(24.99).astype(int)
)
clean_data['high_humidity_label']
Out[31]:
0       1
1       0
2       0
3       0
4       1
5       1
6       0
7       1
8       0
9       1
10      1
11      1
12      1
13      1
14      0
15      0
17      0
18      1
19      0
20      0
21      1
22      0
23      1
24      0
25      1
26      1
27      1
28      1
29      1
30      1
       ..
1064    1
1065    1
1067    1
1068    1
1069    1
1070    1
1071    1
1072    0
1073    1
1074    1
1075    0
1076    0
1077    1
1078    0
1079    1
1080    0
1081    0
1082    1
1083    1
1084    1
1085    1
1086    1
1087    1
1088    1
1089    1
1090    1
1091    1
1092    1
1093    1
1094    0
Name: high_humidity_label, Length: 1064, dtype: int64
In [36]:
# Selecting with a single label (df['col']) yields a Series; selecting with a
# list of labels (df[['col']]) yields a DataFrame. The target is kept as a
# one-column DataFrame here.
y = clean_data.loc[:, ['high_humidity_label']].copy()
type(y)
Out[36]:
pandas.core.frame.DataFrame
In [37]:
# Peek at the first five target labels.
y.head(n=5)
Out[37]:
   high_humidity_label
0                    1
1                    0
2                    0
3                    0
4                    1
In [38]:
# First five raw 3pm humidity readings, for comparison against the labels.
clean_data.loc[:, 'relative_humidity_3pm'].head(n=5)
Out[38]:
0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64
In [40]:
# List the column labels of `data`.
data.columns
Out[40]:
Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')
In [41]:
# Columns used as model inputs: the 9am pressure, temperature, wind and rain
# measurements. Neither relative-humidity column is part of this list.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]
In [42]:
# List the column labels of `clean_data`.
clean_data.columns
Out[42]:
Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm', 'high_humidity_label'],
      dtype='object')
In [43]:
# Feature matrix: an independent copy of just the morning_features columns.
x = clean_data.loc[:, morning_features].copy()
In [44]:
# List the column labels of the feature frame `x`.
x.columns
Out[44]:
Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')
In [45]:
# List the column labels of the target frame `y`.
y.columns
Out[45]:
Index(['high_humidity_label'], dtype='object')
In [46]:
# Split the feature frame and the label frame into train and test parts
# (four frames in total). 33% of the rows are held out for testing, and the
# fixed random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=324,
    test_size=0.33,
)
In [47]:
# Show the Python type of the training-feature object returned by the split.
type(x_train)
Out[47]:
pandas.core.frame.DataFrame
In [48]:
# Show the Python type of the test-feature object returned by the split.
type(x_test)
Out[48]:
pandas.core.frame.DataFrame
In [49]:
# Show the Python type of the training-label object returned by the split.
type(y_train)
Out[49]:
pandas.core.frame.DataFrame
In [50]:
# Show the Python type of the test-label object returned by the split.
type(y_test)
Out[50]:
pandas.core.frame.DataFrame
In [51]:
# Peek at the first five training rows (row labels come from the shuffled split).
x_train.head(n=5)
Out[51]:
     air_pressure_9am  air_temp_9am  avg_wind_direction_9am  avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  rain_accumulation_9am  rain_duration_9am
841        918.370000     72.932000              184.500000            2.013246              186.700000            2.773806                    0.0                0.0
75         920.100000     53.492000              186.100000           13.444009              193.800000           15.367778                    0.0                0.0
95         927.610000     54.896000               55.000000            4.988376               53.400000            7.202947                    0.0                0.0
895        919.235153     65.951112              194.343333            2.942019              216.569792            3.658810                    0.0                0.0
699        919.888128     68.687822              228.517730            3.960858              247.954028            5.185547                    0.0                0.0
In [52]:
# Summary statistics of the training labels; the mean of a 0/1 column is the
# share of rows labelled 1.
y_train.describe()
Out[52]:
       high_humidity_label
count           712.000000
mean              0.494382
std               0.500320
min               0.000000
25%               0.000000
50%               0.000000
75%               1.000000
max               1.000000
In [53]:
# Build a decision tree limited to at most 10 leaf nodes (seeded for
# repeatability) and fit it on the training split. The fit call is the last
# expression, so the fitted estimator is displayed.
humidity_classifier = DecisionTreeClassifier(random_state=0, max_leaf_nodes=10)
humidity_classifier.fit(X=x_train, y=y_train)
Out[53]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')
In [54]:
# Show the Python type of the fitted classifier object.
type(humidity_classifier)
Out[54]:
sklearn.tree.tree.DecisionTreeClassifier
In [55]:
# Predict a label for every row of the held-out test features.
predictions = humidity_classifier.predict(X=x_test)
In [56]:
# First ten predicted labels.
predictions[:10]
Out[56]:
array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])
In [57]:
# First ten true labels from the test split, to eyeball against the predictions.
y_test.head(10)
Out[57]:
      high_humidity_label
456                     0
845                     0
693                     1
259                     1
723                     1
224                     1
300                     1
442                     0
585                     1
1057                    1
In [58]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, predictions)
Out[58]:
0.8153409090909091
In [ ]:


+ Recent posts