from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from itertools import cycle, islice
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

%matplotlib inline
In [5]:
data = pd.read_csv('./minute_weather.csv')
In [6]:
data.head()
Out[6]:
   rowID     hpwren_timestamp  air_pressure  air_temp  avg_wind_direction  avg_wind_speed  max_wind_direction  max_wind_speed  min_wind_direction  min_wind_speed  rain_accumulation  rain_duration  relative_humidity
0      0  2011-09-10 00:00:49         912.3     64.76                97.0             1.2               106.0             1.6                85.0             1.0                NaN            NaN               60.5
1      1  2011-09-10 00:01:49         912.3     63.86               161.0             0.8               215.0             1.5                43.0             0.2                0.0            0.0               39.9
2      2  2011-09-10 00:02:49         912.3     64.22                77.0             0.7               143.0             1.2               324.0             0.3                0.0            0.0               43.0
3      3  2011-09-10 00:03:49         912.3     64.40                89.0             1.2               112.0             1.6                12.0             0.7                0.0            0.0               49.5
4      4  2011-09-10 00:04:49         912.3     64.40               185.0             0.4               260.0             1.0               100.0             0.1                0.0            0.0               58.8
In [7]:
data.describe()
Out[7]:
              rowID  air_pressure      air_temp  avg_wind_direction  avg_wind_speed  max_wind_direction  max_wind_speed  min_wind_direction  min_wind_speed  rain_accumulation  rain_duration  relative_humidity
count  1.587257e+06  1.587257e+06  1.587257e+06        1.586824e+06    1.586824e+06        1.586824e+06    1.586824e+06        1.586824e+06    1.586824e+06       1.587256e+06   1.587256e+06       1.587257e+06
mean   7.936280e+05  9.168301e+02  6.185144e+01        1.619654e+02    2.774272e+00        1.634030e+02    3.399813e+00        1.668264e+02    2.133130e+00       1.854836e-03   5.361460e-01       4.760837e+01
std    4.582018e+05  3.051593e+00  1.183362e+01        9.520812e+01    2.060758e+00        9.236723e+01    2.423167e+00        9.746275e+01    1.745345e+00       9.609716e-01   8.114766e+01       2.621454e+01
min    0.000000e+00  9.050000e+02  3.164000e+01        0.000000e+00    0.000000e+00        0.000000e+00    1.000000e-01        0.000000e+00    0.000000e+00       0.000000e+00   0.000000e+00       7.000000e-01
25%    3.968140e+05  9.148000e+02  5.270000e+01        6.200000e+01    1.300000e+00        6.800000e+01    1.600000e+00        7.700000e+01    8.000000e-01       0.000000e+00   0.000000e+00       2.470000e+01
50%    7.936280e+05  9.167000e+02  6.224000e+01        1.820000e+02    2.200000e+00        1.870000e+02    2.700000e+00        1.800000e+02    1.600000e+00       0.000000e+00   0.000000e+00       4.470000e+01
75%    1.190442e+06  9.187000e+02  7.088000e+01        2.170000e+02    3.800000e+00        2.230000e+02    4.600000e+00        2.120000e+02    3.000000e+00       0.000000e+00   0.000000e+00       6.800000e+01
max    1.587256e+06  9.295000e+02  9.950000e+01        3.590000e+02    3.230000e+01        3.590000e+02    3.600000e+01        3.590000e+02    3.200000e+01       6.550100e+02   6.330500e+04       9.300000e+01
In [8]:
data.columns
Out[8]:
Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'rain_accumulation', 'rain_duration', 'relative_humidity'],
      dtype='object')
In [11]:
data[data.isnull().any(axis=1)]
Out[11]:
           rowID     hpwren_timestamp  air_pressure  air_temp  avg_wind_direction  avg_wind_speed  max_wind_direction  max_wind_speed  min_wind_direction  min_wind_speed  rain_accumulation  rain_duration  relative_humidity
0              0  2011-09-10 00:00:49         912.3     64.76                97.0             1.2               106.0             1.6                85.0             1.0                NaN            NaN               60.5
34790      34790  2011-10-04 10:25:48         915.7     51.08                 NaN             NaN                 NaN             NaN                 NaN             NaN                0.0            0.0               91.0
35929      35929  2011-10-05 05:24:48         915.2     49.64                 NaN             NaN                 NaN             NaN                 NaN             NaN                0.0            0.0               92.0
36320      36320  2011-10-05 11:55:49         914.7     50.00                 NaN             NaN                 NaN             NaN                 NaN             NaN                0.0            0.0               91.9
...          ...                  ...           ...       ...                 ...             ...                 ...             ...                 ...             ...                ...            ...                ...
1346192  1346192  2014-03-27 09:14:32         917.5     44.96                 NaN             NaN                 NaN             NaN                 NaN             NaN                0.0            0.0               91.2
1394844  1394844  2014-04-30 06:21:49         916.7     62.06                 NaN             NaN                 NaN             NaN                 NaN             NaN                0.0           10.0               13.8

434 rows × 13 columns

In [12]:
data.shape
Out[12]:
(1587257, 13)
In [14]:
# Sample every 10th row to keep the dataset manageable;
# .copy() avoids chained-assignment warnings when columns are dropped later
sampled_df = data[(data['rowID'] % 10) == 0].copy()
sampled_df.shape
Out[14]:
(158726, 13)
In [17]:
# Use transpose() here so the summary statistics for every column are easier to read
sampled_df.describe().transpose()
Out[17]:
                       count           mean            std     min       25%        50%         75%         max
rowID               158726.0  793625.000000  458203.937509    0.00  396812.5  793625.00  1190437.50  1587250.00
air_pressure        158726.0     916.830161       3.051717  905.00     914.8     916.70      918.70      929.50
air_temp            158726.0      61.851589      11.833569   31.64      52.7      62.24       70.88       99.50
avg_wind_direction  158680.0     162.156100      95.278201    0.00      62.0     182.00      217.00      359.00
avg_wind_speed      158680.0       2.775215       2.057624    0.00       1.3       2.20        3.80       31.90
max_wind_direction  158680.0     163.462144      92.452139    0.00      68.0     187.00      223.00      359.00
max_wind_speed      158680.0       3.400558       2.418802    0.10       1.6       2.70        4.60       36.00
min_wind_direction  158680.0     166.774017      97.441109    0.00      76.0     180.00      212.00      359.00
min_wind_speed      158680.0       2.134664       1.742113    0.00       0.8       1.60        3.00       31.60
rain_accumulation   158725.0       0.000318       0.011236    0.00       0.0       0.00        0.00        3.12
rain_duration       158725.0       0.409627       8.665523    0.00       0.0       0.00        0.00     2960.00
relative_humidity   158726.0      47.609470      26.214409    0.90      24.7      44.70       68.00       93.00
In [21]:
# Shape of the rows with zero rain accumulation (note: these are one-minute records, not days)
n_NoRainDays = sampled_df[sampled_df['rain_accumulation'] == 0].shape
In [20]:
sampled_df[sampled_df['rain_duration'] == 0].shape
Out[20]:
(157237, 13)
In [24]:
# Fraction of sampled rows that recorded any rain accumulation
(sampled_df.shape[0] - n_NoRainDays[0]) / sampled_df.shape[0]
Out[24]:
0.005758350868792762
In [25]:
# The rain features are zero in almost every row (very little variance),
# so drop them before clustering

del sampled_df['rain_accumulation']
del sampled_df['rain_duration']
In [27]:
sampled_df.columns
Out[27]:
Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')
In [28]:
# Drop the remaining rows with missing values and record how many were removed
n_rows_before = sampled_df.shape[0]
sampled_df = sampled_df.dropna()
In [29]:
n_rows_before - sampled_df.shape[0]
Out[29]:
46
In [30]:
sampled_df.columns
Out[30]:
Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')
In [37]:
# Features used for clustering.
# Note: 'max_wind_speed' appears twice in this list (and 'max_wind_direction',
# 'min_wind_direction', and 'min_wind_speed' are left out), which is why the
# outputs below show a duplicated column.
feature = ['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
           'max_wind_speed', 'max_wind_speed', 'relative_humidity']
In [38]:
select_df = sampled_df[feature]
In [39]:
select_df.columns
Out[39]:
Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_speed', 'max_wind_speed', 'relative_humidity'],
      dtype='object')
In [40]:
select_df.shape
Out[40]:
(158680, 7)
In [41]:
select_df.head()
Out[41]:
    air_pressure  air_temp  avg_wind_direction  avg_wind_speed  max_wind_speed  max_wind_speed  relative_humidity
0          912.3     64.76                97.0             1.2             1.6             1.6               60.5
10         912.3     62.24               144.0             1.2             1.8             1.8               38.5
20         912.2     63.32               100.0             2.0             2.5             2.5               58.3
30         912.2     62.60                91.0             2.0             2.4             2.4               57.9
40         912.2     64.04                81.0             2.6             2.9             2.9               57.4
In [45]:
# Scaling is needed so that the columns are on comparable ranges

# fit_transform() combines the fit and transform steps:
# it learns each column's mean and standard deviation, then applies the scaling

x = StandardScaler().fit_transform(select_df)
x[:5]
Out[45]:
array([[-1.48456281,  0.24544455, -0.68385323, -0.76555283, -0.74440309,
        -0.74440309,  0.49233835],
       [-1.48456281,  0.03247142, -0.19055941, -0.76555283, -0.66171726,
        -0.66171726, -0.34710804],
       [-1.51733167,  0.12374562, -0.65236639, -0.37675365, -0.37231683,
        -0.37231683,  0.40839371],
       [-1.51733167,  0.06289616, -0.74682691, -0.37675365, -0.41365975,
        -0.41365975,  0.39313105],
       [-1.51733167,  0.18459509, -0.85178304, -0.08515426, -0.20694517,
        -0.20694517,  0.37405273]])
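Under the hood, StandardScaler turns each column into z-scores. As a sanity check, the same result can be reproduced manually; this is a minimal sketch (not part of the original notebook), reusing the x and select_df objects from the cells above:

# Sanity-check sketch: StandardScaler standardizes each column to
# z = (value - column mean) / column standard deviation (population std, ddof=0)
manual = (select_df.values - select_df.values.mean(axis=0)) / select_df.values.std(axis=0)
print(np.allclose(x, manual))   # should print True if the scaling matches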
In [49]:
kmeans = KMeans(n_clusters=12)
model = kmeans.fit(x)
print("model \n", model)
model 
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [50]:
centers = model.cluster_centers_
len(centers)
Out[50]:
12
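Here n_clusters=12 is simply the value used in this walkthrough. A common way to compare other settings is an elbow plot of the KMeans inertia (the within-cluster sum of squared distances); the following is a hedged sketch, not part of the original notebook, reusing the scaled array x from above:

# Elbow-method sketch: fit KMeans for a range of k and plot the inertia.
# The "elbow" where the curve flattens suggests a reasonable number of clusters.
inertias = []
k_values = range(2, 20)
for k in k_values:
    inertias.append(KMeans(n_clusters=k).fit(x).inertia_)

plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('n_clusters')
plt.ylabel('inertia')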

Plots

In [51]:
# Build a DataFrame of the cluster centers with an extra column for the cluster number
def pd_centers(featureUsed, centers):
    colNames = list(featureUsed)
    colNames.append('prediction')

    # Append each center's index as its 'prediction' (cluster number)
    Z = [np.append(A, index) for index, A in enumerate(centers)]

    # Convert to a pandas DataFrame for plotting
    P = pd.DataFrame(Z, columns=colNames)
    P['prediction'] = P['prediction'].astype(int)
    return P
In [57]:
# Function that draws a parallel-coordinates plot of the cluster centers

def parallel_plot(data):
    my_colors = list(islice(cycle(['b', 'r', 'g', 'y', 'k']), None, len(data)))
    plt.figure(figsize=(15, 8)).gca().axes.set_ylim([-3, +3])
    parallel_coordinates(data, 'prediction', color=my_colors, marker='o')
    
In [55]:
# A parallel-coordinates plot is the most useful way to show records with many features at once.

P = pd_centers(feature, centers)
P.head()
Out[55]:
   air_pressure  air_temp  avg_wind_direction  avg_wind_speed  max_wind_speed  max_wind_speed  relative_humidity  prediction
0      0.140687 -0.853064           -1.154650       -0.650211       -0.660119       -0.660119            0.962132           0
1     -1.169510 -0.816918            0.439457        1.971623        1.937898        1.937898            0.865587           1
2     -0.203257  0.920037           -1.265776       -0.659689       -0.673215       -0.673215           -0.640406           2
3     -0.108018  1.184441            1.069376       -0.645328       -0.620734       -0.620734           -0.744369           3
4     -0.752735  0.226243            0.112486       -0.564372       -0.578224       -0.578224            0.135827           4
In [58]:
# Dry Days
parallel_plot(P[P['relative_humidity'] < -0.5 ])
In [60]:
# Warm Days
parallel_plot(P[P['air_temp'] > 0.5 ])
In [61]:
# Cool Days
parallel_plot(P[ (P['relative_humidity'] > 0.5) & (P['air_temp'] < 0.5) ])
  • Two clusters that look similar across most features but show the opposite behavior on one feature can be regarded as clearly separated clusters.

  • A cluster that behaves in the opposite way from every other cluster on a particular feature can be regarded as having a distinctive characteristic of its own. (A quick way to check how many records each cluster contains is sketched below.)
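To relate these cluster profiles back to the data, the cluster label of every sampled row is available from the fitted model. This is a minimal sketch, not part of the original notebook, assuming the model and select_df objects from the cells above:

# Sketch: attach the predicted cluster number to each sampled row and count cluster sizes
labeled_df = select_df.copy()
labeled_df['prediction'] = model.labels_
print(labeled_df['prediction'].value_counts().sort_index())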

