from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import utils
import pandas as pd
import numpy as np
from itertools import cycle, islice
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
%matplotlib inline
In [5]:
# Load the minute-resolution weather measurements from CSV.
data = pd.read_csv('./minute_weather.csv')
In [6]:
# Peek at the first few rows.
data.head()
Out[6]:
In [7]:
# Summary statistics for each numeric column.
data.describe()
Out[7]:
In [8]:
data.columns
Out[8]:
In [11]:
# Rows that contain at least one missing value.
data[data.isnull().any(axis=1)]
Out[11]:
In [12]:
data.shape
Out[12]:
In [14]:
# Downsample to every 10th row (rowID divisible by 10) so that the
# clustering below runs on a manageable subset.
sampled_df = data[ (data['rowID'] % 10 ) == 0 ]
sampled_df.shape
Out[14]:
In [17]:
# transpose() puts one row per column, making the per-column means easy to scan.
sampled_df.describe().transpose()
Out[17]:
In [21]:
# NOTE(review): despite the name, this holds the (rows, cols) *shape tuple* of
# the minute samples with zero rain accumulation — not a count of days.
n_NoRainDays = sampled_df[sampled_df['rain_accumulation' ] == 0 ].shape
In [20]:
# Shape of the samples with zero rain duration, for comparison.
sampled_df[sampled_df['rain_duration' ] == 0 ].shape
Out[20]:
In [24]:
# Fraction of samples that DID record rain (complement of the zero-rain rows).
(sampled_df.shape[0] - n_NoRainDays[0]) / sampled_df.shape[0]
Out[24]:
In [25]:
# These two columns are almost always zero; a near-constant feature adds no
# information to the clustering, so drop both before modeling.
del sampled_df['rain_accumulation']
del sampled_df['rain_duration' ]
In [27]:
sampled_df.columns
Out[27]:
In [28]:
# Remember the row count so we can report how many rows dropna() removes.
n_rows_before = sampled_df.shape[0]
sampled_df = sampled_df.dropna()
In [29]:
# Number of rows that were dropped because of missing values.
n_rows_before - sampled_df.shape[0]
Out[29]:
In [30]:
sampled_df.columns
Out[30]:
In [37]:
# Columns used as clustering features.
# Fixed: the original list contained 'max_wind_speed' twice and omitted
# 'max_wind_direction', so one column was fed to KMeans with double weight
# while the max-wind-direction signal was lost entirely.
feature = [ 'air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
'max_wind_direction', 'max_wind_speed', 'relative_humidity']
In [38]:
# Restrict the working frame to the chosen clustering features.
select_df = sampled_df[feature]
In [39]:
select_df.columns
Out[39]:
In [40]:
select_df.shape
Out[40]:
In [41]:
select_df.head()
Out[41]:
In [45]:
# Standardize every feature to zero mean / unit variance so columns with large
# numeric ranges (e.g. air_pressure) do not dominate the Euclidean distances
# that KMeans uses.  fit_transform() is fit() (learn mean/std) followed by
# transform() (apply the scaling) in one call.
x = StandardScaler().fit_transform(select_df)
x[ : 5]
Out[45]:
In [49]:
# Cluster the standardized samples into 12 groups.
# Fixed: pin random_state so the centroid initialization — and therefore the
# resulting clusters and all plots below — is reproducible; the original call
# produced different clusters on every run.
kmeans = KMeans(n_clusters = 12, random_state = 42)
model = kmeans.fit(x)
print("model \n", model)
In [50]:
# One centroid per cluster, each a vector in the *scaled* feature space
# (so coordinates are z-scores, roughly in [-3, 3]).
centers = model.cluster_centers_
len(centers)
Out[50]:
In [51]:
def pd_centers(featureUsed, centers):
    """Build a DataFrame of cluster centroids tagged with their cluster id.

    featureUsed : iterable of feature (column) names, in centroid order.
    centers     : sequence of centroid coordinate vectors.
    Returns a DataFrame with one row per centroid; the extra integer column
    'prediction' holds the cluster index, which parallel_coordinates() uses
    as the grouping key.
    """
    # Append each centroid's index to its coordinates so the cluster id
    # travels with the row.
    rows = []
    for cluster_id, center in enumerate(centers):
        rows.append(np.append(center, cluster_id))
    column_names = list(featureUsed) + ['prediction']
    frame = pd.DataFrame(rows, columns=column_names)
    # np.append produced floats; the cluster id is really an integer label.
    frame['prediction'] = frame['prediction'].astype(int)
    return frame
In [57]:
# Helper that renders a parallel-coordinates plot of centroid rows.
def parallel_plot(data):
    """Draw a parallel-coordinates plot of the given centroid DataFrame.

    Each row becomes one poly-line across the feature axes, colored by
    cycling through five base colors; the 'prediction' column is the
    grouping key.  The y-axis is clamped to [-3, 3] because the values
    are standardized z-scores.
    """
    palette = ['b', 'r', 'g', 'y', 'k' ]
    # One color per row, repeating the palette as needed.
    row_colors = list(islice(cycle(palette), None , len(data)))
    figure = plt.figure(figsize = (15, 8))
    figure.gca().axes.set_ylim([-3, +3])
    parallel_coordinates(data, 'prediction', color = row_colors, marker = 'o')
In [55]:
# A parallel plot is the most useful way to display records with many
# features at once.  (translated from Korean)
P = pd_centers(feature, centers)
P.head()
Out[55]:
In [58]:
# Dry days: centroids whose humidity z-score is well below the mean.
# (Values are StandardScaler z-scores, so -0.5 means half a standard
# deviation below the dataset average.)
parallel_plot(P[P['relative_humidity'] < -0.5 ])
In [60]:
# Warm days: centroids with above-average air temperature.
parallel_plot(P[P['air_temp'] > 0.5 ])
In [61]:
# Cool days: humid but not warm.
parallel_plot(P[ (P['relative_humidity'] > 0.5) & (P['air_temp'] < 0.5) ])
'Python Library > Pandas' 카테고리의 다른 글
Day 7. Drawing Graphs With Pandas (0) | 2019.07.02 |
---|---|
Day 7. Machine Learning [ Linear Regression ] ( European Soccer Data ) (0) | 2019.06.16 |
Day 7. Machine Learning [ Decision Trees ] ( Weather Classification ) (0) | 2019.06.16 |
Day 6. Handling Timestamps with Pandas (0) | 2019.06.16 |
Day 6. String Operations with Pandas (0) | 2019.06.16 |