import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# Load the indicators CSV and take a first look at its size and columns.
# The bare expressions below only display something when run as notebook cells.
data = pd.read_csv("./Indicators.csv")
data.shape
data.describe()
data.head()
data.columns
# unique() returns a NumPy ndarray; tolist() turns it into a plain list
# so its length can be taken with len().
countries = data['CountryName'].unique().tolist()
len(countries)
# Number of country codes
# (the name was misspelled as `ountryCode`, so the len() call below raised NameError)
countryCode = data['CountryCode'].unique().tolist()
len(countryCode)
# Number of indicators
indicators = data['IndicatorName'].unique().tolist()
len(indicators)
# Distinct years present in the data, sorted in place, then the last ten.
years = data['Year'].unique().tolist()
len(years)
years.sort()
years[-10:]
print(min(years), "to ", max(years))
# Pick only the CO2 emissions rows for the USA.
# str.contains() treats the pattern as a regular expression, so the literal
# "(" has to be escaped; a raw string keeps the backslash without triggering
# Python's invalid-escape-sequence warning.
hist_indicator = r'CO2 emissions \(metric'
hist_country = 'USA'
mask1 = data['IndicatorName'].str.contains(hist_indicator)
mask2 = data['CountryCode'].str.contains(hist_country)
# Rows that satisfy both masks.
stage = data[mask1 & mask2]
l_stage = stage['Year'].unique().tolist()
print(min(l_stage), "to ", max(l_stage))
# .values converts each pandas Series to an ndarray.
# NOTE: this rebinds `years`, which previously held the list of all years.
years = stage['Year'].values
co2 = stage['Value'].values
# Visualization: one bar per year.
plt.bar(years, co2)
plt.show()
# Same series as a line plot: year on the x axis, value on the y axis.
line_x = stage['Year'].values
line_y = stage['Value'].values
plt.plot(line_x, line_y)
# Label the axes; the y label is the indicator name taken from the first row.
# iloc[0] selects by position, whereas [0] would look up the index label 0.
y_label = stage['IndicatorName'].iloc[0]
plt.xlabel('Year')
plt.ylabel(y_label)
plt.title('CO2 Emissions in USA')
# To be more honest, start the y axis at 0: [xmin, xmax, ymin, ymax].
axis_limits = [1959, 2011, 0, 25]
plt.axis(axis_limits)
plt.show()
# Histogram of the USA values selected in `stage`.
hist_data = stage['Value'].values
indicator_label = stage['IndicatorName'].iloc[0]
# Ten bins; density=False keeps raw counts instead of a normalized density.
plt.hist(hist_data, bins=10, density=False, facecolor='green')
plt.title('Histogram Example')
plt.xlabel(indicator_label)
plt.ylabel('# of Years')
plt.grid(True)
plt.show()
# We can guess there could be outliers, so compare with other countries:
# select the CO2 emissions rows for all countries in 2011.
# Raw string: the pattern is a regex and "\(" must reach it unchanged.
hist_indicator = r'CO2 emissions \(metric'
hist_year = 2011
mask1 = data['IndicatorName'].str.contains(hist_indicator)
mask2 = data['Year'].isin([hist_year])  # isin returns boolean values
co2_2011 = data[mask1 & mask2]
co2_2011.head()
# Plot a histogram of the emissions per capita by country.
# subplots() returns a tuple with the figure and axis objects.
fig, ax = plt.subplots()
# Arrow labelled "USA" pointing at data coordinates (18, 5), text at (18, 30).
ax.annotate(
    "USA",
    xy=(18, 5),
    xycoords='data',
    xytext=(18, 30),
    textcoords='data',
    arrowprops=dict(arrowstyle="->", connectionstyle="arc3"),
)
# `normed` was removed from Matplotlib's hist(); `density` is its replacement
# and is what the earlier histogram in this file already uses.
plt.hist(co2_2011['Value'], 10, density=False, facecolor='green')
plt.xlabel(stage['IndicatorName'].iloc[0])
plt.ylabel('# of Countries')
plt.title('Histogram of CO2 Emissions Per Capita')
#plt.axis([10, 22, 0, 14])
plt.grid(True)
plt.show()
# Select GDP per capita for the United States.
# Raw string: the pattern is a regex and "\(" must reach it unchanged.
hist_indicator = r'GDP per capita \(constant 2005'
hist_country = 'USA'
mask1 = data['IndicatorName'].str.contains(hist_indicator)
mask2 = data['CountryCode'].str.contains(hist_country)
# gdp_stage holds the rows matching both the indicator pattern and the country code.
gdp_stage = data[mask1 & mask2]
# Peek at gdp_stage and at stage (the CO2 emissions rows) before plotting them together.
gdp_stage.head(2)
stage.head(2)
# Line plot of GDP per capita over the years.
plt.plot(gdp_stage['Year'].values, gdp_stage['Value'].values)
# Label the axes
plt.xlabel('Year')
plt.ylabel(gdp_stage['IndicatorName'].iloc[0])
# Label the figure
plt.title('GDP Per Capita USA ')
plt.axis([1956, 2013, 0, 46000])
plt.show()
# Compare the year ranges covered by the two selections.
print("GDP Min Year = ", gdp_stage['Year'].min(), "max :", gdp_stage['Year'].max())
print("CO2 Min Year = ", stage['Year'].min(), "max :", stage['Year'].max())
# Keep only GDP rows before 2012, then print both lengths so they can be compared.
gdp_stage_trunc = gdp_stage[gdp_stage['Year'] < 2012]
print(len(gdp_stage_trunc))
print(len(stage))
# NOTE: `%matplotlib inline` is an IPython magic, not Python syntax; it is kept
# here only as a comment so this file parses. Run it in a notebook cell if needed.
# %matplotlib inline
fig, axis = plt.subplots()
# Grid lines, title, x label, y label
axis.yaxis.grid(True)
axis.set_title('CO2 Emissions vs. GDP (per capita)', fontsize=10)
axis.set_xlabel(gdp_stage_trunc['IndicatorName'].iloc[0], fontsize=10)
axis.set_ylabel(stage['IndicatorName'].iloc[0], fontsize=10)
# Scatter GDP per capita (x) against CO2 emissions (y).
# scatter() needs X and Y to have the same length.
X = gdp_stage_trunc['Value']
Y = stage['Value']
axis.scatter(X, Y)
plt.show()
# Correlation coefficient matrix between GDP and CO2 emissions.
np.corrcoef(gdp_stage_trunc['Value'], stage['Value'])
Visualization Libraries
The following list provides a few plotting libraries for you to get started based on their use case(s). This list is focused on providing a few solid options for each case rather than overwhelming you with the variety of options available.
The foundation: Matplotlib, the most widely used plotting library, best for two-dimensional non-interactive plots. A possible replacement is pygal,
which provides similar functionality but generates vector graphics (SVG) output and has a more user-friendly interface.
Specific use cases:
Specialized statistical plots, like automatically fitting a linear regression with confidence interval or like scatter plots color-coded by category.
seaborn
: it builds on top of Matplotlib and can also be used as a replacement for matplotlib
just for an easier way to specify color palettes and plotting aesthetics.
Grammar of graphics plotting: if you find the interface of Matplotlib too verbose, Python provides packages built on a different paradigm of plot syntax, modeled on R's
ggplot2
Interactive plots (i.e. with pan and zoom) that work in Jupyter Notebooks but can also be exported as JavaScript to work standalone on a webpage.
Interactive map visualization
* folium
: creates HTML pages that include the Leaflet.js JavaScript plotting library to display data on top of maps.
* plotly
: it supports color-coded country/world maps embedded in the Jupyter Notebook.
Realtime plots that update with streaming data, even integrated in a dashboard with user interaction.
bokeh plot server
: it is part of Bokeh but requires launching a separate Python process that takes care of responding to events from the user interface or from streaming data updates.
3D plots are not easy to interpret; it is worth first considering whether a combination of 2D plots could provide better insight into the data.
mplot3d
: Matplotlib toolkit for 3D visualization