import pandas as pd
import numpy  as np
import random
import matplotlib.pyplot as plt
In [2]:
# Load the indicators CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("./Indicators.csv")
In [3]:
# (number of rows, number of columns) of the loaded frame
data.shape
Out[3]:
(5656458, 6)
In [4]:
# Summary statistics for the numeric columns (Year and Value)
data.describe()
Out[4]:
YearValue
count5.656458e+065.656458e+06
mean1.994464e+031.070501e+12
std1.387895e+014.842469e+13
min1.960000e+03-9.824821e+15
25%1.984000e+035.566242e+00
50%1.997000e+036.357450e+01
75%2.006000e+031.346722e+07
max2.015000e+031.103367e+16
In [5]:
# Peek at the first five rows
data.head()
Out[5]:
CountryNameCountryCodeIndicatorNameIndicatorCodeYearValue
0Arab WorldARBAdolescent fertility rate (births per 1,000 wo...SP.ADO.TFRT19601.335609e+02
1Arab WorldARBAge dependency ratio (% of working-age populat...SP.POP.DPND19608.779760e+01
2Arab WorldARBAge dependency ratio, old (% of working-age po...SP.POP.DPND.OL19606.634579e+00
3Arab WorldARBAge dependency ratio, young (% of working-age ...SP.POP.DPND.YG19608.102333e+01
4Arab WorldARBArms exports (SIPRI trend indicator values)MS.MIL.XPRT.KD19603.000000e+06
In [7]:
# Column labels of the frame
data.columns
Out[7]:
Index(['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode', 'Year',
       'Value'],
      dtype='object')
In [15]:
# Number of distinct country names.
# unique() returns a NumPy ndarray; tolist() converts it to a plain Python list.
# (Computed once -- the ndarray already supports len(), the list is kept only
# because a list is more convenient to work with afterwards.)
countries = data['CountryName'].unique().tolist()

len(countries)
Out[15]:
247
In [18]:
# Number of distinct country codes
# The list is bound to `countryCode`, the same name that len() reads below.
countryCode = data['CountryCode'].unique().tolist()
len(countryCode)
Out[18]:
247
In [19]:
# Count the distinct indicator names (kept in order of first appearance)
indicators = pd.unique(data['IndicatorName']).tolist()
len(indicators)
Out[19]:
1344
In [20]:
# Distinct years present in the data, as a list of plain Python ints
years = pd.unique(data['Year']).tolist()
len(years)
Out[20]:
56
In [22]:
# Sort in place so that positional slices of `years` are chronological
years.sort()
In [23]:
# Last ten entries of the list, i.e. the ten most recent years once it is sorted
years[-10 : ]
Out[23]:
[2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
In [24]:
# Report the overall span of years covered by the dataset
first_year = min(years)
last_year  = max(years)
print(first_year, "to ", last_year)
1960 to  2015
  • Visualization starts here
In [29]:
# Pick only CO2 emissions (metric tons per capita) for the USA

# Raw string: "\(" is a regex escape for a literal "(", and inside a plain
# string literal it is an invalid Python escape sequence that triggers a warning.
hist_indicator = r'CO2 emissions \(metric'
hist_country   = 'USA'

mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Country codes are compared for exact equality rather than regex-searched.
mask2 = data['CountryCode'] == hist_country

stage = data[mask1 & mask2]

# Number of matching rows
len(stage)
Out[29]:
52
In [31]:
# Span of years for which the USA CO2 series has data
l_stage = pd.unique(stage['Year']).tolist()
earliest, latest = min(l_stage), max(l_stage)
print(earliest, "to ", latest)
1960 to  2011
In [36]:
# Bar chart of the USA CO2 series by year.
# A dedicated name is used for the x values so the `years` list built earlier
# (all years in the dataset) is not silently replaced by a different object.
co2_years = stage['Year' ].values  # pd.Series -> ndarray
co2       = stage['Value'].values

# Visualization -- labelled so the figure can be read on its own
plt.bar(co2_years, co2)
plt.xlabel('Year')
plt.ylabel(stage['IndicatorName'].iloc[0])
plt.title('CO2 Emissions in USA')
plt.show()
In [41]:
# Same series drawn as a line plot, using the explicit Axes interface

fig, ax = plt.subplots()
ax.plot(stage['Year'].values, stage['Value'].values)

# Label the axes.
# .iloc[0] selects the first row by position; the frame's index labels are the
# row numbers of the original data, so a label lookup with [0] would not find it.
ax.set_xlabel('Year')
ax.set_ylabel(stage['IndicatorName'].iloc[0])

ax.set_title('CO2 Emissions in USA')

# Start the y axis at 0 so the scale does not exaggerate the variation
ax.axis([1959, 2011, 0, 25])

plt.show()
In [47]:
# Histogram of the yearly USA values

hist_data = stage['Value'].values

# Ten bins; density = False keeps raw counts on the y axis instead of
# normalising the histogram
plt.hist(hist_data, bins = 10, density = False, facecolor = 'green')

plt.title ( 'Histogram Example')
plt.xlabel( stage['IndicatorName'].iloc[0])
plt.ylabel( '# of Years'       )

plt.grid(True)

plt.show()

# The shape of the distribution suggests there could be outliers
In [48]:
# Let's compare with other countries

# Select CO2 emissions (per capita) for all countries in 2011

# Raw string so that "\(" reaches the regex engine as an escaped parenthesis
# instead of being an invalid Python string escape.
hist_indicator = r'CO2 emissions \(metric'
hist_year      = 2011

mask1 = data['IndicatorName'].str.contains(hist_indicator)
mask2 = data['Year'] == hist_year  # boolean mask, True for rows from hist_year

co2_2011 = data[mask1 & mask2]
co2_2011.head()
Out[48]:
CountryNameCountryCodeIndicatorNameIndicatorCodeYearValue
5026275Arab WorldARBCO2 emissions (metric tons per capita)EN.ATM.CO2E.PC20114.724500
5026788Caribbean small statesCSSCO2 emissions (metric tons per capita)EN.ATM.CO2E.PC20119.692960
5027295Central Europe and the BalticsCEBCO2 emissions (metric tons per capita)EN.ATM.CO2E.PC20116.911131
5027870East Asia & Pacific (all income levels)EASCO2 emissions (metric tons per capita)EN.ATM.CO2E.PC20115.859548
5028456East Asia & Pacific (developing only)EAPCO2 emissions (metric tons per capita)EN.ATM.CO2E.PC20115.302499
In [50]:
# Histogram of the 2011 CO2 emissions per capita across all entries of co2_2011

# subplots() returns a (figure, axes) tuple; everything below draws on `ax`.
fig, ax = plt.subplots()

# Arrow labelled "USA"; the coordinates are hand-picked in data units
# (presumably near the bin where the USA value falls -- verify against the data).
ax.annotate("USA", xy = (18, 5), xycoords = 'data',
            xytext = (18, 30), textcoords = 'data',
            arrowprops = dict(arrowstyle = "->",
                              connectionstyle = "arc3"),
            )

# `density` replaces the `normed` keyword, which Matplotlib no longer accepts.
ax.hist(co2_2011['Value'], bins = 10, density = False, facecolor = 'green')

# Label taken from the frame being plotted rather than from an unrelated one
ax.set_xlabel(co2_2011['IndicatorName'].iloc[0])
ax.set_ylabel('# of Countries')
ax.set_title ('Histogram of CO2 Emissions Per Capita')

ax.grid(True)

plt.show()

Relationship Between GDP and CO2 Emissions in USA

In [51]:
# Select GDP per capita (constant 2005 US$) for the United States

# Raw string so that "\(" is passed to the regex engine as an escaped
# parenthesis rather than being an invalid Python string escape.
hist_indicator = r'GDP per capita \(constant 2005'
hist_country   = 'USA'

mask1  = data['IndicatorName'].str.contains(hist_indicator)
# Exact comparison: the country code is a fixed identifier, not a pattern.
mask2  = data['CountryCode'] == hist_country

# gdp_stage holds only the rows matching both the indicator and the country code
gdp_stage = data[ mask1 & mask2]


# Next: plot gdp_stage versus stage, which contains the CO2 emission series
In [52]:
# First two rows of the GDP selection, to check indicator and country
gdp_stage.head(2)
Out[52]:
CountryNameCountryCodeIndicatorNameIndicatorCodeYearValue
22282United StatesUSAGDP per capita (constant 2005 US$)NY.GDP.PCAP.KD196015482.707760
48759United StatesUSAGDP per capita (constant 2005 US$)NY.GDP.PCAP.KD196115578.409657
In [53]:
# First two rows of the CO2 selection, for comparison with the GDP rows
stage.head(2)
Out[53]:
CountryNameCountryCodeIndicatorNameIndicatorCodeYearValue
22232United StatesUSACO2 emissions (metric tons per capita)EN.ATM.CO2E.PC196015.999779
48708United StatesUSACO2 emissions (metric tons per capita)EN.ATM.CO2E.PC196115.681256
In [59]:
# GDP per capita over time, drawn as a line plot on an explicit Axes

fig, ax = plt.subplots()
ax.plot(gdp_stage['Year'].values, gdp_stage['Value'].values)

# Axis labels
ax.set_xlabel('Year')
ax.set_ylabel(gdp_stage['IndicatorName'].iloc[0])

# Figure title
ax.set_title('GDP Per Capita USA ')

# Fixed view limits: [xmin, xmax, ymin, ymax]
ax.axis([1956, 2013, 0, 46000])

plt.show()

Make sure that the time period is the same in both DataFrames

A scatter plot requires the same number of years in both datasets

In [61]:
# Print the first and last year of each series to compare their coverage
for series_label, series_frame in (("GDP", gdp_stage), ("CO2", stage)):
    series_years = series_frame['Year']
    print(series_label + " Min Year = ", series_years.min(), "max :", series_years.max())
GDP Min Year =  1960 max : 2014
CO2 Min Year =  1960 max : 2011
In [62]:
# Keep only the GDP rows whose year also appears in the CO2 series, so both
# series cover the same period (the CO2 series ends earlier than the GDP one).
# Deriving the filter from `stage` avoids a hard-coded cutoff year.
gdp_stage_trunc = gdp_stage[ gdp_stage['Year'].isin(stage['Year']) ]

# The scatter plot and the correlation pair rows by position, so the two
# frames must list exactly the same years in the same order.
assert np.array_equal(gdp_stage_trunc['Year'].values, stage['Year'].values), \
    "GDP and CO2 series are not aligned year-by-year"

print(len(gdp_stage_trunc))
print(len(stage))
52
52
In [66]:
%matplotlib inline

fig , axis = plt.subplots()
# Grid Lines, Xticks , XLabel, YLabel

axis.yaxis.grid(True)
axis.set_title('CO2 Emissions vs. GOD (per capita)', fontsize = 10)
axis.set_xlabel(gdp_stage_trunc['IndicatorName'].iloc[0], fontsize = 10)
axis.set_ylabel(stage['IndicatorName'].iloc[0], fontsize = 10)

X = gdp_stage_trunc['Value']
Y = stage['Value']

axis.scatter(X, Y)
plt.show()
In [67]:
# Correlation matrix between GDP per capita and CO2 emissions per capita;
# the off-diagonal entries are the correlation coefficient of the two series
gdp_per_capita = gdp_stage_trunc['Value']
co2_per_capita = stage['Value']
np.corrcoef(gdp_per_capita, co2_per_capita)
Out[67]:
array([[1.        , 0.07676005],
       [0.07676005, 1.        ]])
In [ ]:



Visualization Libraries

The following list provides a few plotting libraries for you to get started based on their use case(s).  This list is focused on providing a few solid options for each case rather than overwhelming you with the variety of options available.

The foundation: Matplotlib, the most widely used plotting library, best for two-dimensional non-interactive plots. A possible replacement is pygal; it provides similar functionality but generates vector-graphics (SVG) output and has a more user-friendly interface.

Specific use cases:

  • Specialized statistical plots, like automatically fitting a linear regression with confidence interval or like scatter plots color-coded by category.

    • seaborn: it builds on top of Matplotlib, and it can also be used as a replacement for Matplotlib just for an easier way to specify color palettes and plotting aesthetics
  • Grammar of graphics plotting, if you find the interface of Matplotlib too verbose, Python provides packages based on a different paradigm of plot syntax based on R's ggplot2

    • ggplot: it provides similar functionality to Matplotlib and is also based on Matplotlib but provides a different interface.
    • altair: it has a simpler interface compared to ggplot and generates Javascript based plots easily embeddable into the Jupyter Notebook or exported as PNG.
  • Interactive plots, i.e. pan, zoom that work in the Jupyter Notebooks but also can be exported as Javascript to work standalone on a webpage.

    • bokeh: maintained by Continuum Analytics, the company behind Anaconda
    • plotly: is both a library and a cloud service where you can store and share your visualizations (it has free/paid accounts)
  • Interactive map visualization

    • folium: Creates HTML pages that include the Leaflet.js javascript plotting library to display data on top of maps.
    • plotly: it supports color-coded country/world maps embedded in the Jupyter Notebook.

  • Realtime plots that update with streaming data, even integrated in a dashboard with user interaction.

    • bokeh plot server: it is part of Bokeh but requires launching a separate Python process that takes care of responding to events from the User Interface or from streaming data updates.
  • 3D plots are not easy to interpret; it is worth first considering whether a combination of 2D plots could provide better insight into the data

    • mplot3d: Matplotlib toolkit for 3D visualization


+ Recent posts