Which factors contribute to the hapiness of a nation? 2022 is coming, let us look back the global hapiness over time!

In this post, we are studying the World Hapiness Report dataset, which is a landmark survey of the state of global happiness. The dataset is useful for interdisciplinary fields, ranging from economics, public health to psycholgy and development policy. The dataset contains 2 parts, the world hapiness report over time and the report for the year 2021.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model

import matplotlib.gridspec as gs
import seaborn as sns

df = pd.read_csv("world-happiness-report.csv")
df.head()

	Country name	year	Life Ladder	Log GDP per capita	Social support	Healthy life expectancy at birth	Freedom to make life choices	Generosity	Perceptions of corruption	Positive affect	Negative affect
0	Afghanistan	2008	3.724	7.370	0.451	50.80	0.718	0.168	0.882	0.518	0.258
1	Afghanistan	2009	4.402	7.540	0.552	51.20	0.679	0.190	0.850	0.584	0.237
2	Afghanistan	2010	4.758	7.647	0.539	51.60	0.600	0.121	0.707	0.618	0.275
3	Afghanistan	2011	3.832	7.620	0.521	51.92	0.496	0.162	0.731	0.611	0.267
4	Afghanistan	2012	3.783	7.705	0.521	52.24	0.531	0.236	0.776	0.710	0.268

df2021 = pd.read_csv("world-happiness-report-2021.csv")
df2021.head()

	Country name	Regional indicator	Ladder score	Standard error of ladder score	upperwhisker	lowerwhisker	Logged GDP per capita	Social support	Healthy life expectancy	Freedom to make life choices	Generosity	Perceptions of corruption	Ladder score in Dystopia	Explained by: Log GDP per capita	Explained by: Social support	Explained by: Healthy life expectancy	Explained by: Freedom to make life choices	Explained by: Generosity	Explained by: Perceptions of corruption	Dystopia + residual
0	Finland	Western Europe	7.842	0.032	7.904	7.780	10.775	0.954	72.0	0.949	-0.098	0.186	2.43	1.446	1.106	0.741	0.691	0.124	0.481	3.253
1	Denmark	Western Europe	7.620	0.035	7.687	7.552	10.933	0.954	72.7	0.946	0.030	0.179	2.43	1.502	1.108	0.763	0.686	0.208	0.485	2.868
2	Switzerland	Western Europe	7.571	0.036	7.643	7.500	11.117	0.942	74.4	0.919	0.025	0.292	2.43	1.566	1.079	0.816	0.653	0.204	0.413	2.839
3	Iceland	Western Europe	7.554	0.059	7.670	7.438	10.878	0.983	73.0	0.955	0.160	0.673	2.43	1.482	1.172	0.772	0.698	0.293	0.170	2.967
4	Netherlands	Western Europe	7.464	0.027	7.518	7.410	10.932	0.942	72.4	0.913	0.175	0.338	2.43	1.501	1.079	0.753	0.647	0.302	0.384	2.798

df.rename(columns={'Country name': 'Country'}, inplace=True)
df2021.rename(columns={'Country name': 'Country'}, inplace=True)

Explanatory data analysis for the year 2021

c1 = "mediumturquoise"
c2 = "lightpink"
c3 = "sandybrown"

high_c = c1
low_c = c2

Countries with the highest and lowest happiness index in 2021

#top 10 and lowesttom 10 countries in 2021 report
fig= plt.figure(figsize=(15,15))
g=gs.GridSpec(ncols=1, nrows=2, figure=fig)
plt.suptitle("Countries with the highest and lowest happiness index in 2021", family='Serif', weight='bold', size=20)
ax1=plt.subplot(g[0,0])

top_10=df2021.head(10)
lowest_10= df2021.tail(10)
ax1=sns.barplot(data=top_10, x=top_10['Ladder score'],
                y=top_10['Country']+ " (" + top_10['Regional indicator'] + ")", color = c1)
#ax1.set_xlabel('')
ax1.xaxis.set_visible(False)
ax1.annotate(" 10 countries with highest happiness index",xy=(8,2), family='Serif', weight='bold', size=12)
ax2=plt.subplot(g[1,0], sharex=ax1)
ax2=sns.barplot(data=lowest_10, x=lowest_10['Ladder score'],
                y=lowest_10['Country']+ " (" + lowest_10['Regional indicator'] + ")",  color = c2)
ax2.annotate(" 10 countries with lowest happiness index",xy=(8,2), family='Serif', weight='bold', size=12)
for s in ['left','right','top','bottom']:
    ax1.spines[s].set_visible(False)
    ax2.spines[s].set_visible(False)

png

Ladder score distribution by Regional indicator

fig=plt.figure(figsize=(15,8))
plt.title("Ladder score distribution by Regional indicator")
sns.kdeplot(df2021['Ladder score'], fill=True,hue=df2021['Regional indicator'], shade=True, linewidth=2, multiple='layer')
plt.axvline(df2021['Ladder score'].mean(), c='black',ls='--')
plt.text(x=df2021['Ladder score'].mean(),y=-0.01,s='Population mean', size=15)
for s in ['left','right','top','bottom']:
    plt.gca().spines[s].set_visible(False)

png

Correlaztion analysis for the year 2021

corr = df2021.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(15,15))
sns.heatmap(corr, mask=mask, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

png

#refined
continuous = ['Logged GDP per capita',
 'Social support',
 'Healthy life expectancy',
 'Freedom to make life choices',
 'Generosity',
 'Perceptions of corruption']

sns.pairplot(df2021, vars = continuous,  hue = 'Ladder score')

png

df2021[continuous].hist(bins=50, figsize=(20,15))
plt.show()

png

Correlation GDP-Hapiness index

# get GDP for the x-axis and Life Satisfaction for the y-axis
X = df2021['Logged GDP per capita'].values.reshape(-1, 1)
y = df2021['Ladder score'].values.reshape(-1, 1)
X.shape, y.shape

((149, 1), (149, 1))

# Select a Linear Model
model = sklearn.linear_model.LinearRegression()
# Train the Model
model.fit(X, y)

LinearRegression()

# let's visualize our model, because it's a linear one, we can plot it using two points
X = [[7], [11]]
y_hat = model.predict(X)
df2021.plot(kind='scatter', x='Logged GDP per capita', y='Ladder score')
plt.plot(X, y_hat, c='red')
plt.show()

png

Distribution of ladder score compared to other factors

happiness_mean = df2021['Ladder score'].mean()

df2021['lower_happy'] = df2021['Ladder score'].apply(lambda x: 0 if x < happiness_mean else 1)

background_color = '#fbfbfb'
fig = plt.figure(figsize=(12, 6), dpi=150,facecolor=background_color)
gs = fig.add_gridspec(2, 3)
gs.update(wspace=0.2, hspace=0.5)


plot = 0
for row in range(0, 2):
    for col in range(0, 3):
        locals()["ax"+str(plot)] = fig.add_subplot(gs[row, col])
        locals()["ax"+str(plot)].set_facecolor(background_color)
        locals()["ax"+str(plot)].tick_params(axis='y', left=False)
        locals()["ax"+str(plot)].get_yaxis().set_visible(False)
        locals()["ax"+str(plot)].set_axisbelow(True)
        for s in ["top","right","left"]:
            locals()["ax"+str(plot)].spines[s].set_visible(False)
        plot += 1

plot = 0

Yes = df2021[df2021['lower_happy'] == 1]
No = df2021[df2021['lower_happy'] == 0]


 
for variable in continuous:
        sns.kdeplot(Yes[variable], ax=locals()["ax"+str(plot)], color=high_c,ec='black', shade=True, linewidth=1.5, alpha=0.9, zorder=3, legend=False)
        sns.kdeplot(No[variable],ax=locals()["ax"+str(plot)], color=low_c, shade=True, ec='black',linewidth=1.5, alpha=0.9, zorder=3, legend=False)
        locals()["ax"+str(plot)].grid(which='major', axis='x', zorder=0, color='gray', linestyle=':', dashes=(1,5))
        locals()["ax"+str(plot)].set_xlabel(variable, fontfamily='monospace')
        plot += 1
        
Xstart, Xend = ax0.get_xlim()
Ystart, Yend = ax0.get_ylim()

ax0.text(Xstart, Yend+(Yend*0.5), 'Differences between happy & unhappy countries', fontsize=17, fontweight='bold', fontfamily='sansserif',color='#323232')
ax0.text(Xstart, Yend+(Yend*0.25), 'There are large differences, with GDP & Social Support being clear\nperhaps more interesting though, unhappy countries appear to be more generous', fontsize=12, fontweight='light', fontfamily='monospace',color='gray')

plt.show()

findfont: Font family ['sansserif'] not found. Falling back to DejaVu Sans.

png

A continental view

continent_score = df2021.groupby('Regional indicator')['Healthy life expectancy','Logged GDP per capita','Perceptions of corruption','Freedom to make life choices','Ladder score'].mean().reset_index()

background = "#fbfbfb"

fig, ax = plt.subplots(1,1, figsize=(10, 5),dpi=150)
fig.patch.set_facecolor(background) # figure background color

cmap = [c2,c1]

color_map = ['#e7e9e7' for _ in range(10)]
color_map[9] =  c1 # color highlight
color_map[5] =  c1
color_map[8] =  c2
color_map[6] =  c2

ax.set_facecolor(background)
sns.scatterplot(data=continent_score, x=continent_score['Healthy life expectancy'], y=continent_score['Ladder score'],hue=continent_score['Regional indicator'], alpha=0.9,ec='black',palette=color_map,size=df2021["Ladder score"], legend=False, sizes=(5, 600))

ax.set_xlabel("Life Expectancy",fontfamily='monospace',color='gray')
ax.set_ylabel("Happiness Index Score",fontfamily='monospace',color='gray')
ax.tick_params(axis = 'both', which = 'major', labelsize = 10)

for s in ["top","right","left"]:
    ax.spines[s].set_visible(False)
    
ax.text(55,7.5,'Happiness Score & Life Expectancy by Continent',fontfamily='sansserif',fontweight='normal',fontsize=17,weight='bold',color='#323232')
ax.text(55,7.3,'There are clear distinctions, with four stand-out continents',fontfamily='monospace',fontweight='light',fontsize=12,color='gray')

L = ax.legend(frameon=False,loc="upper center", bbox_to_anchor=(1.25, 0.8), ncol= 1)
plt.setp(L.texts, family='monospace')
L.get_frame().set_facecolor('none')

ax.tick_params(axis='both', which='both',left=False, bottom=False,labelbottom=True) 

for i, txt in enumerate(continent_score['Regional indicator']):
    ax.annotate(txt, (continent_score['Healthy life expectancy'][i]+0.5, continent_score['Ladder score'][i]),fontfamily='monospace')

plt.show()

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  """Entry point for launching an IPython kernel.
No handles with labels found to put in legend.

png

Happiness index over time

background = "#fbfbfb"
fig, ax = plt.subplots(1,1, figsize=(10, 5),dpi=150)
fig.patch.set_facecolor(background) # figure background color
ax.set_facecolor(background)

# Reduced list as too many to show all at once 
top_list_ = df.groupby('Country')['Life Ladder'].mean().sort_values(ascending=False).reset_index()[:20].sort_values(by='Life Ladder',ascending=True)


plot = 1
for country in top_list_['Country']:
    mean = df[df['Country'] == country].groupby('Country')['Life Ladder'].mean()
    # historic scores
    sns.scatterplot(data=df[df['Country'] == country], y=plot, x='Life Ladder',color='lightgray',s=50,ax=ax)
    # mean score
    sns.scatterplot(data=df[df['Country'] == country], y=plot, x=mean,color='gold',ec='black',linewidth=1,s=75,ax=ax)
    #2021 score
    sns.scatterplot(data=df2021[df2021['Country'] == country], y=plot, x='Ladder score',color=c1,ec='black',linewidth=1,s=75,ax=ax)   
    plot += 1


ax.set_yticks(top_list_.index+1)
ax.set_yticklabels(top_list_['Country'][::-1], fontdict={'horizontalalignment': 'right'}, alpha=0.7)
ax.tick_params(axis=u'both', which=u'both',length=0)
ax.set_xlabel("Happiness Index Score",fontfamily='monospace',color='gray')


for s in ['top','right','bottom','left']:
    ax.spines[s].set_visible(False)
    
Xstart, Xend = ax.get_xlim()
Ystart, Yend = ax.get_ylim()

ax.hlines(y=top_list_.index+1, xmin=Xstart, xmax=Xend, color='gray', alpha=0.5, linewidth=.3, linestyles='--')
ax.set_axisbelow(True)
ax.text(6.25, Yend+4.3, 'Happiness Index Scores through the years', fontsize=17, fontweight='bold', fontfamily='sansserif',color='#323232')
ax.text(6.25, Yend+0.75,
'''
Countries scores do vary over time, however they remain  farily consistent
It is interesting that Finland's 2021 score is amongst the highest of all time
''', fontsize=12, fontweight='light', fontfamily='monospace',color='gray')

plt.annotate('2021\nscore', xy=(7.842, 19), xytext=(8.2, 11),
             arrowprops=dict(facecolor='steelblue',arrowstyle="->",connectionstyle="arc3,rad=.3"), fontsize=10,fontfamily='monospace',ha='center', color=c1)

plt.annotate('Mean\nscore', xy=(7.6804, 20), xytext=(8.2, 16),
             arrowprops=dict(facecolor='steelblue',arrowstyle="->",connectionstyle="arc3,rad=.5"), fontsize=10,fontfamily='monospace',ha='center', color='gold')


plt.show()

png

A global view over time

import plotly.express as px

fig = px.choropleth(data_frame=df, locations='Country',
                    locationmode="country names",
                    color='Life Ladder',
                    animation_frame='year',
                    range_color = "",
                    #color_continuous_scale = [c2,c1])
                    color_continuous_scale = px.colors.sequential.RdBu)
fig.show()

fig.write_html("world_hapiness.html")

Regression

df.drop("year", axis=1).hist(bins=50, figsize=(20,15))
plt.show()

png

Create a test set

df_ = df2021[['Ladder score',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption']]

Transformation pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

df_ = num_pipeline.fit_transform(df_)
df_.shape

(149, 10)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df_, test_size=0.2, random_state=42)

train_set.shape, test_set.shape

((119, 10), (30, 10))

Select and Train Model

Linear Model

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_reg = LinearRegression()

lin_reg.fit(X=train_set[:, 1:], y=train_set[:,0])

LinearRegression()

from sklearn.metrics import mean_squared_error

predictions = lin_reg.predict(test_set[:, 1:])
lin_mse = mean_squared_error(test_set[:, 0], predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse 

0.00031220104713910665

Decision Tree

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth = 4)
tree_reg.fit(X=train_set[:, 1:], y=train_set[:,0])

DecisionTreeRegressor(max_depth=4)

predictions = tree_reg.predict(test_set[:, 1:])
lin_mse = mean_squared_error(test_set[:, 0], predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse 

0.1019944246040892

fig = plt.figure(figsize=(25,15))
sklearn.tree.plot_tree(tree_reg)
plt.show()

png

Cross-Validation

from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=tree_reg, X=train_set[:, 1:], y=train_set[:,0]
                         , scoring='neg_mean_squared_error', cv=10)

tree_rmse_scores = np.sqrt(-scores)
print("Scores:", tree_rmse_scores)
print("Mean:", tree_rmse_scores.mean())
print("Standard Deviation:", tree_rmse_scores.std())

Scores: [0.08063915 0.12074651 0.10401294 0.11853572 0.16611299 0.07206908
 0.11776827 0.13357675 0.1597485  0.25173151]
Mean: 0.13249414280237842
Standard Deviation: 0.04877781586110474

Ensemble Learning with the Random Forest

from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X=train_set[:, 1:], y=train_set[:,0])

RandomForestRegressor()

forest_mse = mean_squared_error(y_true=test_set[:, 0], y_pred=forest_reg.predict(X=test_set[:, 1:]))
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.04082603271734864

scores = cross_val_score(estimator=forest_reg, X=train_set[:, 1:], y=train_set[:,0]
                         , scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-scores)
print("Scores:", forest_rmse_scores)
print("Mean:", forest_rmse_scores.mean())
print("Standard Deviation:", forest_rmse_scores.std()) # much better now

Scores: [0.02182252 0.0849461  0.02198475 0.07533847 0.15627785 0.05930377
 0.03977674 0.05536409 0.06649303 0.25151086]
Mean: 0.08328181891050981
Standard Deviation: 0.06690200008334836

Finetune with Grid-Search

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(estimator=forest_reg, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, return_train_score=True, n_jobs=-1)
grid_search.fit(X=train_set[:, 1:], y=train_set[:,0])

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

cvres = grid_search.cv_results_

for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

34082522900363926 {'max_features': 2, 'n_estimators': 3}
24008412892225006 {'max_features': 2, 'n_estimators': 10}
21986291868367538 {'max_features': 2, 'n_estimators': 30}
17776709950308564 {'max_features': 4, 'n_estimators': 3}
1468019626867688 {'max_features': 4, 'n_estimators': 10}
12742906434054158 {'max_features': 4, 'n_estimators': 30}
14940029641388528 {'max_features': 6, 'n_estimators': 3}
11440546627455518 {'max_features': 6, 'n_estimators': 10}
12027962076790268 {'max_features': 6, 'n_estimators': 30}
10803897201257359 {'max_features': 8, 'n_estimators': 3}
12045244766329144 {'max_features': 8, 'n_estimators': 10}
0991261383049853 {'max_features': 8, 'n_estimators': 30}
19098266384099238 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
21873900029484047 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
17707483768792917 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
15222447554234794 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
15742856042032355 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
11996174232954583 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}

Randomized Search

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=9),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X=train_set[:, 1:], y=train_set[:,0])

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd1efe54250>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd1efea9590>},
                   random_state=42, scoring='neg_mean_squared_error')

cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1026364152276238 {'max_features': 7, 'n_estimators': 180}
11023158498480463 {'max_features': 5, 'n_estimators': 15}
14301237257745014 {'max_features': 3, 'n_estimators': 72}
10729242519657402 {'max_features': 5, 'n_estimators': 21}
10407094852494171 {'max_features': 7, 'n_estimators': 122}
14463386329172387 {'max_features': 3, 'n_estimators': 75}
14467805582633234 {'max_features': 3, 'n_estimators': 88}
10730669220705276 {'max_features': 5, 'n_estimators': 100}
1014822451619872 {'max_features': 8, 'n_estimators': 152}
14540216293970593 {'max_features': 3, 'n_estimators': 150}

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([5.62869082e-04, 4.81859114e-01, 5.08040907e-01, 4.28212124e-04,
       6.71086804e-04, 4.51735910e-04, 9.18665718e-04, 1.49157615e-03,
       5.57583298e-03])

Share on

Twitter Facebook LinkedIn

World Hapiness

Giao Nguyen-Quynh