In [183]:
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [184]:
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
In [185]:
# Load train data
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv")
# Remove column
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
df_train = df_train.drop(remove_columns, axis=1)

### Combine level of weather situation 
df_train['weather_situation'].value_counts()/len(df_train)*100 # percentage
# Combine level of caegorical variables that is < 5%
def cut_levels(x, threshold, new_value):
    percentage = x.value_counts()/len(x)*100
    labels = percentage.index[percentage < threshold]
    x[np.in1d(x, labels)] = new_value
cut_levels(df_train.weather_situation, 5, 'wind-fog-snow') 


# Apply similar step to test data 
df_test  = pd.read_csv("official_train_test_split/80-20/test_set.csv") 

df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage

cut_levels(df_test.weather_situation, 5, 'wind-fog-snow') 

### Remove highly correlated variables: 
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)
# Encoding categorical variables 
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix = cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix = cat_columns)


# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)
In [186]:
print(df_train.shape, df_test.shape)
(18507, 90) (4689, 90)
In [187]:
df_train.head(2)
Out[187]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action Log_new_cases_per_million Outputvalue_lag1 Outputvalue_lag2 Outputvalue_lag3 Outputvalue_lag4 Outputvalue_lag5 Outputvalue_lag6 Outputvalue_lag7 population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure weather_situation_clear-day weather_situation_clear-night weather_situation_cloudy weather_situation_partly-cloudy-day weather_situation_partly-cloudy-night weather_situation_rain weather_situation_wind-fog-snow Day_of_Week_0 Day_of_Week_1 Day_of_Week_2 Day_of_Week_3 Day_of_Week_4 Day_of_Week_5 Day_of_Week_6 continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 1.374803 1.559703 1.48817 1.504022 1.460146 1.314647 1.113776 1.308031 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 2.238746 2.290159 2.25147 2.148390 2.216469 2.174336 2.169545 2.279605 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
In [188]:
df_train.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)

df_test.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)
In [189]:
df_train.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)

df_test.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)
In [190]:
cols = ['Outputvalue_lag1','Outputvalue_lag2',
              'Outputvalue_lag3','Outputvalue_lag4',
              'Outputvalue_lag5','Outputvalue_lag6',
              'Outputvalue_lag7']
df_train.drop(cols,axis=1, inplace=True)
df_test.drop(cols,axis=1, inplace=True)
df_train.head(2)
Out[190]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action Log_new_cases_per_million population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 1.374803 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 2.238746 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
In [191]:
y_train = df_train['Log_new_cases_per_million']
X_train = df_train.drop('Log_new_cases_per_million', axis=1)

y_test = df_test['Log_new_cases_per_million']
X_test = df_test.drop('Log_new_cases_per_million', axis=1)
In [192]:
X_train.head(2)
Out[192]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0

Train Models

In [194]:
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('xgboost',xgboost.XGBRegressor()))
    level0.append(('ranFor',RandomForestRegressor(random_state=0)))
    level0.append(('graBoost',GradientBoostingRegressor(random_state=0)))
    level0.append(('ridge', Ridge(alpha = 10,tol=0.001)))
    level0.append(('lasso', Lasso(alpha = 0.001)))
    level0.append(('linear_reg', LinearRegression()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    return model

# get a list of models to evaluate
def get_models():
    models = dict()
    models['xgboost'] = xgboost.XGBRegressor()
    models['ranFor'] = RandomForestRegressor(random_state=0)
    models['graBoost'] = GradientBoostingRegressor(random_state=0)
    models['ridge'] = Ridge(alpha = 0.001)
    models['lasso'] = Lasso(alpha = 0.001)
    models['linear_reg'] = LinearRegression()
    models['stacking'] = get_stacking()
    return models

R2

In [195]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline
>xgboost 0.809 (0.011)
>ranFor 0.815 (0.010)
>graBoost 0.705 (0.010)
>ridge 0.446 (0.017)
>lasso 0.446 (0.017)
>linear_reg 0.446 (0.017)
>stacking 0.820 (0.010)

Root mean squared error

In [196]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline
>xgboost -0.392 (0.012)
>ranFor -0.385 (0.012)
>graBoost -0.487 (0.010)
>ridge -0.667 (0.014)
>lasso -0.667 (0.014)
>linear_reg -0.667 (0.014)
>stacking -0.380 (0.012)

Residual + R2 plot

In [197]:
model = RandomForestRegressor(random_state=0)
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.show()
Out[197]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7364fbdf10>

Feature Importance

In [198]:
model = RandomForestRegressor(random_state=0)
from yellowbrick.model_selection import FeatureImportances
plt.figure(figsize=(8,24))
viz = FeatureImportances(model,colormap='hot_r')
viz.fit(X_train, y_train)
viz.show()
Out[198]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7366abf410>

Test Set Evaluation

R2

RandomForestRegressor

In [199]:
from sklearn.inspection import plot_partial_dependence
reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
from sklearn.metrics import r2_score
y_pred = reg.predict(X_test)
r2_score(y_test, y_pred)
Out[199]:
0.5991532062914151

stacking

In [200]:
y_pred = models['stacking'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)
Out[200]:
0.6120707964634038

xgboost

In [201]:
y_pred = models['xgboost'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)
Out[201]:
0.5757600991254161

graBoost

In [202]:
y_pred = models['graBoost'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)
Out[202]:
0.5254250706506605

ridge

In [203]:
y_pred = models['ridge'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)
Out[203]:
0.37777324224138

lasso

In [204]:
y_pred = models['lasso'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)
Out[204]:
0.37485859551475054

linear_reg

In [205]:
y_pred = models['linear_reg'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)
Out[205]:
0.37777327990805154

RMSE

RandomForestRegressor

In [206]:
import math
from sklearn.metrics import mean_squared_error
y_pred = reg.predict(X_test)
math.sqrt(mean_squared_error(y_test, y_pred))
Out[206]:
0.583818428841278

eli5: Permutation Importance show weights

In [207]:
import eli5
from eli5.sklearn import PermutationImportance
In [208]:
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
perm = PermutationImportance(model).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_train.columns.tolist(), top=50)
Out[208]:
Weight Feature
0.4624 ± 0.0211 urbanPopulation
0.1408 ± 0.0067 cvd_death_rate
0.0871 ± 0.0075 c4_4_action
0.0869 ± 0.0100 e1_0_action
0.0857 ± 0.0163 hospital_beds_per_thousand
0.0524 ± 0.0095 male_smokers
0.0494 ± 0.0024 population_density
0.0250 ± 0.0049 e2_2_action
0.0204 ± 0.0052 c2_0_action
0.0168 ± 0.0027 c3_2_action
0.0154 ± 0.0008 life_expectancy
0.0153 ± 0.0015 gdp_per_capita
0.0133 ± 0.0017 Number of Tweet
0.0117 ± 0.0025 aged_65_older_sum
0.0107 ± 0.0006 c7_2_action
0.0102 ± 0.0021 h2_2_action
0.0101 ± 0.0022 c1_3_action
0.0095 ± 0.0063 c8_4_action
0.0076 ± 0.0024 e1_1_action
0.0066 ± 0.0031 h2_3_action
0.0065 ± 0.0025 h3_1_action
0.0056 ± 0.0011 c6_2_action
0.0053 ± 0.0018 c2_2_action
0.0049 ± 0.0013 c6_1_action
0.0049 ± 0.0013 handwashing_facilities
0.0045 ± 0.0021 temperature
0.0035 ± 0.0004 c5_0_action
0.0032 ± 0.0018 Thursday
0.0030 ± 0.0012 healthExpenditure
0.0029 ± 0.0006 female_smokers
0.0028 ± 0.0009 c5_2_action
0.0025 ± 0.0004 humidity
0.0018 ± 0.0001 h1_1_action
0.0018 ± 0.0010 c5_1_action
0.0017 ± 0.0009 h3_2_action
0.0017 ± 0.0004 c1_0_action
0.0015 ± 0.0002 continent_Europe
0.0015 ± 0.0011 h2_1_action
0.0013 ± 0.0009 c7_0_action
0.0012 ± 0.0005 Days since first case
0.0012 ± 0.0004 c3_1_action
0.0011 ± 0.0006 Wednesday
0.0011 ± 0.0006 c2_3_action
0.0010 ± 0.0005 Monday
0.0009 ± 0.0010 continent_Asia
0.0009 ± 0.0002 Tuesday
0.0008 ± 0.0014 c4_3_action
0.0007 ± 0.0002 c1_1_action
0.0006 ± 0.0011 c2_1_action
0.0005 ± 0.0006 Saturday
… 32 more …

eli5 explains weights

In [209]:
eli5.explain_weights_df(perm, feature_names=X_train.columns.to_list())\
  .head(50).style.background_gradient(subset=['weight'])
Out[209]:
feature weight std
0 urbanPopulation 0.462448 0.0105442
1 cvd_death_rate 0.140767 0.00336957
2 c4_4_action 0.0870728 0.00376409
3 e1_0_action 0.0869415 0.00500541
4 hospital_beds_per_thousand 0.085703 0.00816721
5 male_smokers 0.0523851 0.00476707
6 population_density 0.0493625 0.00121114
7 e2_2_action 0.0249648 0.00242754
8 c2_0_action 0.0204493 0.00257985
9 c3_2_action 0.0167923 0.0013408
10 life_expectancy 0.0154441 0.000405461
11 gdp_per_capita 0.015254 0.000770447
12 Number of Tweet 0.0133228 0.000834265
13 aged_65_older_sum 0.0116758 0.00124254
14 c7_2_action 0.0106635 0.000290369
15 h2_2_action 0.0101597 0.00105717
16 c1_3_action 0.0101119 0.00109259
17 c8_4_action 0.00949325 0.00313744
18 e1_1_action 0.00761092 0.00119672
19 h2_3_action 0.00662889 0.00155727
20 h3_1_action 0.00650767 0.00126211
21 c6_2_action 0.00560097 0.000530179
22 c2_2_action 0.00526331 0.000920847
23 c6_1_action 0.00494054 0.000635613
24 handwashing_facilities 0.00492399 0.000627038
25 temperature 0.00449942 0.0010565
26 c5_0_action 0.00353339 0.000198231
27 Thursday 0.00318915 0.000893301
28 healthExpenditure 0.00297727 0.000575215
29 female_smokers 0.00286985 0.000302607
30 c5_2_action 0.00278839 0.000463958
31 humidity 0.00245344 0.000207741
32 h1_1_action 0.00182261 3.6703e-05
33 c5_1_action 0.00175089 0.000491127
34 h3_2_action 0.00168831 0.000439305
35 c1_0_action 0.00165642 0.000179647
36 continent_Europe 0.0015024 0.000106517
37 h2_1_action 0.00145065 0.000553079
38 c7_0_action 0.00127572 0.000448311
39 Days since first case 0.0012389 0.000266965
40 c3_1_action 0.00118112 0.000213671
41 Wednesday 0.00109696 0.000298284
42 c2_3_action 0.00106179 0.000300479
43 Monday 0.00104755 0.000259711
44 continent_Asia 0.000900208 0.000476615
45 Tuesday 0.00087217 8.8122e-05
46 c4_3_action 0.000806217 0.000683279
47 c1_1_action 0.000736951 8.16205e-05
48 c2_1_action 0.000587083 0.000540738
49 Saturday 0.000520235 0.00030409

eli5 shows prediction

In [210]:
eli5.show_prediction(model, X_test.iloc[0,:],show_feature_values=True)
Out[210]:

y (score 0.109) top features

Contribution? Feature Value
+0.460 <BIAS> 1.000
+0.284 h3_1_action 152.000
+0.193 cvd_death_rate 597.029
+0.092 population_density 54.422
+0.086 c3_2_action 147.000
+0.063 windSpeed 2.920
+0.051 Days since first case 151.000
+0.043 humidity 0.290
+0.041 healthExpenditure 0.600
+0.032 gdp_per_capita 1803.987
+0.029 diabetes_prevalence 9.590
+0.019 handwashing_facilities 37.746
+0.011 Monday 0.000
+0.008 life_expectancy 64.830
+0.008 c8_4_action 0.000
+0.007 female_smokers 4.805
+0.006 c6_2_action 123.000
+0.005 aged_65_older_sum 1.959
+0.004 c6_1_action 0.000
+0.004 Sentiments 0.002
+0.003 e2_1_action 0.000
+0.002 male_smokers 39.697
+0.002 c7_1_action 0.000
+0.002 Saturday 1.000
+0.001 Thursday 0.000
+0.001 Wednesday 0.000
+0.000 cloudy 0.000
+0.000 Tuesday 0.000
-0.000 clear-day 1.000
-0.001 Friday 0.000
-0.001 c2_2_action 0.000
-0.002 c4_3_action 0.000
-0.009 Sunday 0.000
-0.012 h2_1_action 152.000
-0.014 c5_1_action 0.000
-0.017 hospital_beds_per_thousand 0.500
-0.017 Number of Tweet 82.182
-0.023 c2_3_action 124.000
-0.024 e1_1_action 0.000
-0.027 c8_1_action 47.000
-0.029 e2_2_action 0.000
-0.036 c7_2_action 105.000
-0.039 c5_2_action 120.000
-0.053 e2_0_action 152.000
-0.063 c1_3_action 134.000
-0.067 c4_4_action 112.000
-0.082 h2_2_action 0.000
-0.144 urbanPopulation 25.754
-0.286 e1_0_action 152.000
-0.402 temperature 79.410

PDP

In [481]:
from sklearn.inspection import plot_partial_dependence
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
Out[481]:
RandomForestRegressor(random_state=0)
In [485]:
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]
column_index(X_test, ['urbanPopulation', 'e1_0_action', 'c4_4_action','cvd_death_rate','hospital_beds_per_thoudsand','male_smokers','c3_2_action'])
Out[485]:
array([60, 31, 16, 47, 50, 52, 11])
In [495]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[60])  
plt.savefig('Figures_high_quality/PDP_urbanPopulation.pdf')
<Figure size 1600x800 with 0 Axes>
In [496]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[31])  
plt.savefig('Figures_high_quality/PDP_e1_0_action.pdf')
<Figure size 1600x800 with 0 Axes>
In [497]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[16])  
plt.savefig('Figures_high_quality/PDP_c4_4_action.pdf')
<Figure size 1600x800 with 0 Axes>
In [498]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[47])  
plt.savefig('Figures_high_quality/PDP_cvd_death_rate.pdf')
<Figure size 1600x800 with 0 Axes>
In [499]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[50])  
plt.savefig('Figures_high_quality/PDP_hospital_beds_per_thoudsand.pdf')
<Figure size 1600x800 with 0 Axes>
In [500]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[52])  
plt.savefig('Figures_high_quality/PDP_male_smokers.pdf')
<Figure size 1600x800 with 0 Axes>
In [501]:
figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[11])  
plt.savefig('Figures_high_quality/PDP_c3_2_action.pdf')
<Figure size 1600x800 with 0 Axes>

SHAP (SHapley Additive exPlanations)

Feature Contribution over each row

In [229]:
import shap

model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
print(shap_values.shape)
(4689, 82)
In [230]:
pd.DataFrame(X_train.iloc[0,:]).transpose()
Out[230]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138.0 0.0 0.0 30.0 0.0 0.0 28.0 0.0 0.0 0.0 28.0 0.0 0.0 0.0 44.0 0.0 0.0 65.0 0.0 0.0 0.0 7.0 0.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 107.0 0.0 0.0 0.0 0.0 78.0 0.0 0.0 106.0 0.0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.77 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
In [231]:
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])
Out[231]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

The above explanation shows features each contributing to push the model output from the base value to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue. The prediction starts from the baseline.

The baseline for Shapley values is the average of all predictions. The base_value here is 0.4597 while our predicted value is 0.61. Month=7 has the highest impact on increasing the prediction. Wheras, e1_0_action has the highest negative impact (decreasing the prediction).

Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue

In [232]:
pd.DataFrame(X_test.iloc[30,:]).transpose()
Out[232]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
30 181.0 0.0 0.0 20.0 0.0 0.0 0.0 21.0 0.0 10.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 150.0 0.0 0.0 153.0 0.0 0.0 7.0 0.0 0.0 77.0 0.0 0.0 182.0 0.0 182.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 0.0 7.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.61 75.58 2.69 1.0 0.053264 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [233]:
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[30,:], X_test.iloc[30,:])
Out[233]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [234]:
pd.DataFrame(X_test.iloc[300,:]).transpose()
Out[234]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
300 149.0 0.0 0.0 0.0 147.0 0.0 0.0 92.0 0.0 0.0 0.0 136.0 0.0 0.0 0.0 0.0 136.0 0.0 0.0 37.0 0.0 0.0 0.0 37.0 0.0 0.0 131.0 0.0 0.0 0.0 103.0 0.0 0.0 150.0 0.0 0.0 0.0 0.0 0.0 0.0 147.0 0.0 0.0 0.0 150.0 119.309 15847.419 559.812 7.11 83.241 4.7 0.3 42.5 73.0 4.9445 0.67 84.6 9.99 1.0 -0.015717 56.031 1.004913 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [235]:
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[300,:], X_test.iloc[300,:])
Out[235]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [236]:
pd.DataFrame(X_test.iloc[3000,:]).transpose()
Out[236]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
3000 128.0 0.0 0.0 61.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 130.0 0.0 0.0 0.0 130.0 0.0 129.0 0.0 0.0 84.0 0.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 123.0 0.0 123.0 0.0 0.0 0.0 0.0 93.0 0.0 0.0 0.0 0.0 110.0 15.196 2014.306 268.024 2.42 52.232 0.1 1.6 23.0 59.31 2.0025 0.99 72.57 6.47 1.0 0.0 43.136 1.325374 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [237]:
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[3000,:], X_test.iloc[3000,:])
Out[237]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [238]:
# load JS visualization code to notebook
shap.initjs()
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values, X_test)
Out[238]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [424]:
# shap.save_html('Figures_high_quality/dependen_plot_c3_2.html', shap.dependence_plot("c3_2_action", shap_values, X_test))

Features that are pink contribute to the model output being higher, that means predicting a success of the project. Blue parts of the visualization indicate a lower model output, predicting a failed project.

Features pushing the prediction higher are shown in pink, those pushing the prediction lower are in blue

Feature Importance

In [239]:
shap.summary_plot(shap_values, X_test, plot_type="bar")

To get an overview of which features are most important for a model we can plot the SHAP values of every feature for every sample. The summary plot tells which features are most important, and also their range of effects over the dataset.

In [240]:
# load JS visualization code to notebook
shap.initjs()
# summarize the effects of all the features
shap.summary_plot(shap_values, X_test)

The summary plot combines feature importance with feature effects. In this summary plot, the order of the columns still represents the amount of information the column is accountable for in the prediction. Each dot in the visualization represents one prediction. The color is related to the real data point. If the actual value in the dataset was high, the color is pink; blue indicates the actual value being low. The x-axis represents the SHAP value, which is the impact on the model output. The model output 1 equates to the prediction of successful; 0 the prediction that the project is going to fail.

Decision Plot

0

In [241]:
pd.DataFrame(X_test.iloc[0,:]).transpose()
Out[241]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 151.0 0.0 0.0 0.0 134.0 0.0 0.0 0.0 124.0 0.0 0.0 147.0 0.0 0.0 0.0 0.0 112.0 0.0 0.0 120.0 0.0 0.0 123.0 0.0 0.0 0.0 105.0 0.0 47.0 0.0 0.0 152.0 0.0 152.0 0.0 0.0 0.0 0.0 0.0 152.0 0.0 0.0 0.0 152.0 0.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.29 79.41 2.92 82.181818 0.002409 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [242]:
# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 

shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[0,:],
    features=X_test.iloc[0,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)
In [ ]:
shap.decision_plot(explainer.expected_value, shap_values, X_display.loc[features.index])

25

In [243]:
pd.DataFrame(X_test.iloc[25,:]).transpose()
Out[243]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
25 176.0 0.0 0.0 15.0 0.0 0.0 0.0 16.0 0.0 5.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 145.0 0.0 0.0 148.0 0.0 0.0 2.0 0.0 0.0 72.0 0.0 0.0 177.0 0.0 177.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 2.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.32 76.96 2.22 1.0 0.319582 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [244]:
# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[25,:],
    features=X_test.iloc[25,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

250

In [245]:
pd.DataFrame(X_test.iloc[250,:]).transpose()
Out[245]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
250 209.0 0.0 0.0 0.0 19.0 0.0 0.0 0.0 20.0 0.0 0.0 52.0 0.0 0.0 0.0 0.0 45.0 0.0 96.0 0.0 0.0 0.0 0.0 20.0 0.0 0.0 156.0 0.0 0.0 0.0 155.0 0.0 163.0 0.0 0.0 150.0 0.0 0.0 0.0 0.0 0.0 117.0 0.0 0.0 64.0 3.202 44648.71 107.791 5.07 65.386 3.84 13.0 16.5 83.44 12.8165 0.86 44.46 15.27 3.0 -0.010368 86.124 6.344081 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
In [246]:
# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 

row = 2500
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[250,:],
    features=X_test.iloc[250,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

2500

In [247]:
pd.DataFrame(X_test.iloc[2500,:]).transpose()
Out[247]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
2500 182.0 0.0 0.0 0.0 144.0 0.0 0.0 134.0 0.0 58.0 0.0 0.0 0.0 0.0 58.0 0.0 0.0 183.0 0.0 0.0 183.0 0.0 0.0 0.0 103.0 0.0 0.0 0.0 0.0 112.0 0.0 0.0 0.0 183.0 0.0 0.0 0.0 0.0 0.0 135.0 0.0 0.0 0.0 135.0 0.0 90.672 3645.07 270.892 4.0 66.229 0.8 2.0 33.7 69.82 3.3985 0.6 92.96 4.84 1.0 -0.02828 23.805 1.408793 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [248]:
# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 

row = 2500
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[2500,:],
    features=X_test.iloc[2500,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

Dependence Plot

Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extenstion of the classical parital dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature’s value was NaN.

In [480]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("urbanPopulation", shap_values, X_test)
In [250]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("e1_0_action", shap_values, X_test)

shap.common.approximate_interactions is used to pick what seems to be the strongest interaction.

In [251]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("cvd_death_rate", shap_values, X_test)
In [252]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("gdp_per_capita", shap_values, X_test)
In [448]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("c4_4_action", shap_values, X_test)
In [450]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("male_smokers", shap_values, X_test)
In [255]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("hospital_beds_per_thousand", shap_values, X_test)
In [425]:
# create a dependence plot to show the effect of a single feature across the whole dataset

shap.dependence_plot("c3_2_action", shap_values, X_test)
# plt.savefig('Figures_high_quality/dependence_plot_c3_2.pdf')
<Figure size 432x288 with 0 Axes>

Interaction Between Bariables

In [428]:
X_train.head(2)
Out[428]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0

urbanPopulation & cvd_death_rate

In [474]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['urbanPopulation', 'cvd_death_rate']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['urbanPopulation', 'cvd_death_rate'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [472]:
inter1 = pdp.pdp_interact(
    model=model, dataset=X_train[X_train.columns], 
    model_features=X_train.columns, features=['urbanPopulation', 'cvd_death_rate'])
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1, feature_names=['urbanPopulation', 'cvd_death_rate'], 
    plot_type='grid', x_quantile=True, plot_pdp=True)
In [465]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['urbanPopulation', 'cvd_death_rate'], feature_names=['urbanPopulation', 'cvd_death_rate'])

e1_0_action & healthExpenditure

In [455]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['e1_0_action', 'healthExpenditure']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['e1_0_action', 'healthExpenditure'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [466]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['e1_0_action', 'healthExpenditure'], feature_names=['e1_0_action', 'healthExpenditure'])

gdp_per_capita & cvd_death_rate

In [456]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['gdp_per_capita', 'cvd_death_rate']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['gdp_per_capita', 'cvd_death_rate'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [467]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['gdp_per_capita', 'cvd_death_rate'], feature_names=['gdp_per_capita', 'cvd_death_rate'])

c4_4_action & diabetes_prevalence

In [459]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['c4_4_action', 'diabetes_prevalence']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['c4_4_action', 'diabetes_prevalence'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [468]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['c4_4_action', 'diabetes_prevalence'], feature_names=['c4_4_action', 'diabetes_prevalence'])

male_smokers && gdp_per_capita

In [460]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['male_smokers', 'gdp_per_capita']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['male_smokers', 'gdp_per_capita'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [469]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['male_smokers', 'gdp_per_capita'], feature_names=['male_smokers', 'gdp_per_capita'])

hospital_beds_per_thousand & male_smokers

In [461]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['male_smokers', 'hospital_beds_per_thousand']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['male_smokers', 'hospital_beds_per_thousand'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [470]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['male_smokers', 'hospital_beds_per_thousand'], feature_names=['male_smokers', 'hospital_beds_per_thousand'])

c3_2_action & urbanPopulation

In [462]:
inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['c3_2_action', 'urbanPopulation']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['c3_2_action', 'urbanPopulation'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)
In [471]:
fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['c3_2_action', 'urbanPopulation'], feature_names=['c3_2_action', 'urbanPopulation'])

Local Surrogate (LIME)

LIME basically tries to step away from deriving the importance of global features and instead approximates the importance of features for local predictions. It does so by taking the row (or set of data points) from which to predict and generate fake data based on that row. It then calculates the similarity between the fake data and the real data and approximates the effect of the changes based on the similarity between the fake and real data

In [258]:
import lime
import lime.lime_tabular
In [259]:
explainer1 = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, verbose=True, mode='regression')

25

In [260]:
pd.DataFrame(X_test.iloc[25,:]).transpose()
Out[260]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
25 176.0 0.0 0.0 15.0 0.0 0.0 0.0 16.0 0.0 5.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 145.0 0.0 0.0 148.0 0.0 0.0 2.0 0.0 0.0 72.0 0.0 0.0 177.0 0.0 177.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 2.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.32 76.96 2.22 1.0 0.319582 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [406]:
i = 25
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_3000.pdf')
Intercept 0.4629940975312947
Prediction_local [0.02338742]
Right: 0.43631627172114307
<Figure size 1600x800 with 0 Axes>

blue -> negative influence

orange -> positive influence

250

In [263]:
pd.DataFrame(X_test.iloc[250,:]).transpose()
Out[263]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
250 209.0 0.0 0.0 0.0 19.0 0.0 0.0 0.0 20.0 0.0 0.0 52.0 0.0 0.0 0.0 0.0 45.0 0.0 96.0 0.0 0.0 0.0 0.0 20.0 0.0 0.0 156.0 0.0 0.0 0.0 155.0 0.0 163.0 0.0 0.0 150.0 0.0 0.0 0.0 0.0 0.0 117.0 0.0 0.0 64.0 3.202 44648.71 107.791 5.07 65.386 3.84 13.0 16.5 83.44 12.8165 0.86 44.46 15.27 3.0 -0.010368 86.124 6.344081 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
In [405]:
i = 250
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_250.pdf')
Intercept 0.15037628302273087
Prediction_local [0.83037792]
Right: 0.18410775678020885
<Figure size 1600x800 with 0 Axes>

2500

In [265]:
pd.DataFrame(X_test.iloc[2500,:]).transpose()
Out[265]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
2500 182.0 0.0 0.0 0.0 144.0 0.0 0.0 134.0 0.0 58.0 0.0 0.0 0.0 0.0 58.0 0.0 0.0 183.0 0.0 0.0 183.0 0.0 0.0 0.0 103.0 0.0 0.0 0.0 0.0 112.0 0.0 0.0 0.0 183.0 0.0 0.0 0.0 0.0 0.0 135.0 0.0 0.0 0.0 135.0 0.0 90.672 3645.07 270.892 4.0 66.229 0.8 2.0 33.7 69.82 3.3985 0.6 92.96 4.84 1.0 -0.02828 23.805 1.408793 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [404]:
i = 2500
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_2500.pdf')
Intercept 0.32360037315707413
Prediction_local [0.17171314]
Right: -0.5648482808748628
<Figure size 1600x800 with 0 Axes>

3000

In [267]:
pd.DataFrame(X_test.iloc[3000,:]).transpose()
Out[267]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
3000 128.0 0.0 0.0 61.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 130.0 0.0 0.0 0.0 130.0 0.0 129.0 0.0 0.0 84.0 0.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 123.0 0.0 123.0 0.0 0.0 0.0 0.0 93.0 0.0 0.0 0.0 0.0 110.0 15.196 2014.306 268.024 2.42 52.232 0.1 1.6 23.0 59.31 2.0025 0.99 72.57 6.47 1.0 0.0 43.136 1.325374 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [403]:
i = 3000
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_3000.pdf')
Intercept 0.42599367919267617
Prediction_local [0.07624511]
Right: -0.10981900162251022
<Figure size 1600x800 with 0 Axes>

4688

In [269]:
pd.DataFrame(X_test.iloc[4688,:]).transpose()
Out[269]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
4688 178.0 0.0 0.0 85.0 0.0 0.0 0.0 123.0 0.0 0.0 0.0 169.0 0.0 0.0 0.0 0.0 14.0 0.0 159.0 0.0 0.0 0.0 51.0 0.0 0.0 14.0 0.0 0.0 0.0 0.0 159.0 0.0 133.0 0.0 0.0 95.0 0.0 0.0 0.0 0.0 0.0 123.0 0.0 0.0 178.0 46.754 12294.876 200.38 5.52 43.993 2.32 8.1 33.2 64.13 4.1985 0.62 43.86 4.85 148.0 0.000437 66.856 4.35277 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [391]:
i = 4688
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=10)
exp.show_in_notebook(show_table=True)
Intercept 0.10019342237355466
Prediction_local [0.74153209]
Right: 0.5718384449800371

Individual Conditional Expectation (ICE)

How does the prediction change when 1 feature changes ? Individual Conditional Expectation, as its name suggests, is a plot that shows how a change in an individual feature changes the outcome of each individual prediction (one line per prediction). It can be used for regression tasks only.

In [271]:
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
Out[271]:
RandomForestRegressor(random_state=0)
In [272]:
from pycebox.ice import ice, ice_plot
ice_df = ice(X_train, 'urbanPopulation', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('urbanPopulation');
In [273]:
from pycebox.ice import ice, ice_plot
ice_df = ice(X_train, 'gdp_per_capita', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('gdp_per_capita');

if gdp_per_capita is higher than 20000, then

In [274]:
ice_df = ice(X_train, 'e1_0_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('e1_0_action');
In [275]:
ice_df = ice(X_train, 'c4_4_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('c4_4_action');
In [276]:
ice_df = ice(X_train, 'cvd_death_rate', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('cvd_death_rate');
In [277]:
ice_df = ice(X_train, 'hospital_beds_per_thousand', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('hospital_beds_per_thousand');
In [278]:
ice_df = ice(X_train, 'male_smokers', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('male_smokers');
In [279]:
ice_df = ice(X_train, 'c3_2_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('c3_2_action');

Weather_situation

In [283]:
from pdpbox import pdp, get_dataset, info_plots
import matplotlib.font_manager
ross_data = df_train
ross_features = X_train.columns
ross_model = model
ross_target = "Log_new_cases_per_million"
In [361]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature=['clear-day', 'clear-night', 'cloudy', 'partly-cloudy-day','partly-cloudy-night','rain','wind-fog-snow'], 
    feature_name='Weather Situation', target=ross_target,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/weather_situation_target_plot.pdf')
In [419]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday','Sunday'], 
    feature_name='Day of week', target=ross_target,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/Day_of_week_target_plot.pdf')
In [363]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature=['continent_Africa', 'continent_Asia', 'continent_North America', 'continent_Oceania', 'continent_South America', 'continent_Europe'], 
    feature_name='Continents', target=ross_target,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/continents_target_plot.pdf')

gdp_per_capita

In [364]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, 
    feature='gdp_per_capita', 
    feature_name='gdp_per_capita', 
    target=ross_target,
    percentile_range=(0,100), 
    show_percentile=True,
    show_outliers = True,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/gdp_per_capita_target_plot.pdf')
In [365]:
fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='gdp_per_capita',
    feature_name='gdp_per_capita', 
    show_percentile=True,
    figsize = (20,10),
    predict_kwds={}
)
plt.savefig('Figures_high_quality/gdp_per_capita_actual_plot.pdf')
In [366]:
pdp_gdp = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='gdp_per_capita'
)
fig, axes = pdp.pdp_plot(pdp_gdp, 'gdp_per_capita')

fig, axes = pdp.pdp_plot(
    pdp_gdp, 'gdp_per_capita', 
    plot_lines=True, 
    frac_to_plot=2000, 
    x_quantile=True,
    figsize = (20,10), 
    plot_pts_dist=True, 
    show_percentile=True
)
plt.savefig('Figures_high_quality/gdp_per_capita_pdp_plot.pdf')

On the x-axis, you can find the values of the feature you are trying to understand, i.e., gdp_per_capita. On the y-axis, you find the prediction (in the case of regression, it is the real-valued prediction).

The bar on the bottom represents the distribution of training data points in different quantiles. It helps us gauge the goodness of the inference.

The single line in the PD plot shows the average functional relationship between the feature and target. All the lines in the ICE plot show the heterogeneity in the training data, i.e., how all the observations in the training data vary with the different values of the gdp_per_capita

The yellow and black line gives us the average effect on the predictions.

In [372]:
fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['gdp_per_capita', ['e1_0_action','cvd_death_rate', 'c4_4_action', 'c7_2_action','urbanPopulation']], 
    feature_names=['gdp_per_capita', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/gdp_per_capita_target_plot_interact.pdf')

Interaction between gdp_per_capita and different actions ('e1_0_action','cvd_death_rate', 'c4_4_action', 'c7_2_action')

The darker color means the higher prediction value (log_total_new_cases_milion) & the bubble size is of less importance

e1_0_action

In [367]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature='e1_0_action', 
    feature_name='e1_0_action',
    target=ross_target,
    show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_target_plot.pdf')
In [368]:
fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='e1_0_action',
    feature_name='e1_0_action', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_actual_plot.pdf')
In [370]:
pdp_gdp = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='e1_0_action'
)
fig, axes = pdp.pdp_plot(pdp_gdp, 'e1_0_action')

fig, axes = pdp.pdp_plot(
    pdp_gdp, 'e1_0_action', plot_lines=True, frac_to_plot=2000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_pdp_plot.pdf')
In [371]:
fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['e1_0_action', ['gdp_per_capita','c4_4_action', 'c7_2_action', 'cvd_death_rate','urbanPopulation']], 
    feature_names=['e1_0_action', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_target_plot_interact.pdf')

c4_4_action

In [376]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature='c4_4_action', feature_name='c4_4_action', target=ross_target, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_target_plot.pdf')
In [378]:
fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='c4_4_action',
    feature_name='c4_4_action', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_actual_plot.pdf')
In [379]:
pdp_h1 = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='c4_4_action'
)
fig, axes = pdp.pdp_plot(pdp_gdp, 'c4_4_action')

fig, axes = pdp.pdp_plot(
    pdp_h1, 'c4_4_action', plot_lines=True, frac_to_plot=3000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_actual_plot.pdf')
In [380]:
fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['c4_4_action', ['gdp_per_capita','e1_0_action', 'c7_2_action', 'cvd_death_rate','urbanPopulation']], 
    feature_names=['c4_4_action', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_pdp_plot.pdf')

cvd_death_rate

In [381]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data,
    feature='cvd_death_rate',
    feature_name='cvd_death_rate', 
    target=ross_target,
    show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_target_plot.pdf')
In [382]:
fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='cvd_death_rate',
    feature_name='cvd_death_rate', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_actual_plot.pdf')
In [383]:
pdp_cvd = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='cvd_death_rate'
)

fig, axes = pdp.pdp_plot(pdp_gdp, 'cvd_death_rate')

fig, axes = pdp.pdp_plot(
    pdp_cvd, 'cvd_death_rate', plot_lines=True, frac_to_plot=2000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_pdp_plot.pdf')
In [384]:
fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['cvd_death_rate', ['e1_0_action','gdp_per_capita','c4_4_action', 'urbanPopulation']], 
    feature_names=['cvd_death_rate', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_target_plot_interact.pdf')

hospital_beds_per_thousand

In [385]:
fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, 
    feature='hospital_beds_per_thousand', 
    feature_name='hospital_beds_per_thousand', 
    target=ross_target, 
    show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_target_plot.pdf')
In [386]:
fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='hospital_beds_per_thousand',
    feature_name='hospital_beds_per_thousand', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_actual_plot.pdf')
In [387]:
pdp_cvd = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='cvd_death_rate'
)

fig, axes = pdp.pdp_plot(pdp_gdp, 'hospital_beds_per_thousand')

fig, axes = pdp.pdp_plot(
    pdp_cvd, 'hospital_beds_per_thousand', plot_lines=True, frac_to_plot=2000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_pdp_plot.pdf')
In [388]:
fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['hospital_beds_per_thousand', ['e1_0_action','gdp_per_capita','c4_4_action', 'urbanPopulation']], 
    feature_names=['hospital_beds_per_thousand', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_target_plot_interact.pdf')
In [476]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="         "></form>''')
Out[476]:

Decison Tree Interpretebility

In [309]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import graphviz
from sklearn.metrics import r2_score
X_train.iloc[:, [31,60]].head(0)
Out[309]:
e1_0_action urbanPopulation
In [310]:
clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
r2_score(y_pred, y_test)
export_graphviz(clf, out_file="tree.dot")
with open("tree.dot") as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)
Out[310]:
Tree 0 X[31] <= 0.5 mse = 0.804 samples = 18507 value = 0.461 1 X[60] <= 63.199 mse = 0.766 samples = 10771 value = 0.751 0->1 True 8 X[60] <= 67.716 mse = 0.575 samples = 7736 value = 0.056 0->8 False 2 X[50] <= 4.42 mse = 0.574 samples = 4160 value = 0.336 1->2 5 X[16] <= 0.5 mse = 0.71 samples = 6611 value = 1.012 1->5 3 mse = 0.543 samples = 3172 value = 0.192 2->3 4 mse = 0.395 samples = 988 value = 0.798 2->4 6 mse = 0.615 samples = 3315 value = 0.725 5->6 7 mse = 0.639 samples = 3296 value = 1.302 5->7 9 X[47] <= 287.674 mse = 0.421 samples = 4874 value = -0.099 8->9 12 X[31] <= 71.5 mse = 0.728 samples = 2862 value = 0.32 8->12 10 mse = 0.399 samples = 2535 value = -0.253 9->10 11 mse = 0.39 samples = 2339 value = 0.068 9->11 13 mse = 0.632 samples = 2339 value = 0.154 12->13 14 mse = 0.482 samples = 523 value = 1.061 12->14