In [183]:
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [184]:
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
In [185]:
# Read the training split, then discard the date/country identifier columns.
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv")
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
df_train = df_train.drop(columns=remove_columns)

### Combine level of weather situation
# Share (in %) of each weather_situation level. The value is neither stored nor
# displayed, because this is not the last expression of the cell.
df_train['weather_situation'].value_counts() / len(df_train) * 100
# Combine the levels of a categorical variable whose frequency is < 5%
def cut_levels(x, threshold, new_value):
    """Merge the rare levels of a categorical Series into one level, in place.

    Parameters
    ----------
    x : pandas.Series
        Categorical values; modified in place.
    threshold : float
        Percentage cutoff (0-100). Every level whose share of ``len(x)`` is
        strictly below this value is replaced.
    new_value : object
        Label written to every row that belongs to a rare level.

    Returns
    -------
    pandas.Series
        The same object ``x``, so callers can assign the result explicitly
        instead of relying only on the in-place mutation.
    """
    percentage = x.value_counts() / len(x) * 100
    labels = percentage.index[percentage < threshold]
    # Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    x[x.isin(labels)] = new_value
    return x
cut_levels(df_train.weather_situation, 5, 'wind-fog-snow')


# Apply the same steps to the test data
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")
df_test = df_test.drop(remove_columns, axis=1)

# NOTE(review): rare levels are chosen from the test set's own frequencies, so the
# merged levels may differ from those merged in the training set — confirm this is intended.
cut_levels(df_test.weather_situation, 5, 'wind-fog-snow')

### Remove highly correlated variables:
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)

# Encoding categorical variables
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend', 'continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix=cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix=cat_columns)
# The two frames are encoded independently, so a level that occurs in only one of
# them would leave their columns misaligned. Give the test frame exactly the
# training columns, in the same order; columns absent from the test frame are
# created and filled with 0.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)
In [186]:
# Sanity check: both frames should end up with the same number of columns.
print(f"{df_train.shape} {df_test.shape}")
(18507, 90) (4689, 90)
In [188]:
# Strip the "weather_situation_" prefix from the weather dummy columns of both frames.
weather_labels = {
    f'weather_situation_{level}': level
    for level in (
        'clear-day',
        'cloudy',
        'partly-cloudy-day',
        'partly-cloudy-night',
        'rain',
        'clear-night',
        'wind-fog-snow',
    )
}
for frame in (df_train, df_test):
    frame.rename(columns=weather_labels, inplace=True)
In [189]:
# Replace the Day_of_Week_<n> dummy column names with weekday names (0 -> Monday ... 6 -> Sunday).
weekday_labels = {
    f'Day_of_Week_{number}': day_name
    for number, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}
for frame in (df_train, df_test):
    frame.rename(columns=weekday_labels, inplace=True)
In [192]:
# First two rows of the training features.
# NOTE(review): X_train is not defined in any visible cell (execution counts jump
# from 189 to 192) — confirm the defining cells still exist.
X_train.iloc[:2]
Out[192]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0

SHAP (SHapley Additive exPlanations)

Feature Contribution over each row

In [229]:
import shap

# Fit the random forest that the SHAP cells below explain.
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# Tree explainer for the fitted model, and SHAP values for every row of X_test.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
print(shap_values.shape)
(4689, 82)
In [230]:
# First training row, shown as a one-row frame.
X_train.iloc[0, :].to_frame().T
Out[230]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138.0 0.0 0.0 30.0 0.0 0.0 28.0 0.0 0.0 0.0 28.0 0.0 0.0 0.0 44.0 0.0 0.0 65.0 0.0 0.0 0.0 7.0 0.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 107.0 0.0 0.0 0.0 0.0 78.0 0.0 0.0 106.0 0.0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.77 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
In [231]:
shap.initjs()
# Force plot for test row 0 (pass matplotlib=True to avoid Javascript).
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[0, :],
    features=X_test.iloc[0, :],
)
Out[231]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

The above explanation shows features each contributing to push the model output from the base value to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue. The prediction starts from the baseline.

The baseline for Shapley values is the average of all predictions. The base_value here is 0.4597, while our predicted value is 0.61. Month=7 has the highest impact on increasing the prediction, whereas e1_0_action has the highest negative impact (decreasing the prediction).

Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue

In [232]:
# Test row at position 30, shown as a one-row frame.
X_test.iloc[30, :].to_frame().T
Out[232]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
30 181.0 0.0 0.0 20.0 0.0 0.0 0.0 21.0 0.0 10.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 150.0 0.0 0.0 153.0 0.0 0.0 7.0 0.0 0.0 77.0 0.0 0.0 182.0 0.0 182.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 0.0 7.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.61 75.58 2.69 1.0 0.053264 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [233]:
shap.initjs()
# Force plot for test row 30 (pass matplotlib=True to avoid Javascript).
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[30, :],
    features=X_test.iloc[30, :],
)
Out[233]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [234]:
# Test row at position 300, shown as a one-row frame.
X_test.iloc[300, :].to_frame().T
Out[234]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
300 149.0 0.0 0.0 0.0 147.0 0.0 0.0 92.0 0.0 0.0 0.0 136.0 0.0 0.0 0.0 0.0 136.0 0.0 0.0 37.0 0.0 0.0 0.0 37.0 0.0 0.0 131.0 0.0 0.0 0.0 103.0 0.0 0.0 150.0 0.0 0.0 0.0 0.0 0.0 0.0 147.0 0.0 0.0 0.0 150.0 119.309 15847.419 559.812 7.11 83.241 4.7 0.3 42.5 73.0 4.9445 0.67 84.6 9.99 1.0 -0.015717 56.031 1.004913 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [235]:
shap.initjs()
# Force plot for test row 300 (pass matplotlib=True to avoid Javascript).
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[300, :],
    features=X_test.iloc[300, :],
)
Out[235]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [236]:
# Test row at position 3000, shown as a one-row frame.
X_test.iloc[3000, :].to_frame().T
Out[236]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
3000 128.0 0.0 0.0 61.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 130.0 0.0 0.0 0.0 130.0 0.0 129.0 0.0 0.0 84.0 0.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 123.0 0.0 123.0 0.0 0.0 0.0 0.0 93.0 0.0 0.0 0.0 0.0 110.0 15.196 2014.306 268.024 2.42 52.232 0.1 1.6 23.0 59.31 2.0025 0.99 72.57 6.47 1.0 0.0 43.136 1.325374 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [237]:
shap.initjs()
# Force plot for test row 3000 (pass matplotlib=True to avoid Javascript).
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[3000, :],
    features=X_test.iloc[3000, :],
)
Out[237]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [238]:
# Load the SHAP Javascript bundle into the notebook.
shap.initjs()
# Force plot over all test-set predictions.
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=X_test,
)
Out[238]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [424]:
# shap.save_html('Figures_high_quality/dependen_plot_c3_2.html', shap.dependence_plot("c3_2_action", shap_values, X_test))

Features shown in pink push the model output (the predicted value) higher; features shown in blue push it lower.

Features pushing the prediction higher are shown in pink, those pushing the prediction lower are in blue

Feature Importance

In [239]:
# Bar summary of the SHAP values over the test set.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)

To get an overview of which features are most important for a model we can plot the SHAP values of every feature for every sample. The summary plot tells which features are most important, and also their range of effects over the dataset.

In [240]:
# Load the SHAP Javascript bundle into the notebook.
shap.initjs()
# Summary plot of the SHAP values of all features over the test set.
shap.summary_plot(shap_values, features=X_test)

The summary plot combines feature importance with feature effects. In this summary plot, the order of the features still represents the amount of information each feature accounts for in the prediction. Each dot in the visualization represents one prediction. The color is related to the real data point: if the actual feature value in the dataset was high, the color is pink; blue indicates that the actual value was low. The x-axis represents the SHAP value, which is the impact on the model output: positive values push the prediction higher and negative values push it lower.

Decision Plot

0

In [241]:
# Test row at position 0, shown as a one-row frame.
X_test.iloc[0, :].to_frame().T
Out[241]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 151.0 0.0 0.0 0.0 134.0 0.0 0.0 0.0 124.0 0.0 0.0 147.0 0.0 0.0 0.0 0.0 112.0 0.0 0.0 120.0 0.0 0.0 123.0 0.0 0.0 0.0 105.0 0.0 47.0 0.0 0.0 152.0 0.0 152.0 0.0 0.0 0.0 0.0 0.0 152.0 0.0 0.0 0.0 152.0 0.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.29 79.41 2.92 82.181818 0.002409 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [242]:
# Decision plot for test row 0. new_base_value shifts the plot's base value to 0.5
# to make the plot easier to read.
# NOTE(review): the explained model is a RandomForestRegressor, so 0.5 is not a
# classification cutoff here — confirm the intended reference value.
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[0, :],
    features=X_test.iloc[0, :],
    feature_names=list(X_test.columns),
    new_base_value=0.5,
    link="identity",
)
In [ ]:
# NOTE(review): this cell has no execution count and references `X_display` and
# `features`, neither of which is defined in the visible cells — it would likely
# raise NameError on Restart & Run All. Confirm whether it should be fixed or removed.
shap.decision_plot(explainer.expected_value, shap_values, X_display.loc[features.index])

25

In [243]:
# Test row at position 25, shown as a one-row frame.
X_test.iloc[25, :].to_frame().T
Out[243]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
25 176.0 0.0 0.0 15.0 0.0 0.0 0.0 16.0 0.0 5.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 145.0 0.0 0.0 148.0 0.0 0.0 2.0 0.0 0.0 72.0 0.0 0.0 177.0 0.0 177.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 2.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.32 76.96 2.22 1.0 0.319582 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [244]:
# Decision plot for test row 25. new_base_value shifts the plot's base value to 0.5
# to make the plot easier to read.
# NOTE(review): the explained model is a RandomForestRegressor, so 0.5 is not a
# classification cutoff here — confirm the intended reference value.
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[25, :],
    features=X_test.iloc[25, :],
    feature_names=list(X_test.columns),
    new_base_value=0.5,
    link="identity",
)

250

In [245]:
# Test row at position 250, shown as a one-row frame.
X_test.iloc[250, :].to_frame().T
Out[245]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
250 209.0 0.0 0.0 0.0 19.0 0.0 0.0 0.0 20.0 0.0 0.0 52.0 0.0 0.0 0.0 0.0 45.0 0.0 96.0 0.0 0.0 0.0 0.0 20.0 0.0 0.0 156.0 0.0 0.0 0.0 155.0 0.0 163.0 0.0 0.0 150.0 0.0 0.0 0.0 0.0 0.0 117.0 0.0 0.0 64.0 3.202 44648.71 107.791 5.07 65.386 3.84 13.0 16.5 83.44 12.8165 0.86 44.46 15.27 3.0 -0.010368 86.124 6.344081 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
In [246]:
# Decision plot for test row 250. new_base_value shifts the plot's base value to 0.5
# to make the plot easier to read.
# NOTE(review): the explained model is a RandomForestRegressor, so 0.5 is not a
# classification cutoff here — confirm the intended reference value.

# `row` previously held 2500 while the plot hard-coded 250; it now names the row
# that is actually plotted and is used for the indexing.
row = 250
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[row, :],
    features=X_test.iloc[row, :],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

2500

In [247]:
# Test row at position 2500, shown as a one-row frame.
X_test.iloc[2500, :].to_frame().T
Out[247]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
2500 182.0 0.0 0.0 0.0 144.0 0.0 0.0 134.0 0.0 58.0 0.0 0.0 0.0 0.0 58.0 0.0 0.0 183.0 0.0 0.0 183.0 0.0 0.0 0.0 103.0 0.0 0.0 0.0 0.0 112.0 0.0 0.0 0.0 183.0 0.0 0.0 0.0 0.0 0.0 135.0 0.0 0.0 0.0 135.0 0.0 90.672 3645.07 270.892 4.0 66.229 0.8 2.0 33.7 69.82 3.3985 0.6 92.96 4.84 1.0 -0.02828 23.805 1.408793 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [248]:
# Decision plot for test row 2500. new_base_value shifts the plot's base value to 0.5
# to make the plot easier to read.
# NOTE(review): the explained model is a RandomForestRegressor, so 0.5 is not a
# classification cutoff here — confirm the intended reference value.

row = 2500
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[row, :],
    features=X_test.iloc[row, :],
    feature_names=list(X_test.columns),
    new_base_value=0.5,
    link="identity",
)

Dependence Plot

Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature’s value was NaN.

In [249]:
# SHAP dependence plot for urbanPopulation over the test set.
shap.dependence_plot(
    "urbanPopulation",
    shap_values=shap_values,
    features=X_test,
)
In [250]:
# SHAP dependence plot for e1_0_action over the test set.
shap.dependence_plot(
    "e1_0_action",
    shap_values=shap_values,
    features=X_test,
)

shap.common.approximate_interactions is used to pick what seems to be the strongest interaction.

In [251]:
# SHAP dependence plot for cvd_death_rate over the test set.
shap.dependence_plot(
    "cvd_death_rate",
    shap_values=shap_values,
    features=X_test,
)
In [252]:
# SHAP dependence plot for gdp_per_capita over the test set.
shap.dependence_plot(
    "gdp_per_capita",
    shap_values=shap_values,
    features=X_test,
)
In [448]:
# SHAP dependence plot for c4_4_action over the test set.
shap.dependence_plot(
    "c4_4_action",
    shap_values=shap_values,
    features=X_test,
)
In [450]:
# SHAP dependence plot for male_smokers over the test set.
shap.dependence_plot(
    "male_smokers",
    shap_values=shap_values,
    features=X_test,
)
In [255]:
# SHAP dependence plot for hospital_beds_per_thousand over the test set.
shap.dependence_plot(
    "hospital_beds_per_thousand",
    shap_values=shap_values,
    features=X_test,
)
In [425]:
# SHAP dependence plot for c3_2_action over the test set.
shap.dependence_plot(
    "c3_2_action",
    shap_values=shap_values,
    features=X_test,
)
# Saving to disk is currently disabled:
# plt.savefig('Figures_high_quality/dependence_plot_c3_2.pdf')
<Figure size 432x288 with 0 Axes>

Interaction Between Variables

In [428]:
# First two rows of the training features used by the interaction plots below.
X_train.iloc[:2]
Out[428]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0

urbanPopulation & cvd_death_rate

In [474]:
# Two-feature partial dependence of the model on urbanPopulation and cvd_death_rate (X_train).
inter1 = pdp.pdp_interact(
    features=['urbanPopulation', 'cvd_death_rate'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['urbanPopulation', 'cvd_death_rate'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [472]:
# Same urbanPopulation / cvd_death_rate interaction, rendered as a grid.
inter1 = pdp.pdp_interact(
    features=['urbanPopulation', 'cvd_death_rate'],
    model=model,
    dataset=X_train[X_train.columns],
    model_features=X_train.columns,
)
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['urbanPopulation', 'cvd_death_rate'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='grid',
)
In [465]:
# Actual-prediction interaction plot for urbanPopulation and cvd_death_rate on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['urbanPopulation', 'cvd_death_rate'],
    features=['urbanPopulation', 'cvd_death_rate'],
)

e1_0_action & healthExpenditure

In [455]:
# Two-feature partial dependence of the model on e1_0_action and healthExpenditure (X_train).
inter1 = pdp.pdp_interact(
    features=['e1_0_action', 'healthExpenditure'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['e1_0_action', 'healthExpenditure'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [466]:
# Actual-prediction interaction plot for e1_0_action and healthExpenditure on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['e1_0_action', 'healthExpenditure'],
    features=['e1_0_action', 'healthExpenditure'],
)

gdp_per_capita & cvd_death_rate

In [456]:
# Two-feature partial dependence of the model on gdp_per_capita and cvd_death_rate (X_train).
inter1 = pdp.pdp_interact(
    features=['gdp_per_capita', 'cvd_death_rate'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['gdp_per_capita', 'cvd_death_rate'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [467]:
# Actual-prediction interaction plot for gdp_per_capita and cvd_death_rate on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['gdp_per_capita', 'cvd_death_rate'],
    features=['gdp_per_capita', 'cvd_death_rate'],
)

c4_4_action & diabetes_prevalence

In [459]:
# Two-feature partial dependence of the model on c4_4_action and diabetes_prevalence (X_train).
inter1 = pdp.pdp_interact(
    features=['c4_4_action', 'diabetes_prevalence'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['c4_4_action', 'diabetes_prevalence'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [468]:
# Actual-prediction interaction plot for c4_4_action and diabetes_prevalence on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['c4_4_action', 'diabetes_prevalence'],
    features=['c4_4_action', 'diabetes_prevalence'],
)

male_smokers & gdp_per_capita

In [460]:
# Two-feature partial dependence of the model on male_smokers and gdp_per_capita (X_train).
inter1 = pdp.pdp_interact(
    features=['male_smokers', 'gdp_per_capita'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['male_smokers', 'gdp_per_capita'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [469]:
# Actual-prediction interaction plot for male_smokers and gdp_per_capita on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['male_smokers', 'gdp_per_capita'],
    features=['male_smokers', 'gdp_per_capita'],
)

hospital_beds_per_thousand & male_smokers

In [461]:
# Two-feature partial dependence of the model on male_smokers and hospital_beds_per_thousand (X_train).
inter1 = pdp.pdp_interact(
    features=['male_smokers', 'hospital_beds_per_thousand'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['male_smokers', 'hospital_beds_per_thousand'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [470]:
# Actual-prediction interaction plot for male_smokers and hospital_beds_per_thousand on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['male_smokers', 'hospital_beds_per_thousand'],
    features=['male_smokers', 'hospital_beds_per_thousand'],
)

c3_2_action & urbanPopulation

In [462]:
# Two-feature partial dependence of the model on c3_2_action and urbanPopulation (X_train).
inter1 = pdp.pdp_interact(
    features=['c3_2_action', 'urbanPopulation'],
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
)

# Contour rendering of the interaction computed above.
fig, axes = pdp.pdp_interact_plot(
    inter1,
    feature_names=['c3_2_action', 'urbanPopulation'],
    plot_pdp=True,
    x_quantile=True,
    plot_type='contour',
)
In [471]:
# Actual-prediction interaction plot for c3_2_action and urbanPopulation on X_train.
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    feature_names=['c3_2_action', 'urbanPopulation'],
    features=['c3_2_action', 'urbanPopulation'],
)
In [476]:
from IPython.display import HTML

# Injects a script that hides every `div.input` element once the document is ready,
# plus a form whose submit button toggles those elements between hidden and shown.
# The button's label is a run of spaces, so it renders without visible text.
# NOTE(review): the script uses `$`, so it presumably relies on jQuery being
# available in the front end — confirm for the target environment.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="         "></form>''')
Out[476]: