import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

# Columns removed from both the train and the test split
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
# Read the training split and drop those columns in one step
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv").drop(columns=remove_columns)

### Combine levels of weather situation
# Share of each weather_situation level in the training data, in percent
df_train['weather_situation'].value_counts().div(len(df_train)).mul(100)
# Combine levels of categorical variables whose share is < 5%
def cut_levels(x, threshold, new_value):
    """Merge the rare levels of a Series into one replacement level, in place.

    Parameters
    ----------
    x : pandas.Series
        Values to recode. The Series is modified in place.
    threshold : float
        Percentage (0-100). Every level whose share of ``len(x)`` is strictly
        below this value is replaced.
    new_value : object
        Value written in place of each rare level.

    Returns
    -------
    pandas.Series
        The same object ``x``, so the result can be assigned back to a
        DataFrame column when in-place mutation does not propagate.
    """
    # Shares are relative to len(x), so missing values count in the denominator
    percentage = x.value_counts() / len(x) * 100
    labels = percentage.index[percentage < threshold]
    # Series.isin replaces np.in1d (deprecated in NumPy 2.0); .loc makes the
    # boolean-mask assignment explicit
    x.loc[x.isin(labels)] = new_value
    return x
# cut_levels mutates the Series it is given. Work on an explicit copy and assign
# it back so the merged levels reach df_train even when pandas Copy-on-Write
# prevents a column accessor from writing through to the frame.
_weather_train = df_train['weather_situation'].copy()
cut_levels(_weather_train, 5, 'wind-fog-snow')
df_train['weather_situation'] = _weather_train


# Apply the same preprocessing steps to the test data
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")

df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage

# NOTE(review): the rare levels are determined from the test distribution itself,
# not from the training data — confirm both splits end up with the same levels.
# Copy, modify, assign back: keeps the change visible in df_test even when pandas
# Copy-on-Write prevents a column accessor from writing through to the frame.
_weather_test = df_test['weather_situation'].copy()
cut_levels(_weather_test, 5, 'wind-fog-snow')
df_test['weather_situation'] = _weather_test

### Remove highly correlated variables:
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)
# Encoding categorical variables
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix = cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix = cat_columns)
# The two frames are encoded independently, so a category missing from one split
# would leave them with different columns. Align the test frame to the training
# layout: indicator columns absent from the test split are added as all-zero and
# columns unseen in training are dropped. This is a no-op when the columns match.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


# Feature selection based on Boruta: columns discarded from both splits
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train, df_test = (
    df_train.drop(columns=column_to_drop),
    df_test.drop(columns=column_to_drop),
)

# Print the resulting shapes of both frames
print(df_train.shape, df_test.shape)

(18507, 90) (4689, 90)

# Shorter names for the one-hot encoded weather and weekday columns
_COLUMN_LABELS = {
    'weather_situation_clear-day': 'clear-day',
    'weather_situation_cloudy': 'cloudy',
    'weather_situation_partly-cloudy-day': 'partly-cloudy-day',
    'weather_situation_partly-cloudy-night': 'partly-cloudy-night',
    'weather_situation_rain': 'rain',
    'weather_situation_clear-night': 'clear-night',
    'weather_situation_wind-fog-snow': 'wind-fog-snow',
    'Day_of_Week_0': 'Monday',
    'Day_of_Week_1': 'Tuesday',
    'Day_of_Week_2': 'Wednesday',
    'Day_of_Week_3': 'Thursday',
    'Day_of_Week_4': 'Friday',
    'Day_of_Week_5': 'Saturday',
    'Day_of_Week_6': 'Sunday',
}

# Rename in place on both splits; keys that are not present are ignored by rename
for _frame in (df_train, df_test):
    _frame.rename(columns=_COLUMN_LABELS, inplace=True)

# Show the first two rows of X_train.
# NOTE(review): X_train is not assigned anywhere in the visible code above — confirm
# that the cell building X_train / y_train / X_test ran before this point.
X_train.head(2)

SHAP (SHapley Additive exPlanations)¶

Feature Contribution over each row¶

import shap

# Fit a random forest regressor on the training data (fixed random_state)
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# Build a tree explainer for the fitted model and compute SHAP values on X_test
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# Print the shape of the SHAP value array
print(shap_values.shape)

(4689, 82)

# Show the feature values of the row explained below. The force plot uses row 0
# of X_test, so the displayed row must come from X_test as well (not X_train).
pd.DataFrame(X_test.iloc[0,:]).transpose()

shap.initjs()
# visualize the explanation of the first test prediction (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

The above explanation shows features each contributing to push the model output from the base value to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue. The prediction starts from the baseline.

The baseline for Shapley values is the average of all predictions. The base_value here is 0.4597 while our predicted value is 0.61. Month=7 has the highest impact on increasing the prediction, whereas e1_0_action has the highest negative impact (decreasing the prediction).

Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue

# Feature values of test row 30
X_test.iloc[30].to_frame().T

shap.initjs()
# visualize the explanation of the prediction for test row 30 (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[30], X_test.iloc[30])

# Feature values of test row 300
X_test.iloc[300].to_frame().T

shap.initjs()
# visualize the explanation of the prediction for test row 300 (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[300], X_test.iloc[300])

# Feature values of test row 3000
X_test.iloc[3000].to_frame().T

shap.initjs()
# visualize the explanation of the prediction for test row 3000 (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[3000], X_test.iloc[3000])

# load JS visualization code to notebook
shap.initjs()
# visualize the explanations of all test-set predictions in one plot
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=X_test,
)

# shap.save_html('Figures_high_quality/dependen_plot_c3_2.html', shap.dependence_plot("c3_2_action", shap_values, X_test))

Features shown in pink push the model output higher, i.e. they increase the predicted value. Blue parts of the visualization mark features that push the model output lower, i.e. they decrease the predicted value.

Features pushing the prediction higher are shown in pink, those pushing the prediction lower are in blue

Feature Importance¶

# Bar-type summary plot of the SHAP values computed on X_test
shap.summary_plot(shap_values, features=X_test, plot_type="bar")

To get an overview of which features are most important for a model we can plot the SHAP values of every feature for every sample. The summary plot tells which features are most important, and also their range of effects over the dataset.

# load JS visualization code to notebook
shap.initjs()
# summary plot of the SHAP values of all features over X_test
shap.summary_plot(shap_values, features=X_test)

The summary plot combines feature importance with feature effects. In this summary plot, the order of the features still reflects how much each feature contributes to the predictions. Each dot in the visualization represents one prediction. The color encodes the feature's value for that data point: pink if the value in the dataset was high, blue if it was low. The x-axis represents the SHAP value, which is the impact on the model output. Because the model here is a regressor, the model output is the predicted numeric value rather than a class label.

Decision Plot¶

0¶

# Feature values of test row 0
pd.DataFrame(X_test.iloc[0,:]).transpose()

# new_base_value moves the starting point of the plot to 0.5.
# NOTE(review): 0.5 reads like a classification cutoff; the model here is a
# regressor — confirm that this reference value is meaningful for the target.

shap.decision_plot(
    # expected_value may be a scalar or a length-1 array; np.ravel(...)[0]
    # handles both, whereas plain [0] fails on a scalar
    base_value=float(np.ravel(explainer.expected_value)[0]),
    shap_values=shap_values[0,:],
    features=X_test.iloc[0,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

# NOTE(review): X_display and features are not defined in the visible part of this
# notebook (they look like leftovers from another example) — confirm before running.
shap.decision_plot(explainer.expected_value, shap_values, X_display.loc[features.index])

25¶

# Feature values of test row 25
pd.DataFrame(X_test.iloc[25,:]).transpose()

# new_base_value moves the starting point of the plot to 0.5.
# NOTE(review): 0.5 reads like a classification cutoff; the model here is a
# regressor — confirm that this reference value is meaningful for the target.
shap.decision_plot(
    # expected_value may be a scalar or a length-1 array; np.ravel(...)[0]
    # handles both, whereas plain [0] fails on a scalar
    base_value=float(np.ravel(explainer.expected_value)[0]),
    shap_values=shap_values[25,:],
    features=X_test.iloc[25,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

250¶

# Feature values of test row 250
pd.DataFrame(X_test.iloc[250,:]).transpose()

# new_base_value moves the starting point of the plot to 0.5.
# NOTE(review): 0.5 reads like a classification cutoff; the model here is a
# regressor — confirm that this reference value is meaningful for the target.

# Index of the explained test row, used for both the SHAP values and the features
row = 250
shap.decision_plot(
    # expected_value may be a scalar or a length-1 array; np.ravel(...)[0]
    # handles both, whereas plain [0] fails on a scalar
    base_value=float(np.ravel(explainer.expected_value)[0]),
    shap_values=shap_values[row,:],
    features=X_test.iloc[row,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

2500¶

# Feature values of test row 2500
pd.DataFrame(X_test.iloc[2500,:]).transpose()

# new_base_value moves the starting point of the plot to 0.5.
# NOTE(review): 0.5 reads like a classification cutoff; the model here is a
# regressor — confirm that this reference value is meaningful for the target.

# Index of the explained test row, used for both the SHAP values and the features
row = 2500
shap.decision_plot(
    # expected_value may be a scalar or a length-1 array; np.ravel(...)[0]
    # handles both, whereas plain [0] fails on a scalar
    base_value=float(np.ravel(explainer.expected_value)[0]),
    shap_values=shap_values[row,:],
    features=X_test.iloc[row,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

Dependence Plot¶

Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature’s value was NaN.

# create one dependence plot per feature to show its effect across all rows of X_test
for _feature in ("urbanPopulation", "e1_0_action"):
    shap.dependence_plot(_feature, shap_values, X_test)

shap.common.approximate_interactions is used to pick what seems to be the strongest interaction.

# create one dependence plot per feature to show its effect across all rows of X_test
for _feature in (
    "cvd_death_rate",
    "gdp_per_capita",
    "c4_4_action",
    "male_smokers",
    "hospital_beds_per_thousand",
    "c3_2_action",
):
    shap.dependence_plot(_feature, shap_values, X_test)
# plt.savefig('Figures_high_quality/dependence_plot_c3_2.pdf')

<Figure size 432x288 with 0 Axes>

Interaction Between Variables¶

# Show the first two rows of X_train.
X_train.head(2)

urbanPopulation & cvd_death_rate¶

# Interaction data for the urbanPopulation / cvd_death_rate pair, computed once on
# X_train and reused by both plots below.
# NOTE(review): `pdp` and `info_plots` are not imported in the visible code —
# presumably they come from pdpbox; confirm the import cell.
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['urbanPopulation', 'cvd_death_rate'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['urbanPopulation', 'cvd_death_rate'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

# Grid view of the same interaction. inter1 is reused: recomputing it on
# X_train[X_train.columns] with the same arguments repeats the identical
# computation on an identical copy of the data.
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1, feature_names=['urbanPopulation', 'cvd_death_rate'],
    plot_type='grid', x_quantile=True, plot_pdp=True)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['urbanPopulation', 'cvd_death_rate'],
    feature_names=['urbanPopulation', 'cvd_death_rate'],
)

e1_0_action & healthExpenditure¶

# Interaction data for the e1_0_action / healthExpenditure pair, computed on X_train
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['e1_0_action', 'healthExpenditure'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['e1_0_action', 'healthExpenditure'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['e1_0_action', 'healthExpenditure'],
    feature_names=['e1_0_action', 'healthExpenditure'],
)

gdp_per_capita & cvd_death_rate¶

# Interaction data for the gdp_per_capita / cvd_death_rate pair, computed on X_train
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['gdp_per_capita', 'cvd_death_rate'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['gdp_per_capita', 'cvd_death_rate'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['gdp_per_capita', 'cvd_death_rate'],
    feature_names=['gdp_per_capita', 'cvd_death_rate'],
)

c4_4_action & diabetes_prevalence¶

# Interaction data for the c4_4_action / diabetes_prevalence pair, computed on X_train
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['c4_4_action', 'diabetes_prevalence'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['c4_4_action', 'diabetes_prevalence'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['c4_4_action', 'diabetes_prevalence'],
    feature_names=['c4_4_action', 'diabetes_prevalence'],
)

male_smokers & gdp_per_capita¶

# Interaction data for the male_smokers / gdp_per_capita pair, computed on X_train
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['male_smokers', 'gdp_per_capita'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['male_smokers', 'gdp_per_capita'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['male_smokers', 'gdp_per_capita'],
    feature_names=['male_smokers', 'gdp_per_capita'],
)

hospital_beds_per_thousand & male_smokers¶

# Interaction data for the male_smokers / hospital_beds_per_thousand pair, computed on X_train
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['male_smokers', 'hospital_beds_per_thousand'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['male_smokers', 'hospital_beds_per_thousand'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['male_smokers', 'hospital_beds_per_thousand'],
    feature_names=['male_smokers', 'hospital_beds_per_thousand'],
)

c3_2_action & urbanPopulation¶

# Interaction data for the c3_2_action / urbanPopulation pair, computed on X_train
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_train,
    model_features=X_train.columns,
    features=['c3_2_action', 'urbanPopulation'],
)

# Contour view of the interaction
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['c3_2_action', 'urbanPopulation'],
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)

# Companion plot from info_plots.actual_plot_interact for the same pair
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=model,
    X=X_train,
    features=['c3_2_action', 'urbanPopulation'],
    feature_names=['c3_2_action', 'urbanPopulation'],
)

# Build an HTML snippet that toggles the visibility of the notebook's code-input
# cells (elements matching div.input). code_toggle runs once when the document is
# ready and again each time the form's submit button is pressed.
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="         "></form>''')

	Days since first case	c1_0_action	c1_1_action	c1_2_action	c1_3_action	c2_0_action	c2_1_action	c2_2_action	c2_3_action	c3_0_action	c3_1_action	c3_2_action	c4_0_action	c4_1_action	c4_2_action	c4_3_action	c4_4_action	c5_0_action	c5_1_action	c5_2_action	c6_0_action	c6_1_action	c6_2_action	c6_3_action	c7_0_action	c7_1_action	c7_2_action	c8_0_action	c8_1_action	c8_2_action	c8_4_action	e1_0_action	e1_1_action	e2_0_action	e2_1_action	e2_2_action	h1_0_action	h1_1_action	h2_0_action	h2_1_action	h2_2_action	h2_3_action	h3_0_action	h3_1_action	h3_2_action	population_density	gdp_per_capita	cvd_death_rate	diabetes_prevalence	handwashing_facilities	hospital_beds_per_thousand	female_smokers	male_smokers	life_expectancy	aged_65_older_sum	humidity	temperature	windSpeed	Number of Tweet	Sentiments	urbanPopulation	healthExpenditure	clear-day	clear-night	cloudy	partly-cloudy-day	partly-cloudy-night	rain	wind-fog-snow	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday	continent_Africa	continent_Asia	continent_Europe	continent_North America	continent_Oceania	continent_South America
0	138	0	0	30	0	0	28	0	0	0	28	0	0	0	44	0	0	65	0	0	0	7	0	0	44	0	0	0	0	0	0	0	0	0	0	107	0	0	0	0	78	0	0	106	0	85.129	23313.199	370.946	9.74	86.979	6.892	22.9	37.1	76.05	14.7700	0.65	64.78	3.66	1.0	0.049127	54.084	4.053393	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
1	139	0	0	0	134	0	12	0	0	0	0	125	0	0	0	0	124	0	0	123	0	0	34	0	0	0	64	0	0	0	122	0	103	0	67	0	0	0	0	0	124	0	0	0	128	232.128	65530.537	132.235	15.84	97.400	2.000	2.7	37.0	75.49	1.7295	0.21	103.35	14.72	1.0	0.000000	100.000	4.623860	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0

	Days since first case	c1_0_action	c1_1_action	c1_2_action	c1_3_action	c2_0_action	c2_1_action	c2_2_action	c2_3_action	c3_0_action	c3_1_action	c3_2_action	c4_0_action	c4_1_action	c4_2_action	c4_3_action	c4_4_action	c5_0_action	c5_1_action	c5_2_action	c6_0_action	c6_1_action	c6_2_action	c6_3_action	c7_0_action	c7_1_action	c7_2_action	c8_0_action	c8_1_action	c8_2_action	c8_4_action	e1_0_action	e1_1_action	e2_0_action	e2_1_action	e2_2_action	h1_0_action	h1_1_action	h2_0_action	h2_1_action	h2_2_action	h2_3_action	h3_0_action	h3_1_action	h3_2_action	population_density	gdp_per_capita	cvd_death_rate	diabetes_prevalence	handwashing_facilities	hospital_beds_per_thousand	female_smokers	male_smokers	life_expectancy	aged_65_older_sum	humidity	temperature	windSpeed	Number of Tweet	Sentiments	urbanPopulation	healthExpenditure	clear-day	clear-night	cloudy	partly-cloudy-day	partly-cloudy-night	rain	wind-fog-snow	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday	continent_Africa	continent_Asia	continent_Europe	continent_North America	continent_Oceania	continent_South America
0	138	0	0	30	0	0	28	0	0	0	28	0	0	0	44	0	0	65	0	0	0	7	0	0	44	0	0	0	0	0	0	0	0	0	0	107	0	0	0	0	78	0	0	106	0	85.129	23313.199	370.946	9.74	86.979	6.892	22.9	37.1	76.05	14.7700	0.65	64.78	3.66	1.0	0.049127	54.084	4.053393	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
1	139	0	0	0	134	0	12	0	0	0	0	125	0	0	0	0	124	0	0	123	0	0	34	0	0	0	64	0	0	0	122	0	103	0	67	0	0	0	0	0	124	0	0	0	128	232.128	65530.537	132.235	15.84	97.400	2.000	2.7	37.0	75.49	1.7295	0.21	103.35	14.72	1.0	0.000000	100.000	4.623860	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0