import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

# Load train data
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv")
# Remove column
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
df_train = df_train.drop(remove_columns, axis=1)

### Combine level of weather situation 
df_train['weather_situation'].value_counts()/len(df_train)*100 # percentage
# Combine level of caegorical variables that is < 5%
def cut_levels(x, threshold, new_value):
    percentage = x.value_counts()/len(x)*100
    labels = percentage.index[percentage < threshold]
    x[np.in1d(x, labels)] = new_value
cut_levels(df_train.weather_situation, 5, 'wind-fog-snow') 


# Apply similar step to test data 
df_test  = pd.read_csv("official_train_test_split/80-20/test_set.csv") 

df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage

cut_levels(df_test.weather_situation, 5, 'wind-fog-snow') 

### Remove highly correlated variables: 
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)
# Encoding categorical variables 
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix = cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix = cat_columns)


# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)

print(df_train.shape, df_test.shape)

(18507, 90) (4689, 90)

df_train.head(2)

df_train.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)

df_test.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)

df_train.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)

df_test.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)

cols = ['Outputvalue_lag1','Outputvalue_lag2',
              'Outputvalue_lag3','Outputvalue_lag4',
              'Outputvalue_lag5','Outputvalue_lag6',
              'Outputvalue_lag7']
df_train.drop(cols,axis=1, inplace=True)
df_test.drop(cols,axis=1, inplace=True)
df_train.head(2)

y_train = df_train['Log_new_cases_per_million']
X_train = df_train.drop('Log_new_cases_per_million', axis=1)

y_test = df_test['Log_new_cases_per_million']
X_test = df_test.drop('Log_new_cases_per_million', axis=1)

X_train.head(2)

Train Models¶

def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('xgboost',xgboost.XGBRegressor()))
    level0.append(('ranFor',RandomForestRegressor(random_state=0)))
    level0.append(('graBoost',GradientBoostingRegressor(random_state=0)))
    level0.append(('ridge', Ridge(alpha = 10,tol=0.001)))
    level0.append(('lasso', Lasso(alpha = 0.001)))
    level0.append(('linear_reg', LinearRegression()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    return model

# get a list of models to evaluate
def get_models():
    models = dict()
    models['xgboost'] = xgboost.XGBRegressor()
    models['ranFor'] = RandomForestRegressor(random_state=0)
    models['graBoost'] = GradientBoostingRegressor(random_state=0)
    models['ridge'] = Ridge(alpha = 0.001)
    models['lasso'] = Lasso(alpha = 0.001)
    models['linear_reg'] = LinearRegression()
    models['stacking'] = get_stacking()
    return models

R2¶

# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline

>xgboost 0.809 (0.011)
>ranFor 0.815 (0.010)
>graBoost 0.705 (0.010)
>ridge 0.446 (0.017)
>lasso 0.446 (0.017)
>linear_reg 0.446 (0.017)
>stacking 0.820 (0.010)

Root mean squared error¶

# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline

>xgboost -0.392 (0.012)
>ranFor -0.385 (0.012)
>graBoost -0.487 (0.010)
>ridge -0.667 (0.014)
>lasso -0.667 (0.014)
>linear_reg -0.667 (0.014)
>stacking -0.380 (0.012)

Residual + R2 plot¶

model = RandomForestRegressor(random_state=0)
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f7364fbdf10>

Feature Importance¶

model = RandomForestRegressor(random_state=0)
from yellowbrick.model_selection import FeatureImportances
plt.figure(figsize=(8,24))
viz = FeatureImportances(model,colormap='hot_r')
viz.fit(X_train, y_train)
viz.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f7366abf410>

Test Set Evaluation¶

RandomForestRegressor¶

from sklearn.inspection import plot_partial_dependence
reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
from sklearn.metrics import r2_score
y_pred = reg.predict(X_test)
r2_score(y_test, y_pred)

0.5991532062914151

stacking¶

y_pred = models['stacking'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.6120707964634038

xgboost¶

y_pred = models['xgboost'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.5757600991254161

graBoost¶

y_pred = models['graBoost'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.5254250706506605

ridge¶

y_pred = models['ridge'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.37777324224138

lasso¶

y_pred = models['lasso'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.37485859551475054

linear_reg¶

y_pred = models['linear_reg'].fit(X_train, y_train).predict(X_test)
r2_score(y_test, y_pred)

0.37777327990805154

RMSE¶

RandomForestRegressor¶

import math
from sklearn.metrics import mean_squared_error
y_pred = reg.predict(X_test)
math.sqrt(mean_squared_error(y_test, y_pred))

0.583818428841278

eli5: Permutation Importance show weights¶

import eli5
from eli5.sklearn import PermutationImportance

model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
perm = PermutationImportance(model).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_train.columns.tolist(), top=50)

eli5 explains weights¶

eli5.explain_weights_df(perm, feature_names=X_train.columns.to_list())\
  .head(50).style.background_gradient(subset=['weight'])

eli5 shows prediction¶

eli5.show_prediction(model, X_test.iloc[0,:],show_feature_values=True)

PDP¶

from sklearn.inspection import plot_partial_dependence
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]
column_index(X_test, ['urbanPopulation', 'e1_0_action', 'c4_4_action','cvd_death_rate','hospital_beds_per_thoudsand','male_smokers','c3_2_action'])

array([60, 31, 16, 47, 50, 52, 11])

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[60])  
plt.savefig('Figures_high_quality/PDP_urbanPopulation.pdf')

<Figure size 1600x800 with 0 Axes>

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[31])  
plt.savefig('Figures_high_quality/PDP_e1_0_action.pdf')

<Figure size 1600x800 with 0 Axes>

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[16])  
plt.savefig('Figures_high_quality/PDP_c4_4_action.pdf')

<Figure size 1600x800 with 0 Axes>

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[47])  
plt.savefig('Figures_high_quality/PDP_cvd_death_rate.pdf')

<Figure size 1600x800 with 0 Axes>

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[50])  
plt.savefig('Figures_high_quality/PDP_hospital_beds_per_thoudsand.pdf')

<Figure size 1600x800 with 0 Axes>

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[52])  
plt.savefig('Figures_high_quality/PDP_male_smokers.pdf')

<Figure size 1600x800 with 0 Axes>

figure(figsize=(20, 10), dpi=80)
disp1 = plot_partial_dependence(model, X_test,features=[11])  
plt.savefig('Figures_high_quality/PDP_c3_2_action.pdf')

<Figure size 1600x800 with 0 Axes>

SHAP (SHapley Additive exPlanations)¶

Feature Contribution over each row¶

import shap

model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
print(shap_values.shape)

(4689, 82)

pd.DataFrame(X_train.iloc[0,:]).transpose()

shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

The above explanation shows features each contributing to push the model output from the base value to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue. The prediction starts from the baseline.

The baseline for Shapley values is the average of all predictions. The base_value here is 0.4597 while our predicted value is 0.61. Month=7 has the highest impact on increasing the prediction. Wheras, e1_0_action has the highest negative impact (decreasing the prediction).

Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue

pd.DataFrame(X_test.iloc[30,:]).transpose()

shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[30,:], X_test.iloc[30,:])

pd.DataFrame(X_test.iloc[300,:]).transpose()

shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[300,:], X_test.iloc[300,:])

pd.DataFrame(X_test.iloc[3000,:]).transpose()

shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[3000,:], X_test.iloc[3000,:])

# load JS visualization code to notebook
shap.initjs()
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values, X_test)

# shap.save_html('Figures_high_quality/dependen_plot_c3_2.html', shap.dependence_plot("c3_2_action", shap_values, X_test))

Features that are pink contribute to the model output being higher, that means predicting a success of the project. Blue parts of the visualization indicate a lower model output, predicting a failed project.

Features pushing the prediction higher are shown in pink, those pushing the prediction lower are in blue

Feature Importance¶

shap.summary_plot(shap_values, X_test, plot_type="bar")

To get an overview of which features are most important for a model we can plot the SHAP values of every feature for every sample. The summary plot tells which features are most important, and also their range of effects over the dataset.

# load JS visualization code to notebook
shap.initjs()
# summarize the effects of all the features
shap.summary_plot(shap_values, X_test)

The summary plot combines feature importance with feature effects. In this summary plot, the order of the columns still represents the amount of information the column is accountable for in the prediction. Each dot in the visualization represents one prediction. The color is related to the real data point. If the actual value in the dataset was high, the color is pink; blue indicates the actual value being low. The x-axis represents the SHAP value, which is the impact on the model output. The model output 1 equates to the prediction of successful; 0 the prediction that the project is going to fail.

Decision Plot¶

0¶

pd.DataFrame(X_test.iloc[0,:]).transpose()

# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 

shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[0,:],
    features=X_test.iloc[0,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

shap.decision_plot(explainer.expected_value, shap_values, X_display.loc[features.index])

25¶

pd.DataFrame(X_test.iloc[25,:]).transpose()

# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[25,:],
    features=X_test.iloc[25,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

250¶

pd.DataFrame(X_test.iloc[250,:]).transpose()

# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 

row = 2500
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[250,:],
    features=X_test.iloc[250,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

2500¶

pd.DataFrame(X_test.iloc[2500,:]).transpose()

# We provide new_base_value as the cutoff probability for the classification mode
# This is done to increase the interpretability of the plot 

row = 2500
shap.decision_plot(
    base_value=explainer.expected_value[0],
    shap_values=shap_values[2500,:],
    features=X_test.iloc[2500,:],
    feature_names=X_test.columns.tolist(),
    link="identity",
    new_base_value=0.5,
)

Dependence Plot¶

Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extenstion of the classical parital dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature’s value was NaN.

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("urbanPopulation", shap_values, X_test)

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("e1_0_action", shap_values, X_test)

shap.common.approximate_interactions is used to pick what seems to be the strongest interaction.

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("cvd_death_rate", shap_values, X_test)

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("gdp_per_capita", shap_values, X_test)

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("c4_4_action", shap_values, X_test)

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("male_smokers", shap_values, X_test)

# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("hospital_beds_per_thousand", shap_values, X_test)

# create a dependence plot to show the effect of a single feature across the whole dataset

shap.dependence_plot("c3_2_action", shap_values, X_test)
# plt.savefig('Figures_high_quality/dependence_plot_c3_2.pdf')

<Figure size 432x288 with 0 Axes>

Interaction Between Bariables¶

X_train.head(2)

urbanPopulation & cvd_death_rate¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['urbanPopulation', 'cvd_death_rate']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['urbanPopulation', 'cvd_death_rate'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

inter1 = pdp.pdp_interact(
    model=model, dataset=X_train[X_train.columns], 
    model_features=X_train.columns, features=['urbanPopulation', 'cvd_death_rate'])
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1, feature_names=['urbanPopulation', 'cvd_death_rate'], 
    plot_type='grid', x_quantile=True, plot_pdp=True)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['urbanPopulation', 'cvd_death_rate'], feature_names=['urbanPopulation', 'cvd_death_rate'])

e1_0_action & healthExpenditure¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['e1_0_action', 'healthExpenditure']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['e1_0_action', 'healthExpenditure'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['e1_0_action', 'healthExpenditure'], feature_names=['e1_0_action', 'healthExpenditure'])

gdp_per_capita & cvd_death_rate¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['gdp_per_capita', 'cvd_death_rate']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['gdp_per_capita', 'cvd_death_rate'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['gdp_per_capita', 'cvd_death_rate'], feature_names=['gdp_per_capita', 'cvd_death_rate'])

c4_4_action & diabetes_prevalence¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['c4_4_action', 'diabetes_prevalence']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['c4_4_action', 'diabetes_prevalence'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['c4_4_action', 'diabetes_prevalence'], feature_names=['c4_4_action', 'diabetes_prevalence'])

male_smokers && gdp_per_capita¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['male_smokers', 'gdp_per_capita']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['male_smokers', 'gdp_per_capita'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['male_smokers', 'gdp_per_capita'], feature_names=['male_smokers', 'gdp_per_capita'])

hospital_beds_per_thousand & male_smokers¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['male_smokers', 'hospital_beds_per_thousand']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['male_smokers', 'hospital_beds_per_thousand'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['male_smokers', 'hospital_beds_per_thousand'], feature_names=['male_smokers', 'hospital_beds_per_thousand'])

c3_2_action & urbanPopulation¶

inter1 = pdp.pdp_interact(
model=model, dataset=X_train, model_features=X_train.columns, features=['c3_2_action', 'urbanPopulation']
)

fig, axes = pdp.pdp_interact_plot(
pdp_interact_out=inter1,
    feature_names=['c3_2_action', 'urbanPopulation'], 
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True
)

fig, axes, summary_df = info_plots.actual_plot_interact(
model=model, X=X_train, features=['c3_2_action', 'urbanPopulation'], feature_names=['c3_2_action', 'urbanPopulation'])

Local Surrogate (LIME)¶

LIME basically tries to step away from deriving the importance of global features and instead approximates the importance of features for local predictions. It does so by taking the row (or set of data points) from which to predict and generate fake data based on that row. It then calculates the similarity between the fake data and the real data and approximates the effect of the changes based on the similarity between the fake and real data

import lime
import lime.lime_tabular

explainer1 = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, verbose=True, mode='regression')

25¶

pd.DataFrame(X_test.iloc[25,:]).transpose()

i = 25
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_3000.pdf')

Intercept 0.4629940975312947
Prediction_local [0.02338742]
Right: 0.43631627172114307

<Figure size 1600x800 with 0 Axes>

blue -> negative influence

orange -> positive influence

250¶

pd.DataFrame(X_test.iloc[250,:]).transpose()

i = 250
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_250.pdf')

Intercept 0.15037628302273087
Prediction_local [0.83037792]
Right: 0.18410775678020885

<Figure size 1600x800 with 0 Axes>

2500¶

pd.DataFrame(X_test.iloc[2500,:]).transpose()

i = 2500
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_2500.pdf')

Intercept 0.32360037315707413
Prediction_local [0.17171314]
Right: -0.5648482808748628

<Figure size 1600x800 with 0 Axes>

3000¶

pd.DataFrame(X_test.iloc[3000,:]).transpose()

i = 3000
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_3000.pdf')

Intercept 0.42599367919267617
Prediction_local [0.07624511]
Right: -0.10981900162251022

<Figure size 1600x800 with 0 Axes>

4688¶

pd.DataFrame(X_test.iloc[4688,:]).transpose()

i = 4688
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=10)
exp.show_in_notebook(show_table=True)

Intercept 0.10019342237355466
Prediction_local [0.74153209]
Right: 0.5718384449800371

Individual Conditional Expectation (ICE)¶

How does the prediction change when 1 feature changes ? Individual Conditional Expectation, as its name suggests, is a plot that shows how a change in an individual feature changes the outcome of each individual prediction (one line per prediction). It can be used for regression tasks only.

model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

from pycebox.ice import ice, ice_plot
ice_df = ice(X_train, 'urbanPopulation', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('urbanPopulation');

from pycebox.ice import ice, ice_plot
ice_df = ice(X_train, 'gdp_per_capita', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('gdp_per_capita');

if gdp_per_capita is higher than 20000, then

ice_df = ice(X_train, 'e1_0_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('e1_0_action');

ice_df = ice(X_train, 'c4_4_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('c4_4_action');

ice_df = ice(X_train, 'cvd_death_rate', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('cvd_death_rate');

ice_df = ice(X_train, 'hospital_beds_per_thousand', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('hospital_beds_per_thousand');

ice_df = ice(X_train, 'male_smokers', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('male_smokers');

ice_df = ice(X_train, 'c3_2_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('c3_2_action');

Weather_situation¶

from pdpbox import pdp, get_dataset, info_plots
import matplotlib.font_manager
ross_data = df_train
ross_features = X_train.columns
ross_model = model
ross_target = "Log_new_cases_per_million"

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature=['clear-day', 'clear-night', 'cloudy', 'partly-cloudy-day','partly-cloudy-night','rain','wind-fog-snow'], 
    feature_name='Weather Situation', target=ross_target,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/weather_situation_target_plot.pdf')

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday','Sunday'], 
    feature_name='Day of week', target=ross_target,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/Day_of_week_target_plot.pdf')

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature=['continent_Africa', 'continent_Asia', 'continent_North America', 'continent_Oceania', 'continent_South America', 'continent_Europe'], 
    feature_name='Continents', target=ross_target,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/continents_target_plot.pdf')

gdp_per_capita¶

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, 
    feature='gdp_per_capita', 
    feature_name='gdp_per_capita', 
    target=ross_target,
    percentile_range=(0,100), 
    show_percentile=True,
    show_outliers = True,
    figsize = (20,10)
)
plt.savefig('Figures_high_quality/gdp_per_capita_target_plot.pdf')

fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='gdp_per_capita',
    feature_name='gdp_per_capita', 
    show_percentile=True,
    figsize = (20,10),
    predict_kwds={}
)
plt.savefig('Figures_high_quality/gdp_per_capita_actual_plot.pdf')

pdp_gdp = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='gdp_per_capita'
)
fig, axes = pdp.pdp_plot(pdp_gdp, 'gdp_per_capita')

fig, axes = pdp.pdp_plot(
    pdp_gdp, 'gdp_per_capita', 
    plot_lines=True, 
    frac_to_plot=2000, 
    x_quantile=True,
    figsize = (20,10), 
    plot_pts_dist=True, 
    show_percentile=True
)
plt.savefig('Figures_high_quality/gdp_per_capita_pdp_plot.pdf')

On the x-axis, you can find the values of the feature you are trying to understand, i.e., gdp_per_capita. On the y-axis, you find the prediction (in the case of regression, it is the real-valued prediction).

The bar on the bottom represents the distribution of training data points in different quantiles. It helps us gauge the goodness of the inference.

The single line in the PD plot shows the average functional relationship between the feature and target. All the lines in the ICE plot show the heterogeneity in the training data, i.e., how all the observations in the training data vary with the different values of the gdp_per_capita

The yellow and black line gives us the average effect on the predictions.

fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['gdp_per_capita', ['e1_0_action','cvd_death_rate', 'c4_4_action', 'c7_2_action','urbanPopulation']], 
    feature_names=['gdp_per_capita', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/gdp_per_capita_target_plot_interact.pdf')

Interaction between gdp_per_capita and different actions ('e1_0_action','cvd_death_rate', 'c4_4_action', 'c7_2_action')

The darker color means the higher prediction value (log_total_new_cases_milion) & the bubble size is of less importance

e1_0_action¶

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature='e1_0_action', 
    feature_name='e1_0_action',
    target=ross_target,
    show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_target_plot.pdf')

fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='e1_0_action',
    feature_name='e1_0_action', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_actual_plot.pdf')

pdp_gdp = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='e1_0_action'
)
fig, axes = pdp.pdp_plot(pdp_gdp, 'e1_0_action')

fig, axes = pdp.pdp_plot(
    pdp_gdp, 'e1_0_action', plot_lines=True, frac_to_plot=2000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_pdp_plot.pdf')

fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['e1_0_action', ['gdp_per_capita','c4_4_action', 'c7_2_action', 'cvd_death_rate','urbanPopulation']], 
    feature_names=['e1_0_action', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/e1_0_action_target_plot_interact.pdf')

c4_4_action¶

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, feature='c4_4_action', feature_name='c4_4_action', target=ross_target, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_target_plot.pdf')

fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='c4_4_action',
    feature_name='c4_4_action', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_actual_plot.pdf')

pdp_h1 = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='c4_4_action'
)
fig, axes = pdp.pdp_plot(pdp_gdp, 'c4_4_action')

fig, axes = pdp.pdp_plot(
    pdp_h1, 'c4_4_action', plot_lines=True, frac_to_plot=3000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_actual_plot.pdf')

fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['c4_4_action', ['gdp_per_capita','e1_0_action', 'c7_2_action', 'cvd_death_rate','urbanPopulation']], 
    feature_names=['c4_4_action', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/c4_4_action_pdp_plot.pdf')

cvd_death_rate¶

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data,
    feature='cvd_death_rate',
    feature_name='cvd_death_rate', 
    target=ross_target,
    show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_target_plot.pdf')

fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='cvd_death_rate',
    feature_name='cvd_death_rate', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_actual_plot.pdf')

pdp_cvd = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='cvd_death_rate'
)

fig, axes = pdp.pdp_plot(pdp_gdp, 'cvd_death_rate')

fig, axes = pdp.pdp_plot(
    pdp_cvd, 'cvd_death_rate', plot_lines=True, frac_to_plot=2000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_pdp_plot.pdf')

fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['cvd_death_rate', ['e1_0_action','gdp_per_capita','c4_4_action', 'urbanPopulation']], 
    feature_names=['cvd_death_rate', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/cvd_death_rate_target_plot_interact.pdf')

hospital_beds_per_thousand¶

fig, axes, summary_df = info_plots.target_plot(
    df=ross_data, 
    feature='hospital_beds_per_thousand', 
    feature_name='hospital_beds_per_thousand', 
    target=ross_target, 
    show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_target_plot.pdf')

fig, axes, summary_df = info_plots.actual_plot(
    model=ross_model, X= ross_data[ross_features], 
    feature='hospital_beds_per_thousand',
    feature_name='hospital_beds_per_thousand', 
    show_percentile=True,
    predict_kwds={},
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_actual_plot.pdf')

pdp_cvd = pdp.pdp_isolate(
    model=ross_model, dataset=ross_data, model_features=ross_features, feature='cvd_death_rate'
)

fig, axes = pdp.pdp_plot(pdp_gdp, 'hospital_beds_per_thousand')

fig, axes = pdp.pdp_plot(
    pdp_cvd, 'hospital_beds_per_thousand', plot_lines=True, frac_to_plot=2000, x_quantile=True, 
    plot_pts_dist=True, show_percentile=True,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_pdp_plot.pdf')

fig, axes, summary_df = info_plots.target_plot_interact(
    df=ross_data, features=['hospital_beds_per_thousand', ['e1_0_action','gdp_per_capita','c4_4_action', 'urbanPopulation']], 
    feature_names=['hospital_beds_per_thousand', 'Variables'], target=ross_target,
    figsize=(20,10)
)
plt.savefig('Figures_high_quality/hospital_beds_per_thousand_target_plot_interact.pdf')

from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="         "></form>''')

Decison Tree Interpretebility¶

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import graphviz
from sklearn.metrics import r2_score
X_train.iloc[:, [31,60]].head(0)

clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
r2_score(y_pred, y_test)
export_graphviz(clf, out_file="tree.dot")
with open("tree.dot") as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

	Days since first case	c1_0_action	c1_1_action	c1_2_action	c1_3_action	c2_0_action	c2_1_action	c2_2_action	c2_3_action	c3_0_action	c3_1_action	c3_2_action	c4_0_action	c4_1_action	c4_2_action	c4_3_action	c4_4_action	c5_0_action	c5_1_action	c5_2_action	c6_0_action	c6_1_action	c6_2_action	c6_3_action	c7_0_action	c7_1_action	c7_2_action	c8_0_action	c8_1_action	c8_2_action	c8_4_action	e1_0_action	e1_1_action	e2_0_action	e2_1_action	e2_2_action	h1_0_action	h1_1_action	h2_0_action	h2_1_action	h2_2_action	h2_3_action	h3_0_action	h3_1_action	h3_2_action	Log_new_cases_per_million	Outputvalue_lag1	Outputvalue_lag2	Outputvalue_lag3	Outputvalue_lag4	Outputvalue_lag5	Outputvalue_lag6	Outputvalue_lag7	population_density	gdp_per_capita	cvd_death_rate	diabetes_prevalence	handwashing_facilities	hospital_beds_per_thousand	female_smokers	male_smokers	life_expectancy	aged_65_older_sum	humidity	temperature	windSpeed	Number of Tweet	Sentiments	urbanPopulation	healthExpenditure	weather_situation_clear-day	weather_situation_clear-night	weather_situation_cloudy	weather_situation_partly-cloudy-day	weather_situation_partly-cloudy-night	weather_situation_rain	weather_situation_wind-fog-snow	Day_of_Week_0	Day_of_Week_1	Day_of_Week_2	Day_of_Week_3	Day_of_Week_4	Day_of_Week_5	Day_of_Week_6	continent_Africa	continent_Asia	continent_Europe	continent_North America	continent_Oceania	continent_South America
0	138	0	0	30	0	0	28	0	0	0	28	0	0	0	44	0	0	65	0	0	0	7	0	0	44	0	0	0	0	0	0	0	0	0	0	107	0	0	0	0	78	0	0	106	0	1.374803	1.559703	1.48817	1.504022	1.460146	1.314647	1.113776	1.308031	85.129	23313.199	370.946	9.74	86.979	6.892	22.9	37.1	76.05	14.7700	0.65	64.78	3.66	1.0	0.049127	54.084	4.053393	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
1	139	0	0	0	134	0	12	0	0	0	0	125	0	0	0	0	124	0	0	123	0	0	34	0	0	0	64	0	0	0	122	0	103	0	67	0	0	0	0	0	124	0	0	0	128	2.238746	2.290159	2.25147	2.148390	2.216469	2.174336	2.169545	2.279605	232.128	65530.537	132.235	15.84	97.400	2.000	2.7	37.0	75.49	1.7295	0.21	103.35	14.72	1.0	0.000000	100.000	4.623860	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0

	Days since first case	c1_0_action	c1_1_action	c1_2_action	c1_3_action	c2_0_action	c2_1_action	c2_2_action	c2_3_action	c3_0_action	c3_1_action	c3_2_action	c4_0_action	c4_1_action	c4_2_action	c4_3_action	c4_4_action	c5_0_action	c5_1_action	c5_2_action	c6_0_action	c6_1_action	c6_2_action	c6_3_action	c7_0_action	c7_1_action	c7_2_action	c8_0_action	c8_1_action	c8_2_action	c8_4_action	e1_0_action	e1_1_action	e2_0_action	e2_1_action	e2_2_action	h1_0_action	h1_1_action	h2_0_action	h2_1_action	h2_2_action	h2_3_action	h3_0_action	h3_1_action	h3_2_action	Log_new_cases_per_million	population_density	gdp_per_capita	cvd_death_rate	diabetes_prevalence	handwashing_facilities	hospital_beds_per_thousand	female_smokers	male_smokers	life_expectancy	aged_65_older_sum	humidity	temperature	windSpeed	Number of Tweet	Sentiments	urbanPopulation	healthExpenditure	clear-day	clear-night	cloudy	partly-cloudy-day	partly-cloudy-night	rain	wind-fog-snow	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday	continent_Africa	continent_Asia	continent_Europe	continent_North America	continent_Oceania	continent_South America
0	138	0	0	30	0	0	28	0	0	0	28	0	0	0	44	0	0	65	0	0	0	7	0	0	44	0	0	0	0	0	0	0	0	0	0	107	0	0	0	0	78	0	0	106	0	1.374803	85.129	23313.199	370.946	9.74	86.979	6.892	22.9	37.1	76.05	14.7700	0.65	64.78	3.66	1.0	0.049127	54.084	4.053393	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
1	139	0	0	0	134	0	12	0	0	0	0	125	0	0	0	0	124	0	0	123	0	0	34	0	0	0	64	0	0	0	122	0	103	0	67	0	0	0	0	0	124	0	0	0	128	2.238746	232.128	65530.537	132.235	15.84	97.400	2.000	2.7	37.0	75.49	1.7295	0.21	103.35	14.72	1.0	0.000000	100.000	4.623860	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0

	Days since first case	c1_0_action	c1_1_action	c1_2_action	c1_3_action	c2_0_action	c2_1_action	c2_2_action	c2_3_action	c3_0_action	c3_1_action	c3_2_action	c4_0_action	c4_1_action	c4_2_action	c4_3_action	c4_4_action	c5_0_action	c5_1_action	c5_2_action	c6_0_action	c6_1_action	c6_2_action	c6_3_action	c7_0_action	c7_1_action	c7_2_action	c8_0_action	c8_1_action	c8_2_action	c8_4_action	e1_0_action	e1_1_action	e2_0_action	e2_1_action	e2_2_action	h1_0_action	h1_1_action	h2_0_action	h2_1_action	h2_2_action	h2_3_action	h3_0_action	h3_1_action	h3_2_action	population_density	gdp_per_capita	cvd_death_rate	diabetes_prevalence	handwashing_facilities	hospital_beds_per_thousand	female_smokers	male_smokers	life_expectancy	aged_65_older_sum	humidity	temperature	windSpeed	Number of Tweet	Sentiments	urbanPopulation	healthExpenditure	clear-day	clear-night	cloudy	partly-cloudy-day	partly-cloudy-night	rain	wind-fog-snow	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday	continent_Africa	continent_Asia	continent_Europe	continent_North America	continent_Oceania	continent_South America
0	138	0	0	30	0	0	28	0	0	0	28	0	0	0	44	0	0	65	0	0	0	7	0	0	44	0	0	0	0	0	0	0	0	0	0	107	0	0	0	0	78	0	0	106	0	85.129	23313.199	370.946	9.74	86.979	6.892	22.9	37.1	76.05	14.7700	0.65	64.78	3.66	1.0	0.049127	54.084	4.053393	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
1	139	0	0	0	134	0	12	0	0	0	0	125	0	0	0	0	124	0	0	123	0	0	34	0	0	0	64	0	0	0	122	0	103	0	67	0	0	0	0	0	124	0	0	0	128	232.128	65530.537	132.235	15.84	97.400	2.000	2.7	37.0	75.49	1.7295	0.21	103.35	14.72	1.0	0.000000	100.000	4.623860	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0

Weight	Feature
0.4624 ± 0.0211	urbanPopulation
0.1408 ± 0.0067	cvd_death_rate
0.0871 ± 0.0075	c4_4_action
0.0869 ± 0.0100	e1_0_action
0.0857 ± 0.0163	hospital_beds_per_thousand
0.0524 ± 0.0095	male_smokers
0.0494 ± 0.0024	population_density
0.0250 ± 0.0049	e2_2_action
0.0204 ± 0.0052	c2_0_action
0.0168 ± 0.0027	c3_2_action
0.0154 ± 0.0008	life_expectancy
0.0153 ± 0.0015	gdp_per_capita
0.0133 ± 0.0017	Number of Tweet
0.0117 ± 0.0025	aged_65_older_sum
0.0107 ± 0.0006	c7_2_action
0.0102 ± 0.0021	h2_2_action
0.0101 ± 0.0022	c1_3_action
0.0095 ± 0.0063	c8_4_action
0.0076 ± 0.0024	e1_1_action
0.0066 ± 0.0031	h2_3_action
0.0065 ± 0.0025	h3_1_action
0.0056 ± 0.0011	c6_2_action
0.0053 ± 0.0018	c2_2_action
0.0049 ± 0.0013	c6_1_action
0.0049 ± 0.0013	handwashing_facilities
0.0045 ± 0.0021	temperature
0.0035 ± 0.0004	c5_0_action
0.0032 ± 0.0018	Thursday
0.0030 ± 0.0012	healthExpenditure
0.0029 ± 0.0006	female_smokers
0.0028 ± 0.0009	c5_2_action
0.0025 ± 0.0004	humidity
0.0018 ± 0.0001	h1_1_action
0.0018 ± 0.0010	c5_1_action
0.0017 ± 0.0009	h3_2_action
0.0017 ± 0.0004	c1_0_action
0.0015 ± 0.0002	continent_Europe
0.0015 ± 0.0011	h2_1_action
0.0013 ± 0.0009	c7_0_action
0.0012 ± 0.0005	Days since first case
0.0012 ± 0.0004	c3_1_action
0.0011 ± 0.0006	Wednesday
0.0011 ± 0.0006	c2_3_action
0.0010 ± 0.0005	Monday
0.0009 ± 0.0010	continent_Asia
0.0009 ± 0.0002	Tuesday
0.0008 ± 0.0014	c4_3_action
0.0007 ± 0.0002	c1_1_action
0.0006 ± 0.0011	c2_1_action
0.0005 ± 0.0006	Saturday
… 32 more …

	feature	weight	std
0	urbanPopulation	0.462448	0.0105442
1	cvd_death_rate	0.140767	0.00336957
2	c4_4_action	0.0870728	0.00376409
3	e1_0_action	0.0869415	0.00500541
4	hospital_beds_per_thousand	0.085703	0.00816721
5	male_smokers	0.0523851	0.00476707
6	population_density	0.0493625	0.00121114
7	e2_2_action	0.0249648	0.00242754
8	c2_0_action	0.0204493	0.00257985
9	c3_2_action	0.0167923	0.0013408
10	life_expectancy	0.0154441	0.000405461
11	gdp_per_capita	0.015254	0.000770447
12	Number of Tweet	0.0133228	0.000834265
13	aged_65_older_sum	0.0116758	0.00124254
14	c7_2_action	0.0106635	0.000290369
15	h2_2_action	0.0101597	0.00105717
16	c1_3_action	0.0101119	0.00109259
17	c8_4_action	0.00949325	0.00313744
18	e1_1_action	0.00761092	0.00119672
19	h2_3_action	0.00662889	0.00155727
20	h3_1_action	0.00650767	0.00126211
21	c6_2_action	0.00560097	0.000530179
22	c2_2_action	0.00526331	0.000920847
23	c6_1_action	0.00494054	0.000635613
24	handwashing_facilities	0.00492399	0.000627038
25	temperature	0.00449942	0.0010565
26	c5_0_action	0.00353339	0.000198231
27	Thursday	0.00318915	0.000893301
28	healthExpenditure	0.00297727	0.000575215
29	female_smokers	0.00286985	0.000302607
30	c5_2_action	0.00278839	0.000463958
31	humidity	0.00245344	0.000207741
32	h1_1_action	0.00182261	3.6703e-05
33	c5_1_action	0.00175089	0.000491127
34	h3_2_action	0.00168831	0.000439305
35	c1_0_action	0.00165642	0.000179647
36	continent_Europe	0.0015024	0.000106517
37	h2_1_action	0.00145065	0.000553079
38	c7_0_action	0.00127572	0.000448311
39	Days since first case	0.0012389	0.000266965
40	c3_1_action	0.00118112	0.000213671
41	Wednesday	0.00109696	0.000298284
42	c2_3_action	0.00106179	0.000300479
43	Monday	0.00104755	0.000259711
44	continent_Asia	0.000900208	0.000476615
45	Tuesday	0.00087217	8.8122e-05
46	c4_3_action	0.000806217	0.000683279
47	c1_1_action	0.000736951	8.16205e-05
48	c2_1_action	0.000587083	0.000540738
49	Saturday	0.000520235	0.00030409

Contribution^?	Feature	Value
+0.460	<BIAS>	1.000
+0.284	h3_1_action	152.000
+0.193	cvd_death_rate	597.029
+0.092	population_density	54.422
+0.086	c3_2_action	147.000
+0.063	windSpeed	2.920
+0.051	Days since first case	151.000
+0.043	humidity	0.290
+0.041	healthExpenditure	0.600
+0.032	gdp_per_capita	1803.987
+0.029	diabetes_prevalence	9.590
+0.019	handwashing_facilities	37.746
+0.011	Monday	0.000
+0.008	life_expectancy	64.830
+0.008	c8_4_action	0.000
+0.007	female_smokers	4.805
+0.006	c6_2_action	123.000
+0.005	aged_65_older_sum	1.959
+0.004	c6_1_action	0.000
+0.004	Sentiments	0.002
+0.003	e2_1_action	0.000
+0.002	male_smokers	39.697
+0.002	c7_1_action	0.000
+0.002	Saturday	1.000
+0.001	Thursday	0.000
+0.001	Wednesday	0.000
+0.000	cloudy	0.000
+0.000	Tuesday	0.000
-0.000	clear-day	1.000
-0.001	Friday	0.000
-0.001	c2_2_action	0.000
-0.002	c4_3_action	0.000
-0.009	Sunday	0.000
-0.012	h2_1_action	152.000
-0.014	c5_1_action	0.000
-0.017	hospital_beds_per_thousand	0.500
-0.017	Number of Tweet	82.182
-0.023	c2_3_action	124.000
-0.024	e1_1_action	0.000
-0.027	c8_1_action	47.000
-0.029	e2_2_action	0.000
-0.036	c7_2_action	105.000
-0.039	c5_2_action	120.000
-0.053	e2_0_action	152.000
-0.063	c1_3_action	134.000
-0.067	c4_4_action	112.000
-0.082	h2_2_action	0.000
-0.144	urbanPopulation	25.754
-0.286	e1_0_action	152.000
-0.402	temperature	79.410