In [183]:
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [184]:
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
In [185]:
# Load train data
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv")
# Remove column
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
df_train = df_train.drop(remove_columns, axis=1)

### Combine level of weather situation 
df_train['weather_situation'].value_counts()/len(df_train)*100 # percentage
# Combine level of caegorical variables that is < 5%
def cut_levels(x, threshold, new_value):
    percentage = x.value_counts()/len(x)*100
    labels = percentage.index[percentage < threshold]
    x[np.in1d(x, labels)] = new_value
cut_levels(df_train.weather_situation, 5, 'wind-fog-snow') 


# Apply similar step to test data 
df_test  = pd.read_csv("official_train_test_split/80-20/test_set.csv") 

df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage

cut_levels(df_test.weather_situation, 5, 'wind-fog-snow') 

### Remove highly correlated variables: 
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)
# Encoding categorical variables 
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix = cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix = cat_columns)


# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)
In [186]:
print(df_train.shape, df_test.shape)
(18507, 90) (4689, 90)
In [188]:
df_train.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)

df_test.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)
In [189]:
df_train.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)

df_test.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)
In [192]:
X_train.head(2)
Out[192]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
0 138 0 0 30 0 0 28 0 0 0 28 0 0 0 44 0 0 65 0 0 0 7 0 0 44 0 0 0 0 0 0 0 0 0 0 107 0 0 0 0 78 0 0 106 0 85.129 23313.199 370.946 9.74 86.979 6.892 22.9 37.1 76.05 14.7700 0.65 64.78 3.66 1.0 0.049127 54.084 4.053393 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 139 0 0 0 134 0 12 0 0 0 0 125 0 0 0 0 124 0 0 123 0 0 34 0 0 0 64 0 0 0 122 0 103 0 67 0 0 0 0 0 124 0 0 0 128 232.128 65530.537 132.235 15.84 97.400 2.000 2.7 37.0 75.49 1.7295 0.21 103.35 14.72 1.0 0.000000 100.000 4.623860 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0

Local Surrogate (LIME)

LIME basically tries to step away from deriving the importance of global features and instead approximates the importance of features for local predictions. It does so by taking the row (or set of data points) from which to predict and generate fake data based on that row. It then calculates the similarity between the fake data and the real data and approximates the effect of the changes based on the similarity between the fake and real data

In [258]:
import lime
import lime.lime_tabular
In [259]:
explainer1 = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, verbose=True, mode='regression')

25

In [260]:
pd.DataFrame(X_test.iloc[25,:]).transpose()
Out[260]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
25 176.0 0.0 0.0 15.0 0.0 0.0 0.0 16.0 0.0 5.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 145.0 0.0 0.0 148.0 0.0 0.0 2.0 0.0 0.0 72.0 0.0 0.0 177.0 0.0 177.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 2.0 54.422 1803.987 597.029 9.59 37.746 0.5 4.805263 39.697368 64.83 1.959 0.32 76.96 2.22 1.0 0.319582 25.754 0.600144 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [406]:
i = 25
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_3000.pdf')
Intercept 0.4629940975312947
Prediction_local [0.02338742]
Right: 0.43631627172114307
<Figure size 1600x800 with 0 Axes>

blue -> negative influence

orange -> positive influence

250

In [263]:
pd.DataFrame(X_test.iloc[250,:]).transpose()
Out[263]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
250 209.0 0.0 0.0 0.0 19.0 0.0 0.0 0.0 20.0 0.0 0.0 52.0 0.0 0.0 0.0 0.0 45.0 0.0 96.0 0.0 0.0 0.0 0.0 20.0 0.0 0.0 156.0 0.0 0.0 0.0 155.0 0.0 163.0 0.0 0.0 150.0 0.0 0.0 0.0 0.0 0.0 117.0 0.0 0.0 64.0 3.202 44648.71 107.791 5.07 65.386 3.84 13.0 16.5 83.44 12.8165 0.86 44.46 15.27 3.0 -0.010368 86.124 6.344081 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
In [405]:
i = 250
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_250.pdf')
Intercept 0.15037628302273087
Prediction_local [0.83037792]
Right: 0.18410775678020885
<Figure size 1600x800 with 0 Axes>

2500

In [265]:
pd.DataFrame(X_test.iloc[2500,:]).transpose()
Out[265]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
2500 182.0 0.0 0.0 0.0 144.0 0.0 0.0 134.0 0.0 58.0 0.0 0.0 0.0 0.0 58.0 0.0 0.0 183.0 0.0 0.0 183.0 0.0 0.0 0.0 103.0 0.0 0.0 0.0 0.0 112.0 0.0 0.0 0.0 183.0 0.0 0.0 0.0 0.0 0.0 135.0 0.0 0.0 0.0 135.0 0.0 90.672 3645.07 270.892 4.0 66.229 0.8 2.0 33.7 69.82 3.3985 0.6 92.96 4.84 1.0 -0.02828 23.805 1.408793 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
In [404]:
i = 2500
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_2500.pdf')
Intercept 0.32360037315707413
Prediction_local [0.17171314]
Right: -0.5648482808748628
<Figure size 1600x800 with 0 Axes>

3000

In [267]:
pd.DataFrame(X_test.iloc[3000,:]).transpose()
Out[267]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
3000 128.0 0.0 0.0 61.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 130.0 0.0 0.0 0.0 130.0 0.0 129.0 0.0 0.0 84.0 0.0 0.0 0.0 84.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 123.0 0.0 123.0 0.0 0.0 0.0 0.0 93.0 0.0 0.0 0.0 0.0 110.0 15.196 2014.306 268.024 2.42 52.232 0.1 1.6 23.0 59.31 2.0025 0.99 72.57 6.47 1.0 0.0 43.136 1.325374 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [403]:
i = 3000
figure(figsize=(20, 10), dpi=80)
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=5)
exp.show_in_notebook(show_table=True)

from matplotlib.pyplot import figure
plt.savefig('Figures_high_quality/LIME_3000.pdf')
Intercept 0.42599367919267617
Prediction_local [0.07624511]
Right: -0.10981900162251022
<Figure size 1600x800 with 0 Axes>

4688

In [269]:
pd.DataFrame(X_test.iloc[4688,:]).transpose()
Out[269]:
Days since first case c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_4_action e1_0_action e1_1_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments urbanPopulation healthExpenditure clear-day clear-night cloudy partly-cloudy-day partly-cloudy-night rain wind-fog-snow Monday Tuesday Wednesday Thursday Friday Saturday Sunday continent_Africa continent_Asia continent_Europe continent_North America continent_Oceania continent_South America
4688 178.0 0.0 0.0 85.0 0.0 0.0 0.0 123.0 0.0 0.0 0.0 169.0 0.0 0.0 0.0 0.0 14.0 0.0 159.0 0.0 0.0 0.0 51.0 0.0 0.0 14.0 0.0 0.0 0.0 0.0 159.0 0.0 133.0 0.0 0.0 95.0 0.0 0.0 0.0 0.0 0.0 123.0 0.0 0.0 178.0 46.754 12294.876 200.38 5.52 43.993 2.32 8.1 33.2 64.13 4.1985 0.62 43.86 4.85 148.0 0.000437 66.856 4.35277 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
In [391]:
i = 4688
exp = explainer1.explain_instance(X_test.iloc[i], model.predict, num_features=10)
exp.show_in_notebook(show_table=True)
Intercept 0.10019342237355466
Prediction_local [0.74153209]
Right: 0.5718384449800371
In [476]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="         "></form>''')
Out[476]: