In [183]:
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [184]:
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
In [185]:
# Load train data
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv")
# Remove column
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
df_train = df_train.drop(remove_columns, axis=1)

### Combine levels of weather_situation
df_train['weather_situation'].value_counts()/len(df_train)*100  # percentage of each level
# Collapse levels of a categorical variable that account for less than threshold % of rows
def cut_levels(x, threshold, new_value):
    percentage = x.value_counts()/len(x)*100
    labels = percentage.index[percentage < threshold]
    x[np.in1d(x, labels)] = new_value
cut_levels(df_train.weather_situation, 5, 'wind-fog-snow')


# Apply the same steps to the test data
df_test  = pd.read_csv("official_train_test_split/80-20/test_set.csv") 

df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage

cut_levels(df_test.weather_situation, 5, 'wind-fog-snow') 

### Remove highly correlated variables: 
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)
# One-hot encode categorical variables
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix = cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix = cat_columns)


# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)
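Because pd.get_dummies is applied to the train and test frames separately, a level present in only one of them would leave the two frames with different dummy columns. A minimal sanity check (not from the original notebook) that aligns the test columns to the train columns, assuming any dummy absent from the test set should simply be 0:

# Verify both frames expose identical columns after encoding and feature dropping.
print(set(df_train.columns) ^ set(df_test.columns))  # symmetric difference; expected to be empty
# If the sets differ, align df_test to df_train's columns, filling absent dummies with 0.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)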
In [186]:
print(df_train.shape, df_test.shape)
(18507, 90) (4689, 90)
In [188]:
df_train.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)

df_test.rename(columns={'weather_situation_clear-day': 'clear-day',  
                         'weather_situation_cloudy': 'cloudy', 
                         'weather_situation_partly-cloudy-day': 'partly-cloudy-day', 
                         'weather_situation_partly-cloudy-night': 'partly-cloudy-night', 
                         'weather_situation_rain': 'rain',
                         'weather_situation_clear-night': 'clear-night',
                         'weather_situation_wind-fog-snow':'wind-fog-snow'}, inplace=True)
In [189]:
df_train.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)

df_test.rename(columns={'Day_of_Week_0': 'Monday',  
                         'Day_of_Week_1': 'Tuesday', 
                         'Day_of_Week_2': 'Wednesday', 
                         'Day_of_Week_3': 'Thursday', 
                         'Day_of_Week_4': 'Friday',
                         'Day_of_Week_5': 'Saturday',
                         'Day_of_Week_6': 'Sunday'}, inplace=True)
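The cells that split df_train/df_test into feature matrices and target vectors are not shown in this export. A minimal sketch of that step, assuming the target is the log-transformed new cases per million stored under the (hypothetical) column name Log_new_cases_per_million, matching the y-axis label used on the ICE plots below:

# Hypothetical reconstruction of the feature/target split; the target column name is an assumption.
target_col = 'Log_new_cases_per_million'
X_train, y_train = df_train.drop(target_col, axis=1), df_train[target_col]
X_test, y_test = df_test.drop(target_col, axis=1), df_test[target_col]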
In [192]:
X_train.head(2)
Out[192]:
[Output: first two rows of X_train (80 feature columns): 'Days since first case'; the c1–c8, e1–e2 and h1–h3 action-duration features; country indicators (population_density, gdp_per_capita, cvd_death_rate, diabetes_prevalence, handwashing_facilities, hospital_beds_per_thousand, female_smokers, male_smokers, life_expectancy, aged_65_older_sum, urbanPopulation, healthExpenditure); weather (humidity, temperature, windSpeed); Number of Tweet; Sentiments; and the one-hot dummies for weather situation, day of week, and continent.]

Partial Dependence Plot (PDP)

A partial dependence plot shows the average effect of a single feature on the model's prediction, averaging over the remaining features.

In [211]:
from sklearn.inspection import plot_partial_dependence
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
Out[211]:
RandomForestRegressor(random_state=0)
In [212]:
disp1 = plot_partial_dependence(model, X_train,features=[1,3])  
In [213]:
disp1 = plot_partial_dependence(model, X_train,features=[4,5])
In [214]:
disp1 = plot_partial_dependence(model, X_train,features=[6,7,8])
In [215]:
disp1 = plot_partial_dependence(model, X_train,features=[9,10,11])
In [216]:
disp1 = plot_partial_dependence(model, X_train,features=[15,16,17])
In [217]:
disp1 = plot_partial_dependence(model, X_train,features=[18,19,20])
In [218]:
disp1 = plot_partial_dependence(model, X_train,features=[21,22,23])
In [219]:
disp1 = plot_partial_dependence(model, X_train,features=[24,25,26])
In [220]:
disp1 = plot_partial_dependence(model, X_train,features=[27,29])
In [221]:
disp1 = plot_partial_dependence(model, X_train,features=[30,31,32])
In [222]:
disp1 = plot_partial_dependence(model, X_train,features=[33,35,36])
In [223]:
disp1 = plot_partial_dependence(model, X_train,features=[40,41])
In [224]:
disp1 = plot_partial_dependence(model, X_train,features=[43,44,45])
In [225]:
disp1 = plot_partial_dependence(model, X_train,features=[46,47,48])
In [226]:
disp1 = plot_partial_dependence(model, X_train,features=[49,50,51])
In [227]:
disp1 = plot_partial_dependence(model, X_train,features=[52,53,54])
In [228]:
disp1 = plot_partial_dependence(model, X_train,features=[55,56,57])
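plot_partial_dependence was deprecated in scikit-learn 1.0 and removed in 1.2. A sketch of the equivalent call with the newer PartialDependenceDisplay API (assuming scikit-learn >= 1.0), which also accepts column names when X_train is a DataFrame:

from sklearn.inspection import PartialDependenceDisplay

# Same kind of plot via the current API; features may be given by name instead of index.
PartialDependenceDisplay.from_estimator(model, X_train, features=['gdp_per_capita', 'urbanPopulation'])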

Individual Conditional Expectation (ICE)

How does the prediction change when one feature changes? An Individual Conditional Expectation plot, as its name suggests, shows how varying a single feature changes the model's output for each individual observation (one line per observation). Here it is applied to the regression output, the log of new cases per million.
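A hand-rolled sketch of the idea (illustrative only, not pycebox's implementation): copy one observation once per grid value, sweep the feature of interest across the grid while holding everything else fixed, and record the prediction at each grid value; doing this for every observation gives one curve per row.

# Hypothetical helper illustrating how ICE curves are computed.
def manual_ice(model, X, feature, grid):
    curves = []
    for _, row in X.iterrows():
        row_grid = pd.DataFrame([row] * len(grid))  # replicate the observation once per grid value
        row_grid[feature] = grid                    # vary only the feature of interest
        curves.append(model.predict(row_grid))      # one prediction per grid value
    return np.array(curves)                         # shape: (n_observations, n_grid_points)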

In [271]:
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
Out[271]:
RandomForestRegressor(random_state=0)
In [272]:
from pycebox.ice import ice, ice_plot
ice_df = ice(X_train, 'urbanPopulation', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('urbanPopulation');
In [273]:
from pycebox.ice import ice, ice_plot
ice_df = ice(X_train, 'gdp_per_capita', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('gdp_per_capita');

If gdp_per_capita is higher than 20,000, then ...

In [274]:
ice_df = ice(X_train, 'e1_0_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('e1_0_action');
In [275]:
ice_df = ice(X_train, 'c4_4_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('c4_4_action');
In [276]:
ice_df = ice(X_train, 'cvd_death_rate', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('cvd_death_rate');
In [277]:
ice_df = ice(X_train, 'hospital_beds_per_thousand', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('hospital_beds_per_thousand');
In [278]:
ice_df = ice(X_train, 'male_smokers', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('male_smokers');
In [279]:
ice_df = ice(X_train, 'c3_2_action', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3)
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('c3_2_action');
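Averaging the ICE curves recovers the partial dependence curve. pycebox can overlay that average on the individual lines via its plot_pdp/pdp_kwargs options; a sketch, assuming pycebox's documented API:

# Overlay the mean of the ICE curves (the PDP) in red on top of the individual grey lines.
ice_df = ice(X_train, 'gdp_per_capita', model.predict, num_grid_points=100)
ice_plot(ice_df, c='dimgray', linewidth=0.3, plot_pdp=True, pdp_kwargs={'c': 'red', 'linewidth': 3})
plt.ylabel('Log_new_cases_per_million')
plt.xlabel('gdp_per_capita');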