import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

# Columns discarded from both the train and the test split
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']

# Read the training split and drop those columns in one step
df_train = pd.read_csv(
    "official_train_test_split/80-20/train_set.csv"
).drop(columns=remove_columns)

### Combine levels of weather situation
# Share (in percent) of each weather situation among the training rows
df_train['weather_situation'].value_counts() / len(df_train) * 100
# Combine the levels of a categorical variable whose share is < 5%
def cut_levels(x, threshold, new_value):
    """Collapse the rare levels of a Series into a single replacement level.

    Every distinct value whose share of ``len(x)`` is strictly below
    ``threshold`` percent is overwritten with ``new_value``.

    Args:
        x: pandas Series to recode. It is modified in place.
        threshold: cut-off in percent; levels with a share below it are merged.
        new_value: label written over every rare level.

    Returns:
        The same Series object ``x``, so the result can also be assigned back
        to a DataFrame column (``df[col] = cut_levels(df[col].copy(), ...)``).
    """
    # Share of each level in percent; value_counts() skips missing values
    # while len(x) counts them, exactly as before.
    percentage = x.value_counts() / len(x) * 100
    labels = percentage.index[percentage < threshold]
    # Series.isin replaces np.in1d, which NumPy deprecated in 2.0.
    x.loc[x.isin(labels)] = new_value
    return x
# Merge the rare weather levels of the training data into 'wind-fog-snow'.
# cut_levels mutates the Series it receives; work on an explicit copy and
# assign it back so the change reaches df_train even when pandas
# Copy-on-Write is active (mutating `df_train.weather_situation` directly
# would then leave the frame untouched).
weather_train = df_train['weather_situation'].copy()
cut_levels(weather_train, 5, 'wind-fog-snow')
df_train['weather_situation'] = weather_train


# Apply the same steps to the test data
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")

df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage

# NOTE(review): the rare levels are recomputed from the test frequencies, so
# they are not guaranteed to match the levels merged in train - confirm.
weather_test = df_test['weather_situation'].copy()
cut_levels(weather_test, 5, 'wind-fog-snow')
df_test['weather_situation'] = weather_test

### Remove highly correlated variables
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)

# One-hot encode the categorical variables
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend', 'continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix=cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix=cat_columns)

# get_dummies only creates a column for levels present in the frame it is
# given, so the two frames can end up with different columns when a level is
# missing from one split. Reindex test onto the training columns: dummies
# absent from test are added as all-zero columns, columns unknown to train are
# dropped, and the column order matches train.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


# Columns discarded after the Boruta feature-selection step
column_to_drop = [
    'E3_Fiscal measures',
    'E4_International support',
    'H4_Emergency investment in healthcare',
    'H5_Investment in vaccines',
    'is_weekend_0',
    'is_weekend_1',
    'isHoliday_0',
    'isHoliday_1',
]
df_train = df_train.drop(columns=column_to_drop)
df_test = df_test.drop(columns=column_to_drop)

# Both frames should report the same number of columns
print(df_train.shape, df_test.shape)

(18507, 90) (4689, 90)

# Shorter names for the weather-situation dummy columns
weather_column_names = {
    'weather_situation_clear-day': 'clear-day',
    'weather_situation_cloudy': 'cloudy',
    'weather_situation_partly-cloudy-day': 'partly-cloudy-day',
    'weather_situation_partly-cloudy-night': 'partly-cloudy-night',
    'weather_situation_rain': 'rain',
    'weather_situation_clear-night': 'clear-night',
    'weather_situation_wind-fog-snow': 'wind-fog-snow',
}

# Day_of_Week dummies 0..6 are relabelled Monday..Sunday
weekday_column_names = {
    'Day_of_Week_0': 'Monday',
    'Day_of_Week_1': 'Tuesday',
    'Day_of_Week_2': 'Wednesday',
    'Day_of_Week_3': 'Thursday',
    'Day_of_Week_4': 'Friday',
    'Day_of_Week_5': 'Saturday',
    'Day_of_Week_6': 'Sunday',
}

# Apply the same two renames, in place, to the train and the test frame
for frame in (df_train, df_test):
    frame.rename(columns=weather_column_names, inplace=True)
    frame.rename(columns=weekday_column_names, inplace=True)

# Preview the first two rows of the feature matrix.
# NOTE(review): X_train (and y_train, used further down) are never assigned in
# this file - presumably a feature/target split of df_train made in a cell
# that was not exported. Confirm and restore that step.
X_train.head(2)

Partial Dependence Plots (PDP)

# NOTE(review): plot_partial_dependence was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replaced by PartialDependenceDisplay.from_estimator) -
# confirm the installed version before re-running.
from sklearn.inspection import plot_partial_dependence

# Random forest (fixed seed) fitted on the training data; used for the
# partial dependence plots below.
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

RandomForestRegressor(random_state=0)

# Partial dependence of the fitted forest on features 1 and 3
disp1 = plot_partial_dependence(model, X_train, features=[1, 3])

# Remaining feature groups (positional column indices of X_train), one
# plot_partial_dependence call per group.
# NOTE(review): `reg` is not defined anywhere in this file; the only fitted
# estimator visible here is `model`. Presumably `model` was intended - confirm.
pdp_feature_groups = [
    [4, 5],
    [6, 7, 8],
    [9, 10, 11],
    [15, 16, 17],
    [18, 19, 20],
    [21, 22, 23],
    [24, 25, 26],
    [27, 29],
    [30, 31, 32],
    [33, 35, 36],
    [40, 41],
    [43, 44, 45],
    [46, 47, 48],
    [49, 50, 51],
    [52, 53, 54],
    [55, 56, 57],
]
for feature_group in pdp_feature_groups:
    disp1 = plot_partial_dependence(reg, X_train, features=feature_group)

Individual Conditional Expectation (ICE)

How does the prediction change when one feature changes? An Individual Conditional Expectation (ICE) plot, as its name suggests, shows how changing a single feature changes the outcome of each individual prediction (one line per prediction). It can be used for regression tasks only.

# Random forest (fixed seed) fitted on the training data; its predict method
# drives the ICE plots below.
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

RandomForestRegressor(random_state=0)

from pycebox.ice import ice, ice_plot

# ICE curves of the fitted model for two features, one plot per feature.
# The axis labels go through `pyplot` (imported at the top of the file as
# `from matplotlib import pyplot`); the previous `plt` alias is never
# imported in this file.
for ice_feature in ('urbanPopulation', 'gdp_per_capita'):
    ice_df = ice(X_train, ice_feature, model.predict, num_grid_points=100)
    ice_plot(ice_df, c='dimgray', linewidth=0.3)
    pyplot.ylabel('Log_new_cases_per_million')
    pyplot.xlabel(ice_feature)

If gdp_per_capita is higher than 20000, then ... (TODO: this observation about the ICE plot above was left unfinished.)

# ICE curves of the fitted model for the remaining features, one plot per
# feature. The axis labels go through `pyplot` (imported at the top of the
# file as `from matplotlib import pyplot`); the previous `plt` alias is never
# imported in this file.
ice_features = (
    'e1_0_action',
    'c4_4_action',
    'cvd_death_rate',
    'hospital_beds_per_thousand',
    'male_smokers',
    'c3_2_action',
)
for ice_feature in ice_features:
    ice_df = ice(X_train, ice_feature, model.predict, num_grid_points=100)
    ice_plot(ice_df, c='dimgray', linewidth=0.3)
    pyplot.ylabel('Log_new_cases_per_million')
    pyplot.xlabel(ice_feature)

# Notebook presentation helper: builds an HTML snippet with a script and a form.
# The script defines code_toggle(), which hides or shows the `div.input`
# elements and flips the `code_show` flag; it is registered to run once when
# the document is ready (code_show starts as true, so the first run hides).
# The form's blank-labelled submit button calls code_toggle() again.
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="         "></form>''')

	Days since first case	c1_0_action	c1_1_action	c1_2_action	c1_3_action	c2_0_action	c2_1_action	c2_2_action	c2_3_action	c3_0_action	c3_1_action	c3_2_action	c4_0_action	c4_1_action	c4_2_action	c4_3_action	c4_4_action	c5_0_action	c5_1_action	c5_2_action	c6_0_action	c6_1_action	c6_2_action	c6_3_action	c7_0_action	c7_1_action	c7_2_action	c8_0_action	c8_1_action	c8_2_action	c8_4_action	e1_0_action	e1_1_action	e2_0_action	e2_1_action	e2_2_action	h1_0_action	h1_1_action	h2_0_action	h2_1_action	h2_2_action	h2_3_action	h3_0_action	h3_1_action	h3_2_action	population_density	gdp_per_capita	cvd_death_rate	diabetes_prevalence	handwashing_facilities	hospital_beds_per_thousand	female_smokers	male_smokers	life_expectancy	aged_65_older_sum	humidity	temperature	windSpeed	Number of Tweet	Sentiments	urbanPopulation	healthExpenditure	clear-day	clear-night	cloudy	partly-cloudy-day	partly-cloudy-night	rain	wind-fog-snow	Monday	Tuesday	Wednesday	Thursday	Friday	Saturday	Sunday	continent_Africa	continent_Asia	continent_Europe	continent_North America	continent_Oceania	continent_South America
0	138	0	0	30	0	0	28	0	0	0	28	0	0	0	44	0	0	65	0	0	0	7	0	0	44	0	0	0	0	0	0	0	0	0	0	107	0	0	0	0	78	0	0	106	0	85.129	23313.199	370.946	9.74	86.979	6.892	22.9	37.1	76.05	14.7700	0.65	64.78	3.66	1.0	0.049127	54.084	4.053393	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
1	139	0	0	0	134	0	12	0	0	0	0	125	0	0	0	0	124	0	0	123	0	0	34	0	0	0	64	0	0	0	122	0	103	0	67	0	0	0	0	0	124	0	0	0	128	232.128	65530.537	132.235	15.84	97.400	2.000	2.7	37.0	75.49	1.7295	0.21	103.35	14.72	1.0	0.000000	100.000	4.623860	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0