import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
# Read the training split from disk.
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv")

# Columns removed from the frame; the list is kept in `remove_columns` so the
# same columns can be dropped from other frames later in the script.
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
df_train = df_train.drop(columns=remove_columns)

# Percentage of rows per weather_situation level. The result is not stored;
# it is only shown when this runs interactively (e.g. as a notebook cell).
df_train['weather_situation'].value_counts().div(len(df_train)).mul(100)
# Combine levels of a categorical variable whose share of rows is below a threshold
def cut_levels(x, threshold, new_value):
    """Merge rare levels of a categorical Series into a single level, in place.

    Every level whose share of rows (in percent, relative to ``len(x)``) is
    strictly below ``threshold`` is overwritten with ``new_value``.

    Parameters
    ----------
    x : pandas.Series
        Categorical values; modified in place.
    threshold : float
        Percentage cut-off (0-100). Levels with a share below it are merged.
    new_value : object
        Replacement label written over every rare level.

    Returns
    -------
    None
        The Series passed as ``x`` is mutated directly.
    """
    # Share of each level in percent; value_counts() skips missing values but
    # len(x) counts them, so shares are relative to all rows.
    percentage = x.value_counts() / len(x) * 100
    rare_labels = percentage.index[percentage < threshold]
    # Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    x.loc[x.isin(rare_labels)] = new_value
# Merge rare weather levels (< 5 % of rows) in the training data.
# The column is copied, modified and assigned back explicitly: mutating
# `df_train.weather_situation` through attribute access is chained assignment,
# which can raise SettingWithCopyWarning and does not update the frame at all
# when pandas copy-on-write is active.
train_weather = df_train['weather_situation'].copy()
cut_levels(train_weather, 5, 'wind-fog-snow')
df_train['weather_situation'] = train_weather

# Apply similar steps to the test data
# NOTE(review): rare levels are chosen from the test set's own frequencies, so
# the merged levels may differ from those merged in the training set - confirm.
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")
df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage
test_weather = df_test['weather_situation'].copy()
cut_levels(test_weather, 5, 'wind-fog-snow')
df_test['weather_situation'] = test_weather

### Remove highly correlated variables:
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)

# Encoding categorical variables (one-hot, one column per level)
# NOTE(review): train and test are encoded independently; a level present in
# only one split would give the two frames different columns - verify.
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix = cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix = cat_columns)

# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare'
                  , 'H5_Investment in vaccines', 'is_weekend_0','is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)
print(df_train.shape, df_test.shape)
# Shorter names for the one-hot encoded weather_situation columns.
_weather_column_names = {
    'weather_situation_clear-day': 'clear-day',
    'weather_situation_cloudy': 'cloudy',
    'weather_situation_partly-cloudy-day': 'partly-cloudy-day',
    'weather_situation_partly-cloudy-night': 'partly-cloudy-night',
    'weather_situation_rain': 'rain',
    'weather_situation_clear-night': 'clear-night',
    'weather_situation_wind-fog-snow': 'wind-fog-snow',
}

# Weekday names for the one-hot encoded Day_of_Week columns (0 -> Monday).
_weekday_column_names = {
    'Day_of_Week_0': 'Monday',
    'Day_of_Week_1': 'Tuesday',
    'Day_of_Week_2': 'Wednesday',
    'Day_of_Week_3': 'Thursday',
    'Day_of_Week_4': 'Friday',
    'Day_of_Week_5': 'Saturday',
    'Day_of_Week_6': 'Sunday',
}

# Rename both frames in place with the same mappings so their column names
# stay in sync; keys that are absent from a frame are ignored by rename().
for _frame in (df_train, df_test):
    _frame.rename(columns=_weather_column_names, inplace=True)
    _frame.rename(columns=_weekday_column_names, inplace=True)
# NOTE(review): X_train, y_train and X_test are not defined anywhere in the
# visible part of this file - presumably they come from splitting df_train /
# df_test into features and target; confirm before running.
X_train.head(2)
import shap
# Fit a random forest (fixed seed) whose predictions are explained below.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
print(shap_values.shape)
# Show the feature values of the row explained below. This previously showed
# X_train.iloc[0], which is not the X_test row passed to force_plot.
pd.DataFrame(X_test.iloc[0,:]).transpose()
shap.initjs()
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])
# The above explanation shows how each feature contributes to pushing the model output from the base value to the final prediction. Features pushing the prediction higher are shown in red; those pushing the prediction lower are shown in blue. The prediction starts from the baseline.
# The baseline for Shapley values is the average of all predictions. The base value here is 0.4597, while our predicted value is 0.61. Month=7 has the largest positive impact (increasing the prediction), whereas e1_0_action has the largest negative impact (decreasing the prediction).
# Features pushing the prediction higher are shown in red; those pushing the prediction lower are shown in blue.
# Test row 30: show its feature values as a one-row frame, then its force plot
# (use matplotlib=True to avoid Javascript).
X_test.iloc[30].to_frame().T
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[30], X_test.iloc[30])

# Test row 300: feature values followed by the force plot.
X_test.iloc[300].to_frame().T
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[300], X_test.iloc[300])

# Test row 3000: feature values followed by the force plot.
X_test.iloc[3000].to_frame().T
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[3000], X_test.iloc[3000])

# Load the JS visualization code into the notebook.
shap.initjs()
# Visualize the explanations for every row of X_test in a single plot.
shap.force_plot(explainer.expected_value, shap_values, X_test)