import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
# Columns that are dropped from both the train and the test split.
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
# Read the training split and discard those columns in one step.
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv").drop(columns=remove_columns)
### Combine level of weather situation
# Share of each weather level, in percent of all training rows.
df_train['weather_situation'].value_counts().div(len(df_train)).mul(100)
# Combine the levels of a categorical variable whose share is below a threshold
def cut_levels(x, threshold, new_value):
    """Merge the rare levels of a categorical Series into one level, in place.

    The share of every level is computed as a percentage of ``len(x)``; every
    value whose level has a share strictly below ``threshold`` is overwritten
    with ``new_value``.

    Parameters
    ----------
    x : pandas.Series
        Categorical values. The Series is modified in place.
    threshold : float
        Cut-off in percent (e.g. ``5`` means 5 %).
    new_value : object
        Replacement label written over every rare level.

    Returns
    -------
    None
    """
    percentage = x.value_counts() / len(x) * 100
    rare_labels = percentage[percentage < threshold].index
    # Series.isin replaces np.in1d, which is deprecated since NumPy 2.0;
    # .loc makes the in-place boolean-mask assignment explicit.
    x.loc[x.isin(rare_labels)] = new_value
# Merge the rare weather levels (< 5 % of rows) of the training set into
# 'wind-fog-snow'. cut_levels() mutates the Series it is given, so work on an
# explicit copy and write it back: mutating the temporary Series returned by
# `df_train.weather_situation` is not guaranteed to reach the DataFrame (it
# does not under pandas Copy-on-Write).
train_weather = df_train['weather_situation'].copy()
cut_levels(train_weather, 5, 'wind-fog-snow')
df_train['weather_situation'] = train_weather
# Apply similar step to test data
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")
df_test = df_test.drop(remove_columns, axis=1)
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage
# NOTE(review): the rare levels are recomputed from the test set's own
# frequencies, so train and test can end up with different level sets.
test_weather = df_test['weather_situation'].copy()
cut_levels(test_weather, 5, 'wind-fog-snow')
df_test['weather_situation'] = test_weather
### Remove highly correlated variables:
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
# Encoding categorical variables
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend','continent']
# Feature selection based on Boruta
column_to_drop = [
    'E3_Fiscal measures',
    'E4_International support',
    'H4_Emergency investment in healthcare',
    'H5_Investment in vaccines',
    'is_weekend_0',
    'is_weekend_1',
    'isHoliday_0',
    'isHoliday_1',
]
# Same three steps for each split: drop the correlated columns, one-hot
# encode the categorical columns, then drop the features listed above.
df_train = pd.get_dummies(
    df_train.drop(columns=to_drop), columns=cat_columns, prefix=cat_columns
).drop(columns=column_to_drop)
df_test = pd.get_dummies(
    df_test.drop(columns=to_drop), columns=cat_columns, prefix=cat_columns
).drop(columns=column_to_drop)
print(df_train.shape, df_test.shape)
# Shorten the one-hot column names: strip the 'weather_situation_' prefix and
# replace the numeric 'Day_of_Week_<n>' suffix with the weekday name.
weather_levels = ['clear-day', 'cloudy', 'partly-cloudy-day', 'partly-cloudy-night',
                  'rain', 'clear-night', 'wind-fog-snow']
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
short_names = {f'weather_situation_{level}': level for level in weather_levels}
short_names.update({f'Day_of_Week_{number}': day for number, day in enumerate(weekday_names)})
# Apply the identical mapping to both splits, in place.
for frame in (df_train, df_test):
    frame.rename(columns=short_names, inplace=True)
# Preview the first two rows of the training feature matrix.
# NOTE(review): X_train (and X_test / model used further down) is not defined
# in the visible part of this file - presumably the feature/target split and
# model-fitting cells are missing; confirm before running top to bottom.
X_train.head(2)
LIME (Local Interpretable Model-agnostic Explanations) steps away from deriving the global importance of features and instead approximates the importance of features for individual (local) predictions. It does so by taking the row (or set of data points) to be explained and generating perturbed ("fake") data points based on that row. It then calculates the similarity between each fake data point and the real row, weights the fake points by that similarity, and fits a simple interpretable model on them to approximate the effect of each feature change on the prediction.
import lime
import lime.lime_tabular

# Build the LIME explainer on the training feature matrix (regression mode).
explainer1 = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, verbose=True, mode='regression')

# Explain the prediction for one row of the test set.
i = 25
pd.DataFrame(X_test.iloc[i, :]).transpose()
# The explainer was fitted on a numpy array and LIME documents data_row as a
# 1-D numpy array, so pass the row as numpy rather than as a pandas Series.
exp = explainer1.explain_instance(X_test.iloc[i].values, model.predict, num_features=5)
exp.show_in_notebook(show_table=True)
# show_in_notebook() renders HTML, not a matplotlib plot, so saving the
# current pyplot figure would write an empty page (and `plt` was never
# imported). Ask LIME for a matplotlib figure and save that one.
fig = exp.as_pyplot_figure()
fig.set_size_inches(20, 10)
fig.set_dpi(80)
# File name follows the explained row; the original wrote LIME_3000.pdf here,
# which the row-3000 explanation later overwrote.
fig.savefig(f'Figures_high_quality/LIME_{i}.pdf')
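Colour legend of the LIME notebook view:
blue -> the feature has a negative influence on the predicted value
orange -> the feature has a positive influence on the predicted value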
# Explain the prediction for test row 250.
i = 250
pd.DataFrame(X_test.iloc[i, :]).transpose()
# The explainer was fitted on a numpy array and LIME documents data_row as a
# 1-D numpy array, so pass the row as numpy rather than as a pandas Series.
exp = explainer1.explain_instance(X_test.iloc[i].values, model.predict, num_features=5)
exp.show_in_notebook(show_table=True)
# show_in_notebook() renders HTML, not a matplotlib plot, so saving the
# current pyplot figure would write an empty page (and `plt` was never
# imported). Ask LIME for a matplotlib figure and save that one.
fig = exp.as_pyplot_figure()
fig.set_size_inches(20, 10)
fig.set_dpi(80)
fig.savefig(f'Figures_high_quality/LIME_{i}.pdf')
# Explain the prediction for test row 2500.
i = 2500
pd.DataFrame(X_test.iloc[i, :]).transpose()
# The explainer was fitted on a numpy array and LIME documents data_row as a
# 1-D numpy array, so pass the row as numpy rather than as a pandas Series.
exp = explainer1.explain_instance(X_test.iloc[i].values, model.predict, num_features=5)
exp.show_in_notebook(show_table=True)
# show_in_notebook() renders HTML, not a matplotlib plot, so saving the
# current pyplot figure would write an empty page (and `plt` was never
# imported). Ask LIME for a matplotlib figure and save that one.
fig = exp.as_pyplot_figure()
fig.set_size_inches(20, 10)
fig.set_dpi(80)
fig.savefig(f'Figures_high_quality/LIME_{i}.pdf')
# Explain the prediction for test row 3000.
i = 3000
pd.DataFrame(X_test.iloc[i, :]).transpose()
# The explainer was fitted on a numpy array and LIME documents data_row as a
# 1-D numpy array, so pass the row as numpy rather than as a pandas Series.
exp = explainer1.explain_instance(X_test.iloc[i].values, model.predict, num_features=5)
exp.show_in_notebook(show_table=True)
# show_in_notebook() renders HTML, not a matplotlib plot, so saving the
# current pyplot figure would write an empty page (and `plt` was never
# imported). Ask LIME for a matplotlib figure and save that one.
fig = exp.as_pyplot_figure()
fig.set_size_inches(20, 10)
fig.set_dpi(80)
fig.savefig(f'Figures_high_quality/LIME_{i}.pdf')
# Explain the prediction for test row 4688 with the ten strongest features
# (shown in the notebook only, nothing is written to disk).
i = 4688
pd.DataFrame(X_test.iloc[i, :]).transpose()
# The explainer was fitted on a numpy array and LIME documents data_row as a
# 1-D numpy array, so pass the row as numpy rather than as a pandas Series.
exp = explainer1.explain_instance(X_test.iloc[i].values, model.predict, num_features=10)
exp.show_in_notebook(show_table=True)
from IPython.display import HTML

# Notebook widget: a script plus a one-button form that toggles the visibility
# of every element matching `div.input`. code_toggle() also runs once when the
# document is ready, which hides those elements initially.
code_toggle_markup = '''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value=" "></form>'''
HTML(code_toggle_markup)