import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from numpy import mean
from numpy import std
from matplotlib.pyplot import figure
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
# Columns discarded right after loading; the same list is applied to the test
# split further down.
remove_columns = ['record_date', 'CountryCode', 'Month', 'Day_of_Month']
# Read the training split and strip the discarded columns.
df_train = pd.read_csv("official_train_test_split/80-20/train_set.csv").drop(columns=remove_columns)
# Share of each weather level in percent of all rows. The value is only rendered
# when this is the last expression of a notebook cell; in a script it is discarded.
df_train['weather_situation'].value_counts().div(len(df_train)).mul(100)
# Combine the levels of a categorical variable whose share is below a threshold (e.g. < 5%)
def cut_levels(x, threshold, new_value):
    """Collapse the rare levels of a categorical Series into one label.

    The share of every level is computed as a percentage of ``len(x)``; each
    value whose level has a share strictly below ``threshold`` is overwritten
    with ``new_value``.

    Parameters
    ----------
    x : pandas.Series
        Values to recode. The Series is modified in place.
    threshold : float
        Cut-off in percent (0-100).
    new_value : object
        Label written over every rare level.

    Returns
    -------
    pandas.Series
        The same object that was passed in as ``x``, so callers can either rely
        on the in-place change or assign the result back to a DataFrame column.
    """
    percentage = x.value_counts() / len(x) * 100
    labels = percentage.index[percentage < threshold]
    # Series.isin is used instead of np.in1d, which NumPy 2.0 deprecates.
    x[x.isin(labels)] = new_value
    return x
# Collapse rare weather levels in the training data.
# cut_levels edits the Series it receives in place. Handing it
# `df_train.weather_situation` directly is chained assignment, which is not
# guaranteed to reach the DataFrame (it does not under pandas Copy-on-Write),
# so the column is copied, recoded and written back explicitly.
train_weather = df_train['weather_situation'].copy()
cut_levels(train_weather, 5, 'wind-fog-snow')
df_train['weather_situation'] = train_weather
# Apply similar steps to the test data
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")
df_test = df_test.drop(remove_columns, axis=1)
# Share of each weather level in percent; only rendered inside a notebook cell.
df_test['weather_situation'].value_counts()/len(df_test)*100 # percentage
# NOTE(review): the rare levels are determined from the test split's own
# distribution, so train and test can end up with different level sets --
# confirm this is intended.
test_weather = df_test['weather_situation'].copy()
cut_levels(test_weather, 5, 'wind-fog-snow')
df_test['weather_situation'] = test_weather
### Remove highly correlated variables:
to_drop = ['c8_3_action', 'e1_2_action', 'h1_2_action']
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)
# Encoding categorical variables
cat_columns = ['weather_situation', 'isHoliday', 'Day_of_Week', 'is_weekend', 'continent']
df_train = pd.get_dummies(df_train, columns=cat_columns, prefix=cat_columns)
df_test = pd.get_dummies(df_test, columns=cat_columns, prefix=cat_columns)
# The two splits are encoded independently, so a level that occurs in only one
# of them yields a dummy column the other split lacks. Force the test frame onto
# the training schema: indicator columns missing from test are added as 0,
# columns that exist only in test are dropped, and the column order follows train.
# NOTE(review): assumes every non-dummy column of df_train (including the
# target) is also present in df_test -- confirm.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
# Feature selection based on Boruta
column_to_drop = ['E3_Fiscal measures', 'E4_International support', 'H4_Emergency investment in healthcare',
                  'H5_Investment in vaccines', 'is_weekend_0', 'is_weekend_1', 'isHoliday_0', 'isHoliday_1']
df_train = df_train.drop(column_to_drop, axis=1)
df_test = df_test.drop(column_to_drop, axis=1)
print(df_train.shape, df_test.shape)
# Shorter names for the one-hot encoded weather columns: the
# 'weather_situation_' prefix is stripped, leaving the bare level name.
weather_levels = ['clear-day', 'cloudy', 'partly-cloudy-day', 'partly-cloudy-night',
                  'rain', 'clear-night', 'wind-fog-snow']
weather_labels = {'weather_situation_' + level: level for level in weather_levels}
# Day_of_Week_0 ... Day_of_Week_6 are renamed to Monday ... Sunday.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_labels = {'Day_of_Week_%d' % number: day for number, day in enumerate(weekday_names)}
# Apply both mappings to the training and the test frame, in place.
for frame in (df_train, df_test):
    frame.rename(columns=weather_labels, inplace=True)
    frame.rename(columns=weekday_labels, inplace=True)
# Peek at the feature matrix (only rendered inside a notebook cell).
# NOTE(review): X_train and y_train are not defined in the visible part of this
# file -- confirm where the feature/target split happens.
X_train.head(2)
# NOTE(review): newer scikit-learn releases replace plot_partial_dependence with
# PartialDependenceDisplay.from_estimator -- check the installed version.
from sklearn.inspection import plot_partial_dependence
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
# Column positions (in X_train) drawn together in one partial dependence figure.
pdp_feature_groups = [
    [1, 3],
    [4, 5],
    [6, 7, 8],
    [9, 10, 11],
    [15, 16, 17],
    [18, 19, 20],
    [21, 22, 23],
    [24, 25, 26],
    [27, 29],
    [30, 31, 32],
    [33, 35, 36],
    [40, 41],
    [43, 44, 45],
    [46, 47, 48],
    [49, 50, 51],
    [52, 53, 54],
    [55, 56, 57],
]
# Every figure uses the random forest fitted above. All calls after the first
# previously referenced `reg`, a name that is never defined in this file.
for feature_group in pdp_feature_groups:
    disp1 = plot_partial_dependence(model, X_train, features=feature_group)
# How does the prediction change when a single feature changes? An Individual
# Conditional Expectation (ICE) plot, as its name suggests, shows how a change in
# one feature changes the outcome of each individual prediction (one line per
# prediction). It can be used for regression tasks only.
# Refit the random forest whose predictions the ICE plots query.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
from pycebox.ice import ice, ice_plot
# One ICE plot per feature, with the feature on the x axis.
for ice_feature in ('urbanPopulation', 'gdp_per_capita'):
    ice_df = ice(X_train, ice_feature, model.predict, num_grid_points=100)
    ice_plot(ice_df, c='dimgray', linewidth=0.3)
    # The file imports matplotlib's `pyplot` but never binds the alias `plt`,
    # so the axis labels are set through `pyplot`.
    pyplot.ylabel('Log_new_cases_per_million')
    pyplot.xlabel(ice_feature)
# Observation: if gdp_per_capita is higher than 20000, then ...
# (this note was left unfinished in the original notebook)
# ICE plots for the remaining features of interest, one figure per feature,
# all based on the predictions of the fitted `model`.
ice_features = [
    'e1_0_action',
    'c4_4_action',
    'cvd_death_rate',
    'hospital_beds_per_thousand',
    'male_smokers',
    'c3_2_action',
]
for ice_feature in ice_features:
    ice_df = ice(X_train, ice_feature, model.predict, num_grid_points=100)
    ice_plot(ice_df, c='dimgray', linewidth=0.3)
    # The file imports matplotlib's `pyplot` but never binds the alias `plt`,
    # so the axis labels are set through `pyplot`.
    pyplot.ylabel('Log_new_cases_per_million')
    pyplot.xlabel(ice_feature)
from IPython.display import HTML
# Markup for a notebook button: the script hides every `div.input` element once
# the document is ready, and each submit of the form flips them between hidden
# and shown via code_toggle().
code_toggle_markup = '''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value=" "></form>'''
HTML(code_toggle_markup)