eRum_2018/examples/regression_boston.Rmd at master · woobe/eRum_2018

History

260 lines (192 loc) · 4.79 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

---

title: "Regression Example: Boston Housing"

author: "Jo-fai (Joe) Chow - [email protected]"

date: "H2O + LIME Workshop at eRum 2018 (Updated for MilanoR Workshop)"

output:
  html_document:
    df_print: kable
    fig_height: 10
    fig_width: 14
    highlight: tango
    number_sections: yes
    theme: spacelab
    toc: yes
    toc_depth: 2

---

```{r setup, include=FALSE}

# Default chunk option for the whole document: show each chunk's source code
knitr::opts_chunk$set(echo = TRUE)

```

# Get Ready

```{r, message=FALSE}

# Libraries

library(h2o) # for H2O Machine Learning

library(lime) # for Machine Learning Interpretation

library(mlbench) # for Datasets

```

```{r}

# Random seed passed to every seeded H2O call in this document
# (swap in your own lucky number)
n_seed <- 12345

```

# Data Prep - Boston Housing

```{r}

# Load the BostonHousing data set (made available by library(mlbench))
data("BostonHousing")

# Quick look: dimensions, then the first few rows
dim(BostonHousing)
head(BostonHousing)

```

## Define Target and Features

```{r}

# Response column: median house value
target <- "medv"

# Every other column of the data frame is used as a predictor
features <- setdiff(names(BostonHousing), target)
print(features)

```

## Convert R dataframe into H2O dataframe (JVM)

```{r}

# Bring up a local H2O cluster (runs in a JVM)
h2o.init()

# Progress bars clutter the knitted R Markdown output, so turn them off
h2o.no_progress()

# Optional: clear out anything left on the cluster by a previous session
h2o.removeAll()

```

```{r}

# Copy the R data frame into the H2O cluster as an H2O frame
h_boston <- as.h2o(BostonHousing)

# Preview the H2O frame itself (the local R data frame was already shown
# in the data prep step) to confirm the conversion worked
head(h_boston)

```

```{r}

# Seeded random partition of the H2O frame into two pieces
h_split <- h2o.splitFrame(h_boston, ratios = 0.75, seed = n_seed)

# First piece (75%) is used for modelling, second piece (25%) for evaluation
h_train <- h_split[[1]]
h_test <- h_split[[2]]

```

# Build H2O Models

## Single Model - Default H2O GBM

```{r}

# Baseline: a single H2O GBM left at its default settings
model_gbm <- h2o.gbm(
  x = features,
  y = target,
  training_frame = h_train,
  model_id = "gbm_default_reg",
  seed = n_seed
)

# Show the fitted model summary
print(model_gbm)

```

## Evaluate Single Model

```{r}

# Score the default GBM on the held-out test frame
h2o.performance(model = model_gbm, newdata = h_test)

```

## H2O AutoML: Multiple H2O Models + Stacked Ensemble

```{r}

# Let H2O AutoML train a collection of models and build Stacked Ensembles
# from them. You bound the search by telling H2O ...
#   1) how much time you have (max_runtime_secs), and/or
#   2) how many models you want (max_models).
# Note: the H2O deep learning algo on multi-core is stochastic, so results
# can differ between runs.
model_automl <- h2o.automl(
  x = features,
  y = target,
  training_frame = h_train,
  nfolds = 5,                # k-fold cross-validation
  max_runtime_secs = 120,    # time budget in seconds
  max_models = 100,          # upper bound on the number of models
  stopping_metric = "RMSE",  # metric monitored for early stopping
  project_name = "automl_reg",
  exclude_algos = NULL,      # name algorithms here to leave them out
  seed = n_seed
)

```

## AutoML Leaderboard

```{r}

# Show the AutoML leaderboard (the models produced by the run above)
model_automl@leaderboard

```

## Best Model (Single / Stacked Ensemble)

```{r}

# H2O: Model Leader
# The best model from the AutoML run; it can be either an individual model
# or a stacked ensemble
model_automl@leader

```

## Evaluate Performance

```{r}

# Reference point: test-set metrics of the default GBM
h2o.performance(model = model_gbm, newdata = h_test)

```

```{r}

# Test-set metrics of the AutoML leader (lower RMSE = better)
h2o.performance(model = model_automl@leader, newdata = h_test)

```

## Make Predictions (Optional)

```{r}

# Predict on the test frame with the AutoML leader, then preview the result
yhat_test <- h2o.predict(object = model_automl@leader, newdata = h_test)
head(yhat_test)

```

## Export Models (Optional)

- Use `h2o.saveModel()` to save model to disk

- Use `h2o.loadModel()` to re-load model

- Also see `h2o.download_mojo()` and `h2o.download_pojo()`

```{r, eval=FALSE}

# Write the AutoML leader to the ./models/ directory
# (force = TRUE replaces a previously saved copy)
h2o.saveModel(
  object = model_automl@leader,
  path = "./models/",
  force = TRUE
)

```

# Explain the Model

## Step 1: Create an `explainer`

```{r}

# Build the LIME explainer from the training predictors (pulled out of H2O
# into a plain R data frame) and the AutoML leader
explainer <- lime::lime(
  x = as.data.frame(h_train[, features]),
  model = model_automl@leader
)

```

## Step 2: Turn `explainer` into `explanations`

```{r}

# Pull a single test observation into an R data frame
# (replace `1` with the index of whichever row you want explained)
d_samp <- as.data.frame(h_test[1, features])

```

```{r}

# Give the row a specific name (for better visualization)
row.names(d_samp) <- "Sample 1"

```

```{r}

# Create explanations for the sampled row.
# n_features is derived from `features` rather than hard-coded to 13, so the
# chunk keeps working when BostonHousing is swapped for a data set with a
# different number of predictors. Lower it to look at only the top x features.
explanations <- lime::explain(
  x = d_samp,
  explainer = explainer,
  n_permutations = 5000,
  feature_select = "auto",
  n_features = length(features)
)

```

## Look at Explanations (Bar Chart)

```{r}

# Plot the explanations as a bar chart, laid out in a single column
lime::plot_features(explanations, ncol = 1)

```

## Look at Explanations (Full Table)

```{r}

# Reorder the rows so the largest feature weights come first
explanations <- explanations[
  order(explanations[["feature_weight"]], decreasing = TRUE),
]

```

```{r}

# Print the full explanations table
print(explanations)

```

# Try it Yourself

Replace `BostonHousing` with your own data. Good luck!

# Session Info

```{r}

# Report the R session details (R version, attached packages)
sessionInfo()

```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

regression_boston.Rmd

Latest commit

History

regression_boston.Rmd

File metadata and controls