1 from collections import namedtuple
2 from copy import copy
3 from logging import getLogger
4 from warnings import warn
5
6 import numpy as np
7
8 from scipy.sparse import issparse
9
10 from sklearn import clone
11 from sklearn.base import RegressorMixin, is_classifier
12 from sklearn.linear_model import LinearRegression
13 from sklearn.metrics import mean_squared_error
14 from sklearn.preprocessing import StandardScaler
15 from sklearn.tree import BaseDecisionTree, _tree
16 from sklearn.tree._classes import DTYPE
17 from sklearn.tree._tree import DOUBLE
18 from sklearn.utils import check_array
19 from sklearn.utils.validation import check_is_fitted
20 from sklearn import __version__ as sklearn_version
21
22 from m5py.linreg_utils import linreg_model_to_text, DeNormalizableMixIn, DeNormalizableLinearRegression
23
24 from packaging.version import Version
25
26 SKLEARN_VERSION = Version(sklearn_version)
27 SKLEARN13_OR_GREATER = SKLEARN_VERSION >= Version("1.3.0")
28
29
30 __all__ = ["M5Base", "M5Prime"]
31
32
33 _SmoothingDetails = namedtuple("_SmoothingDetails", ("A", "B", "C"))
34 # Internal structure to contain the recursive smoothed coefficients details in the smoothing algorithm
35
36
37 logger = getLogger("m5p")
38
39
40 class M5Base(BaseDecisionTree):
41 """
42 M5Base. Implements base routines for generating M5 PredictionModel trees and rules.
43
44 The original algorithm M5 was invented by Quinlan:
45
46 - Quinlan J. R. (1992). Learning with continuous classes. Proceedings of the
47 Australian Joint Conference on Artificial Intelligence. 343--348. World
48 Scientific, Singapore.
49
50 Yong Wang and Ian Witten made improvements and created M5':
51
52 - Wang, Y and Witten, I. H. (1997). Induction of model trees for predicting
53 continuous classes. Proceedings of the poster papers of the European
54 Conference on Machine Learning. University of Economics, Faculty of
55 Informatics and Statistics, Prague.
56
57 Pruning and Smoothing can be activated and deactivated on top of the base
58 model. TODO check if 'rules' should be supported too
59
60 Inspired by Weka (https://github.com/bnjmn/weka) M5Base class, from Mark Hall
61
62 Attributes
63 ----------
64 criterion : str, default="friedman_mse"
65 M5 suggests using the standard deviation (RMSE impurity) instead of the
66 variance (MSE impurity) or absolute deviation (MAE impurity) used in CART.
67 According to the M5' paper, all three criteria are equivalent.
68 splitter : default="best"
69 M5 suggests taking the feature leading to the best gain in the criterion.
70 max_depth : default None
71 This is not used in the original M5 article, hence default=None.
72 min_samples_split : int, default=4
73 M5 suggests a value of 4 so that each leaf will have at least 2 samples.
74 min_samples_leaf : int, default=2
75 M5' suggests adding this explicitly to avoid zero variance in leaves and
76 n <= p situations for constant models.
77 min_weight_fraction_leaf : float, default=0.0
78 This is not used in the original M5 article, hence default=0.0.
79 TODO this would actually maybe be better than min_sample_leaf ?
80 max_features : default None
81 This is not used in the original M5 article, hence default is None.
82 max_leaf_nodes : int, default None
83 This is not used in the original M5 article, hence default is None.
84 min_impurity_decrease : float, default=0.0
85 This is not used in the original M5 article, hence the default is 0.0.
86 class_weight : default None
87 This is not used (?)
88 leaf_model : RegressorMixin
89 The regression model used in the leaves. This instance will be cloned
90 for each leaf.
91 use_pruning : bool
92 If False, pruning will be disabled.
93 use_smoothing : bool or str {'installed', 'on_prediction'}, default None
94 None and True mean 'installed' by default, except if smoothing_constant
95 or smoothing_constant_ratio is 0.0.
96 smoothing_constant: float, default None
97 The smoothing constant k defined in the original M5 article, used as the
98 weight for each parent model in the recursive weighting process.
99 If None, the default value from the paper (k=15) will be used.
100 smoothing_constant_ratio: float, default None
101 An alternate way to define the smoothing constant, as a ratio of the
102 total number of training samples. The resulting smoothing constant will
103 be smoothing_constant_ratio * n where n is the number of samples.
104 Note that this may not be an integer.
105 debug_prints: bool, default False
106 A boolean to enable debug prints
107 ccp_alpha: float, default 0.0
108 TODO is this relevant ? does that conflict with "use_pruning" ?
109 random_state : None, int, or RandomState, default=None
110 See `sklearn.tree.DecisionTreeRegressor.random_state`.
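
Examples
--------
A minimal, illustrative usage sketch (assuming the concrete `M5Prime`
subclass defined in this module is exposed at the package root)::

    import numpy as np
    from m5py import M5Prime

    X = np.linspace(0, 10, 64).reshape(-1, 1)
    y = np.sin(X).ravel() + X.ravel()
    reg = M5Prime(use_smoothing=False)
    reg.fit(X, y)
    print(reg.as_pretty_text())
    y_pred = reg.predict(X[:5])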
111 """
112
113 def __init__(
114 self,
115 criterion="friedman_mse",
116 splitter="best",
117 max_depth=None,
118 min_samples_split=4,
119 min_samples_leaf=2,
120 min_weight_fraction_leaf=0.0,
121 max_features=None,
122 max_leaf_nodes=None,
123 min_impurity_decrease=0.0,
124 class_weight=None,
125 leaf_model=None,
126 use_pruning=True,
127 use_smoothing=None,
128 smoothing_constant=None,
129 smoothing_constant_ratio=None,
130 debug_prints=False,
131 ccp_alpha=0.0,
132 random_state=None,
133 ):
134
135 # TODO the paper suggests to do this with 5% of total std
136 # min_impurity_split = min_impurity_split_as_initial_ratio * initial_impurity
137
138 super(M5Base, self).__init__(
139 criterion=criterion,
140 splitter=splitter,
141 max_depth=max_depth,
142 min_samples_split=min_samples_split,
143 min_samples_leaf=min_samples_leaf,
144 min_weight_fraction_leaf=min_weight_fraction_leaf,
145 max_features=max_features,
146 max_leaf_nodes=max_leaf_nodes,
147 min_impurity_decrease=min_impurity_decrease,
148 random_state=random_state,
149 class_weight=class_weight,
150 ccp_alpha=ccp_alpha,
151 )
152
153 # warning : if the field names are different from constructor params,
154 # then clone(self) will not work.
155 if leaf_model is None:
156 # to handle case when the model is learnt on normalized data and
157 # we wish to be able to read the model equations.
158 leaf_model = DeNormalizableLinearRegression()
159 self.leaf_model = leaf_model
160
161 # smoothing related
162 if smoothing_constant_ratio is not None and smoothing_constant is not None:
163 raise ValueError("Only one of `smoothing_constant` and `smoothing_constant_ratio` should be provided")
164 elif (smoothing_constant_ratio == 0.0 or smoothing_constant == 0) and (
165 use_smoothing is True or use_smoothing == "installed"
166 ):
167 raise ValueError(
168 "`use_smoothing` was explicitly enabled with "
169 "pre-installed models, while smoothing "
170 "constant/ratio are explicitly set to zero"
171 )
172
173 self.use_pruning = use_pruning
174 self.use_smoothing = use_smoothing
175 self.smoothing_constant = smoothing_constant
176 self.smoothing_constant_ratio = smoothing_constant_ratio
177
178 self.debug_prints = debug_prints
179
180 def as_pretty_text(self, **kwargs):
181 """
182 Returns a multi-line representation of this decision tree, using
183 `export_text_m5`.
184
185 :return: a multi-line string representing this decision tree
186 """
187 from m5py.export import export_text_m5
188 return export_text_m5(self, out_file=None, **kwargs)
189
190 def fit(self, X, y: np.ndarray, sample_weight=None, check_input=True, X_idx_sorted="deprecated"):
191 """Fit a M5Prime model.
192
193 Parameters
194 ----------
195 X : numpy.ndarray of shape (n_samples, n_features), the training input samples
196 y : numpy.ndarray of shape (n_samples,), the target values
197 sample_weight : optional array of sample weights, passed to the underlying tree builder
198 check_input : bool (default True), whether to validate the input arrays
199 X_idx_sorted : deprecated and ignored, kept for backwards compatibility
200
201 Returns
202 -------
203 self : the fitted estimator
204 """
205 # (0) smoothing default values behaviour
206 if self.smoothing_constant_ratio == 0.0 or self.smoothing_constant == 0:
207 self.use_smoothing = False
208 elif self.use_smoothing is None or self.use_smoothing is True:
209 # default behaviour
210 if isinstance(self.leaf_model, LinearRegression):
211 self.use_smoothing = "installed"
212 else:
213 self.use_smoothing = "on_prediction"
214
215 # Finally make sure we are ok now
216 if self.use_smoothing not in [False, np.bool_(False), "installed", "on_prediction"]:
217 raise ValueError("use_smoothing: Unexpected value: %s, please report it as an issue." % self.use_smoothing)
218
219
220 # (1) Build the initial tree as usual
221
222 # Get the correct fit method name based on the sklearn version used
223 fit_method_name = "_fit" if SKLEARN13_OR_GREATER else "fit"
224
225 fit_method = getattr(super(M5Base, self), fit_method_name)
226 fit_method(X, y, sample_weight=sample_weight, check_input=check_input)
227
229 if self.debug_prints:
230 logger.debug("(debug_prints) Initial tree:")
231 logger.debug(self.as_pretty_text(node_ids=True))
232
233 # (1b) prune initial tree to take into account min impurity in splits
234 prune_on_min_impurity(self.tree_)
235
236 if self.debug_prints:
237 logger.debug("(debug_prints) Postprocessed tree:")
238 logger.debug(self.as_pretty_text(node_ids=True))
239
240 # (2) Now prune tree and replace pruned branches with linear models
241 # -- unfortunately we have to re-do this input validation
242 # step, because it also converts the input to float32.
243 if check_input:
244 X = check_array(X, dtype=DTYPE, accept_sparse="csc")
245 if issparse(X):
246 X.sort_indices()
247
248 if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
249 raise ValueError("No support for np.int64 index based " "sparse matrices")
250
251 # -- initialise the structure to contain the leaves and node models
252 self.node_models = np.empty((self.tree_.node_count,), dtype=object)
253
254 # -- Pruning requires to know the global deviation of the target
255 global_std_dev = np.nanstd(y)
256 # global_abs_dev = np.nanmean(np.abs(y))
257
258 # -- Pruning requires to know the samples that reached each node.
259 # From http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
260 # Retrieve the decision path of each sample.
261 samples_to_nodes = self.decision_path(X)
262 # * row i is to see the nodes (non-empty j) in which sample i appears.
263 # sparse row-first (CSR) format is OK
264 # * column j is to see the samples (non-empty i) that fall into that
265 # node. To do that, we need to make it CSC
266 nodes_to_samples = samples_to_nodes.tocsc()
267
268 # -- execute the pruning
269 # self.features_usage is a dict feature_idx -> nb times used, only
270 # for used features.
271 self.features_usage = build_models_and_get_pruning_info(
272 self.tree_,
273 X,
274 y,
275 nodes_to_samples,
276 self.leaf_model,
277 self.node_models,
278 global_std_dev,
279 use_pruning=self.use_pruning,
280 )
281
282 # -- cleanup to compress inner structures: only keep non-pruned ones
283 self._cleanup_tree()
284
285 if self.debug_prints:
286 logger.debug("(debug_prints) Pruned tree:")
287 logger.debug(self.as_pretty_text(node_ids=True))
288
289 if self.use_smoothing == "installed":
290 # Retrieve the NEW decision path of each sample.
291 samples_to_nodes = self.decision_path(X)
292 # * row i is to see the nodes (non-empty j) in which sample i
293 # appears. sparse row-first (CSR) format is OK
294 # * column j is to see the samples (non-empty i) that fall into
295 # that node. To do that, we need to make it CSC
296 nodes_to_samples = samples_to_nodes.tocsc()
297
298 # default behaviour for smoothing constant and ratio
299 smoothing_constant = self._get_smoothing_constant_to_use(X)
300
301 self.install_smoothing(X, y, nodes_to_samples, smoothing_constant=smoothing_constant)
302
303 if self.debug_prints:
304 logger.debug("(debug_prints) Pruned and smoothed tree:")
305 logger.debug(self.as_pretty_text(node_ids=True))
306
307 return self
308
309 def _get_smoothing_constant_to_use(self, X):
310 """
311 Returns the smoothing_constant to use for smoothing, based on current
312 settings and X data
313 """
314 if self.smoothing_constant_ratio is not None:
315 nb_training_samples = X.shape[0]
316 smoothing_cstt = self.smoothing_constant_ratio * nb_training_samples
317 if smoothing_cstt < 15:
318 warn(
319 "smoothing constant ratio %s is leading to an extremely "
320 "small smoothing constant %s because nb training samples"
321 " is %s. Clipping to 15." % (self.smoothing_constant_ratio, smoothing_cstt, nb_training_samples)
322 )
323 smoothing_cstt = 15
324 else:
325 smoothing_cstt = self.smoothing_constant
326 if smoothing_cstt is None:
327 smoothing_cstt = 15 # default from the original Quinlan paper
328
329 return smoothing_cstt
330
331 def _cleanup_tree(self):
332 """
333 Reduces the size of this object by removing from internal structures
334 all items that are not used any more (all leaves that have been pruned)
335 """
336 old_tree = self.tree_
337 old_node_modls = self.node_models
338
339 # Get all information to create a copy of the inner tree.
340 # Note: _tree.copy() is gone so we use the pickle way
341 # --- Base info: nb features, nb outputs, output classes
342 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
343 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
344 # So these remain, we are just interested in changing the node-related arrays
345 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
346
347 # --- Node info
348 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
349 dct = old_tree.__getstate__().copy()
350
351 # cleanup: only keep the nodes that are not undefined.
352 # note: this is identical to
353 # valid_nodes_indices = dct["nodes"]['left_child'] != TREE_UNDEFINED
354 valid_nodes_indices = old_tree.children_left != _tree.TREE_UNDEFINED
355 # valid_nodes_indices2 = old_tree.children_right != TREE_UNDEFINED
356 # assert np.all(valid_nodes_indices == valid_nodes_indices2)
357 new_node_count = sum(valid_nodes_indices)
358
359 # create empty new structures
360 n_shape = (new_node_count, *dct["nodes"].shape[1:])
361 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
362 v_shape = (new_node_count, *dct["values"].shape[1:])
363 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype)
364 m_shape = (new_node_count, *old_node_modls.shape[1:])
365 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype)
366
367 # Fill structures while reindexing the tree and remembering the depth
368 global next_free_id
369 next_free_id = 0
370
371 def _compress(old_node_id):
372 """
373
374 Parameters
375 ----------
376 old_node_id
377
378 Returns
379 -------
380 the depth of the subtree below this node, and the new index of this node
381
382 """
383 global next_free_id
384 new_node_id = next_free_id
385 next_free_id += 1
386
387 # use the old tree to walk
388 old_node = dct["nodes"][old_node_id]
389 left_id = old_node["left_child"]
390 right_id = old_node["right_child"]
391
392 # Create the new node with a copy of the old
393 new_nodes[new_node_id] = old_node # this is an entire row so it is probably copied already by doing so.
394 new_values[new_node_id] = dct["values"][old_node_id]
395 new_node_models[new_node_id] = copy(old_node_modls[old_node_id])
396
397 if left_id == _tree.TREE_LEAF:
398 # ... and right_id == _tree.TREE_LEAF
399 # => node_id is a leaf. Nothing to do
400 return 1, new_node_id
401 else:
402
403 left_depth, new_id_left = _compress(left_id)
404 right_depth, new_id_right = _compress(right_id)
405
406 # store the new indices
407 new_nodes[new_node_id]["left_child"] = new_id_left
408 new_nodes[new_node_id]["right_child"] = new_id_right
409
410 return 1 + max(left_depth, right_depth), new_node_id
411
412 # complete definition of the new tree
413 dct["max_depth"] = _compress(0)[0] - 1 # root node has depth 0, not 1
414 dct["node_count"] = new_node_count # new_nodes.shape[0]
415 dct["nodes"] = new_nodes
416 dct["values"] = new_values
417 new_tree.__setstate__(dct)
418
419 # Fix an issue on sklearn 0.17.1: setstate was not updating max_depth
420 # See https://github.com/scikit-learn/scikit-learn/blob/0.17.1/sklearn/tree/_tree.pyx#L623
421 new_tree.max_depth = dct["max_depth"]
422
423 # update self fields
424 self.tree_ = new_tree
425 self.node_models = new_node_models
426
427 def install_smoothing(self, X_train_all, y_train_all, nodes_to_samples, smoothing_constant):
428 """
429 Executes the smoothing procedure described in the M5 and M5P paper,
430 "once for all". This means that all models are modified so that after
431 this method has completed, each model in the tree is already a smoothed
432 model.
433
434 This has pros (prediction speed) and cons (the model equations are
435 harder to read - lots of redundancy)
436
437 It can only be done if leaf models are instances of `LinearRegression`
438 """
439 # check validity: leaf models have to support pre-computed smoothing
440 if not isinstance(self.leaf_model, LinearRegression):
441 raise TypeError(
442 "`install_smoothing` is only available if leaf "
443 "models are instances of `LinearRegression` or a "
444 "subclass"
445 )
446
447 # Select the Error metric to compute model errors (used to compare
448 # node with subtree for pruning)
449 # TODO in Weka they use RMSE, but in papers they use MAE. this should be a parameter.
450 # TODO Shall we go further and store the residuals, or a bunch of metrics? not sure
451 err_metric = root_mean_squared_error # mean_absolute_error
452
453 # --- Node info
454 # TODO we should probably not do this once in each method, but once or give access directly (no copy)
455 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
456 dct = self.tree_.__getstate__().copy()
457 old_node_models = self.node_models
458
459 m_shape = old_node_models.shape
460 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype)
461
462 def smooth__(
463 coefs_,
464 n_samples,
465 features=None,
466 parent=None, # type: _SmoothingDetails
467 parent_features=None,
468 k=smoothing_constant, #
469 ):
470 # type: (...) -> _SmoothingDetails
471 """
472 Smoothes the model coefficients or intercept `coefs_` (an array or
473 a scalar), using the smoothing results at parent node.
474
475 At each node we keep in memory three results A, B, C.
476 - the new coef at each node is A + B
477 - the recursion equations are
478 - B(n) = (k / (n_samples(n) + k)) * A(n-1) + B(n-1)
479 - C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1)
480 - A(n) = coef(n)*C(n)
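
Illustrative numbers (not from the paper): with k=15, a parent
coefficient of 2.0 and a child node with coefficient 4.0 reached by
n_samples=30, we get C(1) = 30/45, B(1) = (15/45)*2.0 and
A(1) = 4.0*(30/45), hence the smoothed coefficient
A(1) + B(1) = (30*4.0 + 15*2.0)/45, about 3.33, i.e. the usual
weighted form (n*child + k*parent) / (n + k).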
481 """
482 if parent is None:
483 # A0 = coef(0), C0 = 1, B0 = 0
484 if features is None:
485 # single scalar value
486 return _SmoothingDetails(A=coefs_, B=0, C=1)
487 else:
488 # vector
489 if len(coefs_) == 0:
490 coefs_ = np.asarray(coefs_)
491 if coefs_.shape[0] != len(features):
492 raise ValueError("nb features does not match the nb " "of coefficients")
493 return _SmoothingDetails(
494 A=coefs_,
495 B=np.zeros(coefs_.shape, dtype=coefs_.dtype),
496 C=np.ones(coefs_.shape, dtype=coefs_.dtype),
497 )
498 else:
499 # B(n) = k/(n_samples(n)+k)*A(n-1) + B(n-1)
500 # C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1)
501 Bn = (k / (n_samples + k)) * parent.A + parent.B
502 Cn = (n_samples / (n_samples + k)) * parent.C
503
504 # A(n) = coef(n) * C(n)
505 if features is None:
506 # single scalar value: easy
507 An = coefs_ * Cn
508 return _SmoothingDetails(A=An, B=Bn, C=Cn)
509 else:
510 # vector of coefs: we have to 'expand' the coefs array
511 # because the coefs at this node apply for (features)
512 # while coefs at parent node apply for (parent_features)
513 An = np.zeros(Cn.shape, dtype=Cn.dtype)
514 parent_features = np.array(parent_features)
515 features = np.array(features)
516
517 # Thanks https://stackoverflow.com/a/8251757/7262247 !
518 index = np.argsort(parent_features)
519 sorted_parents = parent_features[index]
520 sorted_index = np.searchsorted(sorted_parents, features)
521
522 features_index = np.take(index, sorted_index, mode="clip")
523 if np.any(parent_features[features_index] != features):
524 # result = np.ma.array(features_index, mask=mask)
525 raise ValueError(
526 "Internal error - please report this."
527 "One feature was found in the child "
528 "node, that was not in the parent "
529 "node."
530 )
531
532 if len(features_index) > 0:
533 An[features_index] = coefs_ * Cn[features_index]
534
535 return _SmoothingDetails(A=An, B=Bn, C=Cn)
536
537 def _smooth(
538 node_id,
539 parent_features=None,
540 parent_coefs: _SmoothingDetails = None,
541 parent_intercept: _SmoothingDetails = None,
542 ):
543 # Gather all info on this node
544 # --base regression tree
545 node_info = dct["nodes"][node_id]
546 left_id = node_info["left_child"]
547 right_id = node_info["right_child"]
548 # --additional model
549 node_model = old_node_models[node_id]
550 # --samples
551 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
552 n_samples_at_this_node = samples_at_this_node.shape[0]
553
554 # Note: should be equal to tree.n_node_samples[node_id]
555 if n_samples_at_this_node != self.tree_.n_node_samples[node_id]:
556 raise ValueError("n_samples_at_this_node: Unexpected value, please report it as an issue.")
557
558 y_true_this_node = y_train_all[samples_at_this_node]
559 X_this_node = X_train_all[samples_at_this_node, :]
560
561 # (1) smooth current node
562 parent_features = parent_features if parent_features is not None else None
563 is_constant_leaf = False
564 if left_id == _tree.TREE_LEAF and isinstance(node_model, ConstantLeafModel):
565 is_constant_leaf = True
566 node_features = ()
567 smoothed_features = parent_features if parent_features is not None else node_features
568 node_coefs = ()
569 node_intercept = dct["values"][node_id]
570
571 # Extract the unique scalar value
572 if len(node_intercept) != 1:
573 raise ValueError("node_intercept: Unexpected value: %s, please report it as an issue." % node_intercept)
574
575 node_intercept = node_intercept.item()
576
577 else:
578 # A leaf LinRegLeafModel or a split SplitNodeModel
579 node_features = node_model.features
580 smoothed_features = parent_features if parent_features is not None else node_features
581 node_coefs = node_model.model.coef_
582 node_intercept = node_model.model.intercept_
583
584 # Create a new linear regression model with smoothed coefficients
585 smoothed_sklearn_model = clone(self.leaf_model)
586 smoothed_coefs = smooth__(
587 node_coefs,
588 features=node_features,
589 n_samples=n_samples_at_this_node,
590 parent=parent_coefs,
591 parent_features=parent_features,
592 )
593 smoothed_intercept = smooth__(node_intercept, n_samples=n_samples_at_this_node, parent=parent_intercept)
594 smoothed_sklearn_model.coef_ = smoothed_coefs.A + smoothed_coefs.B
595 smoothed_sklearn_model.intercept_ = smoothed_intercept.A + smoothed_intercept.B
596
597 # Finally update the node
598 if is_constant_leaf:
599 smoothed_node_model = LinRegLeafModel(smoothed_features, smoothed_sklearn_model, None)
600 else:
601 smoothed_node_model = copy(node_model)
602 smoothed_node_model.features = smoothed_features
603 smoothed_node_model.model = smoothed_sklearn_model
604
605 # Remember the new smoothed model
606 new_node_models[node_id] = smoothed_node_model
607
608 if left_id == _tree.TREE_LEAF:
609 # If this is a leaf, update the prediction error on X
610 y_pred_this_node = smoothed_node_model.predict(X_this_node)
611 smoothed_node_model.error = err_metric(y_true_this_node, y_pred_this_node)
612
613 else:
614 # If this is a split node - recurse on each subtree
615 _smooth(
616 left_id,
617 parent_features=smoothed_features,
618 parent_coefs=smoothed_coefs,
619 parent_intercept=smoothed_intercept,
620 )
621 _smooth(
622 right_id,
623 parent_features=smoothed_features,
624 parent_coefs=smoothed_coefs,
625 parent_intercept=smoothed_intercept,
626 )
627
628 # Update the error using the same formula as the one we used
629 # in build_models_and_get_pruning_info
630 y_pred_children = predict_from_leaves_no_smoothing(self.tree_, new_node_models, X_this_node)
631 err_children = err_metric(y_true_this_node, y_pred_children)
632
633 # the total number of parameters is the sum of params in each
634 # branch PLUS 1 for the split
635 n_params_splitmodel = new_node_models[left_id].n_params + new_node_models[right_id].n_params + 1
636 smoothed_node_model.n_params = n_params_splitmodel
637 # do not adjust the error now, simply store the raw one
638 # smoothed_node_model.error = compute_adjusted_error(
639 # err_children, n_samples_at_this_node, n_params_splitmodel)
640 smoothed_node_model.error = err_children
641
642 return
643
644 # smooth the whole tree
645 _smooth(0)
646
647 # use the new node models now
648 self.node_models = new_node_models
649
650 # remember the smoothing constant installed
651 self.installed_smoothing_constant = smoothing_constant
652 return
653
654 def denormalize(self, x_scaler, y_scaler):
655 """
656 De-normalizes this model according to the provided x and y normalization scalers.
657 Currently only StandardScaler is supported.
658
659 :param x_scaler: a StandardScaler or None
660 :param y_scaler: a StandardScaler or None
661 :return:
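
Illustrative sketch (hypothetical variable names), assuming the model was
fitted on data scaled with StandardScaler::

    x_scaler = StandardScaler().fit(X_raw)
    y_scaler = StandardScaler().fit(y_raw.reshape(-1, 1))
    reg.fit(x_scaler.transform(X_raw), y_scaler.transform(y_raw.reshape(-1, 1)).ravel())
    reg.denormalize(x_scaler, y_scaler)  # model equations now in the original units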
662 """
663 # perform denormalization
664 self._denormalize_tree(x_scaler, y_scaler)
665
666 def _denormalize_tree(self, x_scaler, y_scaler):
667 """
668 De-normalizes all models in the tree
669 :return:
670 """
671 old_tree = self.tree_
672 old_node_models = self.node_models
673
674 # Get all information to create a copy of the inner tree.
675 # Note: _tree.copy() is gone so we use the pickle way
676 # --- Base info: nb features, nb outputs, output classes
677 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
678 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
679 # So these remain, we are just interested in changing the node-related arrays
680 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
681
682 # --- Node info
683 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
684 dct = old_tree.__getstate__().copy()
685
686 # create empty new structures for nodes and models
687 n_shape = dct["nodes"].shape
688 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
689 v_shape = dct["values"].shape
690 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype)
691 m_shape = old_node_models.shape
692 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype)
693
694 def _denormalize(node_id, x_scaler, y_scaler):
695 """
696 denormalizes the subtree below node `node_id`.
697
698 - `new_nodes[node_id]` is filled with a copy of the old node
699 (see old_node.dtype to see the various fields). If the node is
700 a split node, the split threshold
701 `new_nodes[node_id]['threshold']` is denormalized.
702
703 - `new_values[node_id]` is filled with a denormalized copy of the
704 old constant prediction at this node. Reminder: this constant
705 prediction is actually used only when on the leaves now, but
706 we keep it for ref.
707
708 - `new_node_models[node_id]` is filled with a copy of the old
709 model `old_node_models[node_id]`, that is denormalized if it
710 is not a constant leaf
711
712 :param node_id:
713 :return: (nothing)
714 """
715 # use the old tree to walk
716 old_node = dct["nodes"][node_id]
717 left_id = old_node['left_child']
718 right_id = old_node['right_child']
719
720 # Create the new node with a copy of the old
721 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so.
722 new_model = copy(old_node_models[node_id])
723 new_node_models[node_id] = new_model
724
725 # Create the new value by de-scaling y
726 if y_scaler is not None:
727 # Note: if this is a split node with a linear regression model
728 # the value will never be used. However to preserve consistency
729 # of the whole values structure and for debugging purposes, we
730 # choose this safe path of denormalizing ALL.
731 # TODO we could also do it at once outside of the recursive
732 # calls, but we should check for side effects
733 new_values[node_id] = y_scaler.inverse_transform(
734 dct["values"][node_id]
735 )
736 else:
737 # no denormalization: simply copy
738 new_values[node_id] = dct["values"][node_id]
739
740 if left_id == _tree.TREE_LEAF:
741 # ... and right_id == _tree.TREE_LEAF
742 # => node_id is a leaf
743 if isinstance(new_model, ConstantLeafModel):
744 # nothing to do: we already re-scaled the value
745 return
746 elif isinstance(new_model, LinRegLeafModel):
747 # denormalize model
748 new_model.denormalize(x_scaler, y_scaler)
749 return
750 else:
751 raise TypeError("Internal error - Leaves can only be "
752 "constant or linear regression")
753 else:
754 # this is a split node, denormalize each subtree
755 _denormalize(left_id, x_scaler, y_scaler)
756 _denormalize(right_id, x_scaler, y_scaler)
757
758 # denormalize the split value if needed
759 if x_scaler is not None:
760 split_feature = old_node['feature']
761 # The denormalizer requires a vector with all the features,
762 # even if we only want to denormalize one.
763 # -- put split value in a vector where it has pos 'feature'
764 old_threshold_and_zeros = np.zeros((1, self.n_features_in_), dtype=dct["nodes"]['threshold'].dtype)
765 old_threshold_and_zeros[0, split_feature] = old_node['threshold']
766 # -- denormalize the vector (2D, as the scaler expects) and retrieve value 'feature'
767 new_nodes[node_id]['threshold'] = x_scaler.inverse_transform(old_threshold_and_zeros)[0, split_feature]
768 else:
769 # no denormalization: simply copy
770 new_nodes[node_id]['threshold'] = old_node['threshold']
771
772 if isinstance(new_model, SplitNodeModel):
773 # denormalize model at split node too, even if it is not
774 # always used (depending on smoothing mode)
775 new_model.denormalize(x_scaler, y_scaler)
776 else:
777 raise TypeError("Internal error: all intermediate nodes "
778 "should be SplitNodeModel")
779
780 return
781
782 # denormalize the whole tree and put the result in (new_nodes,
783 # new_values, new_node_models) recursively
784 _denormalize(0, x_scaler, y_scaler)
785
786 # complete definition of the new tree
787 dct["nodes"] = new_nodes
788 dct["values"] = new_values
789 new_tree.__setstate__(dct)
790
791 # update the self fields
792 # self.features_usage
793 self.tree_ = new_tree
794 self.node_models = new_node_models
795
796 def compress_features(self):
797 """
798 Compresses the model and returns a vector of required feature indices.
799 This model input will then be X[:, features] instead of X.
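
Illustrative sketch (hypothetical names)::

    kept = reg.compress_features()
    y_pred = reg.predict(X[:, kept])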
800 """
801 used_features = sorted(self.features_usage.keys())
802 new_features_lookup_dct = {old_feature_idx: i for i, old_feature_idx in enumerate(used_features)}
803
804 if used_features == list(range(self.n_features_in_)):
805 # NOTHING TO DO: we need all features
806 return used_features
807
808 # Otherwise, We can compress. For this we have to create a copy of the
809 # tree because we will change its internals
810 old_tree = self.tree_
811 old_node_modls = self.node_models
812
813 # Get all information to create a copy of the inner tree.
814 # Note: _tree.copy() is gone so we use the pickle way
815 # --- Base info: nb features, nb outputs, output classes
816 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
817 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
818 # So these remain, we are just interested in changing the node-related arrays
819 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
820
821 # --- Node info
822 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
823 dct = old_tree.__getstate__().copy()
824
825 # create empty new structures for nodes and models
826 n_shape = dct["nodes"].shape
827 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
828 m_shape = old_node_modls.shape
829 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype)
830
831 def _compress_features(node_id):
832 """
833
834 Parameters
835 ----------
836 node_id
837
838 Returns
839 -------
840 nothing; the new structures are filled in place
841
842 """
843 # use the old tree to walk
844 old_node = dct["nodes"][node_id]
845 left_id = old_node["left_child"]
846 right_id = old_node["right_child"]
847
848 # Create the new node with a copy of the old
849 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so.
850 new_model = copy(old_node_modls[node_id])
851 new_node_models[node_id] = new_model
852
853 if left_id == _tree.TREE_LEAF:
854 # ... and right_id == _tree.TREE_LEAF
855 # => node_id is a leaf
856 if isinstance(new_model, ConstantLeafModel):
857 # no features used
858 return
859 elif isinstance(new_model, LinRegLeafModel):
860 # compress that model
861 new_model.reindex_features(new_features_lookup_dct)
862 return
863 else:
864 raise TypeError("Internal error - Leaves can only be constant or linear regression")
865 else:
866
867 _compress_features(left_id)
868 _compress_features(right_id)
869
870 # store the new split feature index in the node
871 new_nodes[node_id]["feature"] = new_features_lookup_dct[old_node["feature"]]
872
873 if not isinstance(new_model, SplitNodeModel):
874 raise TypeError("Internal error: all intermediate nodes should be SplitNodeModel")
875
876 # TODO now that split node models have a linear regression
877 # model too, we should update them here
878
879 return
880
881 _compress_features(0)
882
883 # complete definition of the new tree
884 dct["nodes"] = new_nodes
885 new_tree.__setstate__(dct)
886
887 # update the self fields
888 self.features_usage = {new_features_lookup_dct[k]: v for k, v in self.features_usage.items()}
889 self.tree_ = new_tree
890 self.node_models = new_node_models
891 self.n_features_in_ = len(self.features_usage)
892
893 # return the vector of used features
894 return used_features
895
896 @property
897 def feature_importances_(self):
898 """Return the feature importances (the higher, the more important).
899
900 Returns
901 -------
902 feature_importances_ : array, shape = [n_features]
903 """
904 check_is_fitted(self, "tree_")
905
906 # TODO adapt ?
907 # features = np.array([self.features_usage[k] for k in sorted(self.features_usage.keys())], dtype=int)
908 features = self.tree_.compute_feature_importances()
909
910 return features
911
912 def predict(self, X, check_input=True, smooth_predictions=None, smoothing_constant=None):
913 """Predict class or regression value for X.
914
915 For a classification model, the predicted class for each sample in X is
916 returned. For a regression model, the predicted value based on X is
917 returned.
918
919 Parameters
920 ----------
921 X : numpy.ndarray, shape (n_samples, n_features)
922 The input samples. Internally, it will be converted to
923 ``dtype=np.float32`` and if a sparse matrix is provided
924 to a sparse ``csr_matrix``.
925 check_input : boolean, (default=True)
926 Allow to bypass several input checking.
927 Don't use this parameter unless you know what you do.
928 smooth_predictions : boolean, (default=None)
929 None means "use self config"
930 smoothing_constant: int, (default=self.smoothing_constant)
931 Smoothing constant used when smooth_predictions is True. During
932 smoothing, child node models are recursively mixed with their
933 parent node models, and the mix is done using a weighted sum. The
934 weight given to the child model is the number of training samples
935 that reached its node, while the weight given to the parent model
936 is the smoothing constant. Therefore it can be seen as an
937 equivalent number of samples that parent models represent when
938 injected into the recursive weighted sum.
939
940 Returns
941 -------
942 y : array of shape = [n_samples] or [n_samples, n_outputs]
943 The predicted classes, or the predict values.
944 """
945 check_is_fitted(self, "tree_")
946
947 # If this is just a constant node only check the input's shape.
948 if self.n_features_in_ == 0:
949 # perform the input checking manually to set ensure_min_features=0
950 X = check_array(X, dtype=DTYPE, accept_sparse="csr", ensure_min_features=0)
951 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
952 raise ValueError("No support for np.int64 index based " "sparse matrices")
953 # skip it then
954 check_input = False
955
956 # validate and convert dtype
957 X = self._validate_X_predict(X, check_input)
958
959 # -------- This is the only change wrt parent class. TODO maybe rather replace self.tree_ with a proxy ---------
960 if smooth_predictions is None:
961 # Default: smooth prediction at prediction time if configured as
962 # such in the model. Note: if self.use_smoothing == 'installed',
963 # models are already smoothed models, so no need to smooth again
964 smooth_predictions = self.use_smoothing == "on_prediction"
965 else:
966 # user provided an explicit value for smooth_predictions
967 if not smooth_predictions and self.use_smoothing == "installed":
968 raise ValueError(
969 "Smoothing has been pre-installed on this "
970 "tree, it is not anymore possible to make "
971 "predictions without smoothing"
972 )
973
974 if smooth_predictions and smoothing_constant is None:
975 # default parameter for smoothing is the one defined in the model
976 # (with possible ratio)
977 smoothing_constant = self._get_smoothing_constant_to_use(X)
978
979 # Do not use the embedded tree (like in super): it has been pruned but
980 # still has constant nodes
981 # proba = self.tree_.predict(X)
982 if not smooth_predictions:
983 proba = predict_from_leaves_no_smoothing(self.tree_, self.node_models, X)
984 else:
985 proba = predict_from_leaves(self, X, smoothing=True, smoothing_constant=smoothing_constant)
986 if len(proba.shape) < 2:
987 proba = proba.reshape(-1, 1)
988 # ------------------------------------------
989
990 n_samples = X.shape[0]
991
992 # Classification
993 if is_classifier(self):
994 if self.n_outputs_ == 1:
995 return self.classes_.take(np.argmax(proba, axis=1), axis=0)
996 else:
997 predictions = np.zeros((n_samples, self.n_outputs_))
998 for k in range(self.n_outputs_):
999 predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)
1000 return predictions
1001
1002 # Regression
1003 else:
1004 if self.n_outputs_ == 1:
1005 return proba[:, 0]
1006 else:
1007 return proba[:, :, 0]
1008
1009
1010 class ConstantLeafModel:
1011 """
1012 Represents the additional information about a leaf node that is not pruned.
1013 It contains the error associated with the training samples at this node.
1014
1015 Note: the constant value is not stored here, as it is already available in
1016 the sklearn tree struct. So to know the prediction at this node, use
1017 `tree.value[node_id]`
1018 """
1019
1020 __slots__ = ("error",)
1021
1022 def __init__(self, error):
1023 self.error = error
1024
1025 @property
1026 def n_params(self) -> int:
1027 """
1028 Returns the number of parameters used by this model, including the
1029 constant (intercept) term
1030 """
1031 return 1
1032
1033 @staticmethod
1034 def predict_cstt(tree, node_id, n_samples):
1035 """
1036 This static method is a helper to get an array of constant predictions
1037 associated with this node.
1038
1039 Parameters
1040 ----------
1041 tree
1042 node_id
1043 n_samples
1044
1045 Returns
1046 -------
1047 an array with the constant prediction of this node repeated n_samples times
1048 """
1049 cstt_prediction = tree.value[node_id]
1050
1051 # cstt_prediction can be multioutput so it is an array. replicate it
1052 # note: np.tile is used instead of numpy.matlib.repmat, since numpy.matlib was removed in numpy 2.0
1053
1054 return np.tile(cstt_prediction, (n_samples, 1))
1055
1056
1057 class LinRegNodeModel(DeNormalizableMixIn):
1058 """
1059 Represents the additional information about a tree node that contains a
1060 linear regression model.
1061
1062 It contains
1063 - the features used by the model,
1064 - the scikit learn model object itself,
1065 - and the error for the training samples that reached this node
1066
1067 """
1068 __slots__ = ("features", "model", "error", "n_params")
1069
1070 def __init__(self, features, model, error) -> None:
1071 self.features = features
1072 self.model = model
1073 self.error = error
1074 self.n_params = len(self.features) + 1
1075
1076 def to_text(self, feature_names=None, target_name=None, precision=3,
1077 line_breaks=False):
1078 """ Returns a text representation of the linear regression model """
1079 return linreg_model_to_text(self.model, feature_names=feature_names,
1080 target_name=target_name,
1081 precision=precision,
1082 line_breaks=line_breaks)
1083
1084 def reindex_features(self, new_features_lookup_dct):
1085 """
1086 Reindexes the required features using the provided lookup dictionary.
1087
1088 Parameters
1089 ----------
1090 new_features_lookup_dct
1091
1092 Returns
1093 -------
1094
1095 """
1096 self.features = [new_features_lookup_dct[f] for f in self.features]
1097
1098 def denormalize(self,
1099 x_scaler: StandardScaler = None,
1100 y_scaler: StandardScaler = None
1101 ):
1102 """
1103 De-normalizes the linear model.
1104
1105 :param x_scaler:
1106 :param y_scaler:
1107 :return:
1108 """
1109 # create a clone of the x scaler with only the used features
1110 x_scaler = copy(x_scaler)
1111 if len(self.features) > 0:
1112 x_scaler.scale_ = x_scaler.scale_[self.features]
1113 x_scaler.mean_ = x_scaler.mean_[self.features]
1114 else:
1115 # in that particular case, the above expression is not working
1116 x_scaler.scale_ = x_scaler.scale_[0:0]
1117 x_scaler.mean_ = x_scaler.mean_[0:0]
1118
1119 # use the denormalize function on the internal model
1120 # TODO what if internal model is not able ?
1121 self.model.denormalize(x_scaler, y_scaler)
1122
1123 def predict(self, X: np.ndarray):
1124 """Performs a prediction for X, only using the required features."""
1125 if len(self.features) < 1 and isinstance(self.model, LinearRegression):
1126 # unfortunately LinearRegression models do not like it when no
1127 # features are needed: their input validation requires at least 1.
1128 # so we do it ourselves.
1129 return self.model.intercept_ * np.ones((X.shape[0], 1))
1130 else:
1131 return self.model.predict(X[:, self.features])
1132
1133
1134 class LinRegLeafModel(LinRegNodeModel):
1135 """
1136 Represents the additional information about a leaf node with a linear
1137 regression model
1138 """
1139 pass
1140
1141
1142 class SplitNodeModel(LinRegNodeModel):
1143 """
1144 Represents the additional information about a split node, with a linear
1145 regression model.
1146 """
1147 __slots__ = ("n_params",)
1148
1149 def __init__(self, n_params, error, features, model):
1150 super(SplitNodeModel, self).__init__(features=features, model=model, error=error)
1151 self.n_params = n_params  # set after super().__init__, which would otherwise overwrite it with len(features) + 1
1152
1153
1154 PRUNING_MULTIPLIER = 2
1155 # see https://github.com/bnjmn/weka/blob/master/weka/src/main/java/weka/classifiers/trees/m5/RuleNode.java#L124
1156 # TODO check why they use 2 instead of 1 (from the article) ??
1157
1158
1159 def root_mean_squared_error(*args, **kwargs):
1160 return np.sqrt(mean_squared_error(*args, **kwargs))
1161
1162
1163 def prune_on_min_impurity(tree):
1164 """
1165 Edits the given tree so as to prune subbranches that do not respect the min
1166 impurity criterion.
1167
1168 The paper suggests doing this with a 5% threshold, but min_impurity_split is not
1169 available in sklearn 0.17 (and the criterion is not std but MSE, so we have to square)
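
Illustrative numbers: if the root MSE impurity is 4.0 (i.e. a std of 2.0),
the threshold is 4.0 * 0.05**2 = 0.01, which corresponds to a std of 0.1,
that is 5% of the root std.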
1170
1171 Parameters
1172 ----------
1173 tree
1174
1175 Returns
1176 -------
1177
1178 """
1179 left_children = tree.children_left
1180 right_children = tree.children_right
1181 impurities = tree.impurity
1182
1183 # The paper suggests doing this with a 5% threshold, but min_impurity_split is not
1184 # available in sklearn 0.17 (and the criterion is not std but MSE, so we have to square)
1185 # TODO adapt the formula to criterion used and/or generalize with min_impurity_split_as_initial_ratio
1186 root_impurity = impurities[0]
1187 impurity_threshold = root_impurity * (0.05 ** 2)
1188
1189 def stop_on_min_impurity(node_id):
1190 # note: in the paper that is 0.05 but criterion is on std. Here
1191 # impurity is mse so a squared equivalent of std.
1192 left_id = left_children[node_id]
1193 right_id = right_children[node_id]
1194 if left_id != _tree.TREE_LEAF: # a split node
1195 if impurities[node_id] < impurity_threshold:
1196 # stop here, this will be a leaf
1197 prune_children(node_id, tree)
1198 else:
1199 stop_on_min_impurity(left_id)
1200 stop_on_min_impurity(right_id)
1201
1202 stop_on_min_impurity(0)
1203
1204
1205 def build_models_and_get_pruning_info(
1206 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model, node_models, global_std_dev, use_pruning, node_id=0
1207 ):
1208 """
1209
1210 Parameters
1211 ----------
1212 tree : Tree
1213 A tree that will be pruned on the way
1214 X_train_all
1215 y_train_all
1216 nodes_to_samples
1217 leaf_model
1218 node_models
1219 global_std_dev
1220 use_pruning
1221 node_id
1222
1223 Returns
1224 -------
1225 a dictionary where the key is the feature index and the value is
1226 the number of samples where this feature is used
1227 """
1228
1229 # Select the Error metric to compute model errors (used to compare node
1230 # with subtree for pruning)
1231 # TODO in Weka they use RMSE, but in papers they use MAE. could be a param
1232 # TODO Shall we go further and store the residuals, or a bunch of metrics?
1233 err_metric = root_mean_squared_error # mean_absolute_error
1234
1235 # Get the samples associated with this node
1236 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
1237 n_samples_at_this_node = samples_at_this_node.shape[0]
1238 y_true_this_node = y_train_all[samples_at_this_node]
1239
1240 # Is this a leaf node or a split node ?
1241 left_node = tree.children_left[node_id] # this way we do not have to query it again in the else.
1242 if left_node == _tree.TREE_LEAF:
1243 # --> Current node is a LEAF. See is_leaf(node_id, tree) if you have doubts <--
1244
1245 # -- create a linear model for this node
1246 # leaves do not have the right to use any features since they have no
1247 # subtree: keep the constant prediction
1248 y_pred_this_node = ConstantLeafModel.predict_cstt(tree, node_id, y_true_this_node.shape[0])
1249 err_this_node = err_metric(y_true_this_node, y_pred_this_node)
1250
1251 # TODO when use_pruning = False, should we allow all features to be
1252 # used instead of having to stick to the M5 rule of "only use a
1253 # feature if the subtree includes a split with this feature" ?
1254 # OR alternate proposal: should we transform the boolean use_pruning
1255 # into a use_pruning_max_nb integer to say for example "only 2 level
1256 # of pruning" ?
1257
1258 # -- store the model information
1259 node_models[node_id] = ConstantLeafModel(err_this_node)
1260
1261 # -- return an empty dict for features used
1262 return dict() # np.array([], dtype=int)
1263
1264 else:
1265 # --> Current node is a SPLIT <--
1266 right_node = tree.children_right[node_id]
1267
1268 # (1) prune left and right subtree and get some information
1269 features_l = build_models_and_get_pruning_info(
1270 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model,
1271 node_models, global_std_dev, use_pruning, node_id=left_node
1272 )
1273 features_r = build_models_and_get_pruning_info(
1274 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model,
1275 node_models, global_std_dev, use_pruning, node_id=right_node
1276 )
1277
1278 # (2) select only the samples that reach this node
1279 X_this_node = X_train_all[samples_at_this_node, :]
1280
1281 # (3) Create a model for this node
1282 # -- fit a linear regression model taking into account only the
1283 # features used in the subtrees + the split one
1284 # TODO should we normalize=True (variance scaling) or use a whole
1285 # pipeline here ? Not sure..
1286 # estimators = [('scale', StandardScaler()), ('clf', model_type())];
1287 # pipe = Pipeline(estimators)
1288 # skmodel_this_node = leaf_model_type(**leaf_model_params)
1289 skmodel_this_node = clone(leaf_model)
1290
1291 # -- old - we used to only store the array of features
1292 # selected_features = np.union1d(features_l, features_r)
1293 # selected_features = np.union1d(selected_features, tree.feature[node_id])
1294 # -- new - we also gather the nb samples where this feature is used
1295 selected_features_dct = features_l
1296 for feature_id, n_samples_feat_used in features_r.items():
1297 if feature_id in selected_features_dct.keys():
1298 selected_features_dct[feature_id] += n_samples_feat_used
1299 else:
1300 selected_features_dct[feature_id] = n_samples_feat_used
1301 selected_features_dct[tree.feature[node_id]] = n_samples_at_this_node
1302 # -- use only the selected features, in the natural integer order
1303 selected_features = sorted(selected_features_dct.keys())
1304
1305 X_train_this_node = X_this_node[:, selected_features]
1306 skmodel_this_node.fit(X_train_this_node, y_true_this_node)
1307 # -- predict and compute error
1308 y_pred_this_node = skmodel_this_node.predict(X_train_this_node)
1309 err_this_node = err_metric(y_true_this_node, y_pred_this_node)
1310 # -- create the object
1311 # TODO the paper suggest to perform recursive feature elimination in
1312 # this model until adjusted_err_model is minimal, is it same in Weka ?
1313 model_this_node = LinRegLeafModel(selected_features, skmodel_this_node, err_this_node)
1314
1315 # (4) compute adj error criterion = ERR * (n+v)/(n-v) for both models
1316 adjusted_err_model = compute_adjusted_error(err_this_node, n_samples_at_this_node, model_this_node.n_params)
1317
1318 # (5) predict and compute adj error for the combination of child models
1319 # -- Note: this is recursive so the leaves may contain linear models
1320 # already
1321 y_pred_children = predict_from_leaves_no_smoothing(tree, node_models, X_this_node)
1322 err_children = err_metric(y_true_this_node, y_pred_children)
1323
1324 # TODO the Weka implem (below) differs from the paper that suggests a
1325 # weighted sum of adjusted errors. This may be an equivalent formulation, to check.
1326 # the total number of parameters if we do not prune, is the sum of
1327 # params in each branch PLUS 1 for the split
1328 n_params_splitmodel = node_models[left_node].n_params + node_models[right_node].n_params + 1
1329 adjusted_err_children = compute_adjusted_error(err_children, n_samples_at_this_node, n_params_splitmodel)
1330
1331 # (6) compare and either prune at this node or keep the subtrees
1332 std_dev_this_node = np.nanstd(y_true_this_node)
1333 # note: the first criterion is now already checked before that
1334 # function call, in `prune_on_min_impurity`
1335 if use_pruning and (
1336 # TODO these parameters should be in the constructor
1337 # see also 2 comments about min_impurity_split_as_initial_ratio
1338 std_dev_this_node < (global_std_dev * 0.05)
1339 or (adjusted_err_model <= adjusted_err_children)
1340 or (adjusted_err_model < (global_std_dev * 0.00001)) # so this means very good R² already
1341 ):
1342 # Choose model for this node rather than subtree model
1343 # -- prune from this node on
1344 removed_nodes = prune_children(node_id, tree)
1345 node_models[removed_nodes] = _tree.TREE_UNDEFINED
1346
1347 # store the model information
1348 node_models[node_id] = model_this_node
1349
1350 # update and return the features used
1351 selected_features_dct = {k: n_samples_at_this_node for k in selected_features_dct.keys()}
1352 return selected_features_dct
1353
1354 else:
1355 # The subtrees are good or we do not want pruning: keep them.
1356 # This node will remain a split, and will only contain the digest
1357 # about the subtree
1358 node_models[node_id] = SplitNodeModel(
1359 n_params_splitmodel, err_children, selected_features, skmodel_this_node
1360 )
1361
1362 # return the features used
1363 return selected_features_dct
1364
1365
1366 def compute_adjusted_error(err, n_samples, n_parameters, multiplier=PRUNING_MULTIPLIER):
1367 """
1368 Return the penalized error obtained from `err`, the prediction error at a
1369 given tree node, by multiplying it by ``(n + multiplier * p) / (n - p)``. `n` is the number of
1370 samples at this node, `p` is the number of parameters of the model, and `multiplier` is a penalty weight (2 by default, as in the Weka implementation).
1371
1372 According to the original M5 paper, this is to penalize complex models
1373 used for few samples.
1374
1375 Note that if ``n_samples <= n_parameters`` the denominator is zero or negative.
1376 In this case an arbitrarily high penalization factor is used: ``10 * err``.
1377
1378 Parameters
1379 ----------
1380 err : float
1381 The model error for a given tree node. Typically the MAE.
1382
1383 n_samples : int
1384 The number of samples at this node.
1385
1386 n_parameters : int
1387 The number of parameters of the model at this node.
1388
1389 Returns
1390 -------
1391 penalized_mae : float
1392 The penalized error ``err * (n + multiplier * p) / (n - p)``, or ``10 * err`` if ``n <= p``.
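
For example (illustrative numbers): with ``err=1.0``, ``n_samples=20``,
``n_parameters=3`` and the default ``multiplier=2``, the factor is
``(20 + 2*3) / (20 - 3) = 26/17``, roughly 1.53.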
1393 """
1394 #
1395 if n_samples <= n_parameters:
1396 # denominator is zero or negative: use a large factor so as to penalize
1397 # a lot this overly complex model.
1398 factor = 10.0 # Caution says Yong in his code
1399 else:
1400 factor = (n_samples + multiplier * n_parameters) / (n_samples - n_parameters)
1401
1402 return err * factor
1403
1404
1405 def predict_from_leaves_no_smoothing(tree, node_models, X):
1406 """
1407 Returns the prediction for all samples in X, based on using the appropriate
1408 leaf model.
1409
1410 This function
1411 - uses tree.apply(X) to know in which tree leaf each sample in X goes
1412 - then for each of the leaves that are actually touched, uses
1413 node_models[leaf_id] to predict, for the X reaching that leaf
1414
1415 This function assumes that node_models contains non-empty LinRegLeafModel
1416 entries for all leaf nodes that will be reached by the samples X.
1417
1418 Parameters
1419 ----------
1420 tree : Tree
1421 The tree object.
1422 node_models : array-like
1423 The array containing node models
1424 X : 2D array-like
1425 The sample data.
1426
1427 Returns
1428 -------
1429 y_predicted : 1D array-like
1430 The prediction vector.
1431 """
1432 # **** This does the job, but we have one execution of model.predict() per
1433 # sample: probably not efficient
1434 # sample_ids_to_leaf_node_ids = tree.apply(X)
1435 # model_and_x = np.concatenate((node_models[leaf_node_ids].reshape(-1, 1), X), axis=1)
1436 # def pred(m_and_x):
1437 # return m_and_x[0].model.predict(m_and_x[1:].reshape(1,-1))[0]
1438 # y_predicted = np.array(list(map(pred, model_and_x)))
1439
1440 # **** This should be faster because the number of calls to predict() is
1441 # equal to the number of leaves touched
1442 sample_ids_to_leaf_node_ids = tree.apply(X)
1443 y_predicted = -np.ones(sample_ids_to_leaf_node_ids.shape, dtype=DOUBLE)
1444
1445 # -- find the unique list of leaves touched
1446 leaf_node_ids, inverse_idx = np.unique(sample_ids_to_leaf_node_ids, return_inverse=True)
1447
1448 # -- for each leaf perform the prediction for samples reaching that leaf
1449 for leaf_node_id in leaf_node_ids:
1450 # get the indices of the samples reaching that leaf
1451 sample_indices = np.nonzero(sample_ids_to_leaf_node_ids == leaf_node_id)[0]
1452
1453 # predict
1454 node_model = node_models[leaf_node_id]
1455 if isinstance(node_model, LinRegLeafModel):
1456 y_predicted[sample_indices] = np.ravel(node_model.predict(X[sample_indices, :]))
1457 else:
1458 # isinstance(node_model, ConstantLeafModel)
1459 y_predicted[sample_indices] = tree.value[leaf_node_id]
1460
1461 # **** Recursive strategy: not used anymore
1462 # left_node = tree.children_left[node_id] # this way we do not have to query it again in the else.
1463 # if left_node == _tree.TREE_LEAF:
1464 # # --> Current node is a LEAF. See is_leaf(node_id, tree) <--
1465 # y_predicted = node_models[node_id].model.predict()
1466 # else:
1467 # # --> Current node is a SPLIT <--
1468 # right_node = tree.children_right[node_id]
1469 #
1470 #
1471 # samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
1472 # y_true_this_node = y_train_all[samples_at_this_node]
1473 # # As surprising as it may seem, in numpy [samples_at_this_node, selected_features] does something else.
1474 # X_train_this_node = X_train_all[samples_at_this_node, :][:, selected_features]
1475 #
1476 # X_left
1477
1478 return y_predicted
1479
1480
1481 def predict_from_leaves(m5p, X, smoothing=True, smoothing_constant=15):
1482 """
1483 Predicts using the M5P tree, without using the compiled sklearn
1484 `tree.apply` subroutine.
1485
1486 The main purpose of this function is to apply smoothing to a M5P model tree
1487 where smoothing has not been pre-installed on the models. For examples to
1488 enable a model to be used both without and with smoothing for comparisons
1489 purposes, or for models whose leaves are not Linear Models and therefore
1490 for which no pre-installation method exist.
1491
1492 Note: this method is slower than `predict_from_leaves_no_smoothing` when
1493 `smoothing=False`.
1494
1495 Parameters
1496 ----------
1497 m5p : M5Prime
1498 The model to use for prediction
1499 X : array-like
1500 The input data
1501 smoothing : bool
1502 Whether to apply smoothing
1503 smoothing_constant : int
1504 The smoothing constant `k`, used as the weight given to the parent node
1505 model's prediction (k=15 in the original articles).
1506
1507
1508 Returns
1509 -------
1510 y_predicted : 2D array-like of shape (n_samples, 1). The prediction vector (smoothed if `smoothing=True`).
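
Notes
-----
Smoothing blends the child prediction with the parent model's prediction
recursively along the path from the leaf up to the root. A small numeric
illustration of the update rule applied at each step (here `n` is the number
of training samples at the child node and `k` the smoothing constant):

>>> n, k = 30, 15
>>> y_child, y_parent = 2.0, 5.0
>>> (n * y_child + k * y_parent) / (n + k)
3.0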
1511 """
1512 # validate and converts dtype just in case this was directly called
1513 # e.g. in unit tests
1514 X = m5p._validate_X_predict(X, check_input=True)
1515
1516 tree = m5p.tree_
1517 node_models = m5p.node_models
1518 nb_samples = X.shape[0]
1519 y_predicted = -np.ones((nb_samples, 1), dtype=DOUBLE)
1520
1521 # sample_ids_to_leaf_node_ids = tree.apply(X)
1522 def smooth_predictions(ancestor_nodes, X_at_node, y_pred_at_node):
1523 # note: y_pred_at_node can be a constant (e.g. for ConstantLeafModel leaves)
1524 current_node_model_id = ancestor_nodes[-1]
1525 for _i, parent_model_id in enumerate(reversed(ancestor_nodes[:-1])):
1526 # warning: this is the nb of TRAINING samples at the current (child) node
1527 node_nb_train_samples = tree.n_node_samples[current_node_model_id]
1528 parent_model = node_models[parent_model_id]
1529 parent_predictions = parent_model.predict(X_at_node)
1530 y_pred_at_node = (node_nb_train_samples * y_pred_at_node + smoothing_constant * parent_predictions) / (
1531 node_nb_train_samples + smoothing_constant
1532 )
1533 current_node_model_id = parent_model_id
1534
1535 return y_pred_at_node
1536
1537 def apply_prediction(node_id, ids=None, ancestor_nodes=None):
1538 first_call = False
1539 if ids is None:
1540 ids = slice(None)
1541 first_call = True
1542 if smoothing:
1543 if ancestor_nodes is None:
1544 ancestor_nodes = [node_id]
1545 else:
1546 ancestor_nodes.append(node_id)
1547
1548 left_id = tree.children_left[node_id]
1549 if left_id == _tree.TREE_LEAF:
1550 # ... and tree.children_right[node_id] == _tree.TREE_LEAF
1551 # LEAF node: predict using this node's model
1553 node_model = node_models[node_id]
1554 # assert (ids == (sample_ids_to_leaf_node_ids == node_id)).all()
1555 if isinstance(node_model, LinRegLeafModel):
1556 X_at_node = X[ids, :]
1557 predictions = node_model.predict(X_at_node)
1558 else:
1559 # isinstance(node_model, ConstantLeafModel)
1560 predictions = tree.value[node_id]
1561 if smoothing:
1562 X_at_node = X[ids, :]
1563
1564 # finally apply smoothing
1565 if smoothing:
1566 y_predicted[ids] = smooth_predictions(ancestor_nodes, X_at_node, predictions)
1567 else:
1568 y_predicted[ids] = predictions
1569 else:
1570 right_id = tree.children_right[node_id]
1571 # non-leaf node: split samples and recurse
1572 left_group = np.zeros(nb_samples, dtype=bool)
1573 left_group[ids] = X[ids, tree.feature[node_id]] <= tree.threshold[node_id]
1574 right_group = (~left_group) if first_call else (ids & (~left_group))
1575
1576 # important: copy ancestor_nodes BEFORE calling anything, otherwise
1577 # it will be modified
1578 apply_prediction(
1579 left_id, ids=left_group, ancestor_nodes=(ancestor_nodes.copy() if ancestor_nodes is not None else None)
1580 )
1581 apply_prediction(right_id, ids=right_group, ancestor_nodes=ancestor_nodes)
1582
1583 # recurse to fill all predictions
1584 apply_prediction(0)
1585
1586 return y_predicted
1587
1588
1589 def prune_children(node_id, tree):
1590 """
1591 Prunes the children of node_id in the given `tree`.
1592
1593 Inspired by https://github.com/shenwanxiang/sklearn-post-prune-tree/blob/master/tree_prune.py#L122
1594
1595 IMPORTANT: this relies on the fact that `children_left` and `children_right`
1596 are modifiable (and are the only attributes we need to modify to fix the
1597 tree). This seems to be the case as of now.
1598
1599 Parameters
1600 ----------
1601 node_id : int
1602 tree : Tree
1603
1604 Returns
1605 -------
1606 removed_nodes : List[int]
1607 A list of removed nodes
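
Examples
--------
A usage sketch (assuming `reg` is an already-fitted M5Prime estimator and
node id 3 is an internal node of `reg.tree_`; both are illustrative only):

>>> removed = prune_children(3, reg.tree_)  # doctest: +SKIP
>>> # node 3 has now become a leaf; `removed` lists its former descendants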
1608 """
1609
1610 def _prune_below(_node_id):
1611 left_child = tree.children_left[_node_id]
1612 right_child = tree.children_right[_node_id]
1613 if left_child == _tree.TREE_LEAF:
1614 # _node_id is a leaf: left_child & right_child say "leaf". Nothing to do
1615 return []
1616 else:
1617 # Make sure everything is pruned below: children should be leaves
1618 removed_l = _prune_below(left_child)
1619 removed_r = _prune_below(right_child)
1620
1621 # -- First mark the children not as leaves but as non-existent (TREE_UNDEFINED)
1622 for child in [left_child, right_child]:
1623
1624 if tree.children_left[child] != _tree.TREE_LEAF:
1625 raise ValueError("Unexpected children_left: %s, please report it as issue." % child)
1626
1627 if tree.children_right[child] != _tree.TREE_LEAF:
1628 raise ValueError("Unexpected children_right: %s, please report it as issue." % child)
1629
1630 tree.children_left[child] = _tree.TREE_UNDEFINED
1631 tree.children_right[child] = _tree.TREE_UNDEFINED
1632
1633 # -- Then declare that current node is a leaf
1634 tree.children_left[_node_id] = _tree.TREE_LEAF
1635 tree.children_right[_node_id] = _tree.TREE_LEAF
1636
1637 # Return the list of removed nodes
1638 return removed_l + removed_r + [left_child, right_child]
1639
1640 # Note: we do not change the node count here, as we'll clean all later.
1641 # true_node_count = tree.node_count - sum(tree.children_left == _tree.TREE_UNDEFINED)
1642 # tree.node_count -= 2*len(nodes_to_remove)
1643
1644 return _prune_below(node_id)
1645
1646
1647 def is_leaf(node_id, tree):
1648 """
1649 Returns true if node with id `node_id` is a leaf in tree `tree`.
1650 It is not actually used in this file because we always need the left child
1651 node id in our code.
1652
1653 But it is kept here as an easy way to remember how it works.
1654
1655 Parameters
1656 ----------
1657 node_id : int
1658 tree : Tree
1659
1660 Returns
1661 -------
1662 _is_leaf : bool
1663 A boolean flag, True if this node is a leaf.
1664 """
1665 if node_id == _tree.TREE_LEAF or node_id == _tree.TREE_UNDEFINED:
1666 raise ValueError("Invalid node_id %s" % node_id)
1667
1668 return tree.children_left[node_id] == _tree.TREE_LEAF
1669
1670
1671 def get_samples_at_node(node_id, nodes_to_samples):
1672 """
1673 Return an array containing the ids of the samples for node `node_id`.
1674
1675 This method requires the user to
1676 - first compute the decision path for the sample matrix X
1677 - then convert it to CSC format
1678
1679 >>> samples_to_nodes = estimator.decision_path(X) # returns a Scipy compressed sparse row matrix (CSR)
1680 >>> nodes_to_samples = samples_to_nodes.tocsc() # we need the column equivalent (CSC)
1681 >>> samples = get_samples_at_node(node_id, nodes_to_samples)
1682
1683 Parameters
1684 ----------
1685 node_id : int
1686 The node for which the list of samples is queried.
1687 nodes_to_samples : csc_matrix
1688 A boolean matrix in Compressed Sparse Column (CSC) format, of shape
1689 (n_samples, n_nodes): it contains a 1 at (sample, node) when that sample
1690 passes through that node along its decision path.
1691
1692 Returns
1693 -------
1694 samples : array of int. The indices of the samples that reach node `node_id`.
1695 """
1696 return nodes_to_samples.indices[nodes_to_samples.indptr[node_id] : nodes_to_samples.indptr[node_id + 1]]
1697
1698
1699 class M5Prime(M5Base, RegressorMixin):
1700 """An M5' (M five prime) model tree regressor.
1701
1702 The original M5 algorithm was invented by R. Quinlan [1]_. Y. Wang [2]_ made
1703 improvements and named the resulting algorithm M5 Prime.
1704 This implementation was inspired by Weka (https://github.com/bnjmn/weka)
1705 M5Prime class, from Mark Hall.
1706
1707 See also
1708 --------
1709 M5Base
1710
1711 References
1712 ----------
1713 .. [1] Ross J. Quinlan, "Learning with Continuous Classes", 5th Australian
1714 Joint Conference on Artificial Intelligence, pp343-348, 1992.
1715 .. [2] Y. Wang and I. H. Witten, "Induction of model trees for predicting
1716 continuous classes", Poster papers of the 9th European Conference
1717 on Machine Learning, 1997.
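
Examples
--------
A minimal usage sketch on synthetic data (this assumes the package exposes
M5Prime at the top level; the data and variable names are illustrative only):

>>> import numpy as np
>>> from m5py import M5Prime
>>> rng = np.random.RandomState(0)
>>> X = rng.rand(100, 3)
>>> y = 2 * X[:, 0] + X[:, 1]
>>> reg = M5Prime().fit(X, y)  # doctest: +SKIP
>>> y_hat = reg.predict(X)  # doctest: +SKIP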
1718 """
1719
1720
1721 def _empty_contig_ar(shape, dtype):
1722 """Return an empty contiguous array with given shape and dtype."""
1723 return np.ascontiguousarray(np.empty(shape, dtype=dtype))