1 from collections import namedtuple
2 from copy import copy
3 from logging import getLogger
4 from warnings import warn
5
6 import numpy as np
7
8 from scipy.sparse import issparse
9
10 from sklearn import clone
11 from sklearn.base import RegressorMixin, is_classifier
12 from sklearn.linear_model import LinearRegression
13 from sklearn.metrics import mean_squared_error
14 from sklearn.preprocessing import StandardScaler
15 from sklearn.tree import BaseDecisionTree, _tree
16 from sklearn.tree._classes import DTYPE
17 from sklearn.tree._tree import DOUBLE
18 from sklearn.utils import check_array
19 from sklearn.utils.validation import check_is_fitted
20 from sklearn import __version__ as sklearn_version
21
22 from m5py.linreg_utils import linreg_model_to_text, DeNormalizableMixIn, DeNormalizableLinearRegression
23
24 from packaging.version import Version
25
26 SKLEARN_VERSION = Version(sklearn_version)
27 SKLEARN13_OR_GREATER = SKLEARN_VERSION >= Version("1.3.0")
28
29
30 __all__ = ["M5Base", "M5Prime"]
31
32
33 _SmoothingDetails = namedtuple("_SmoothingDetails", ("A", "B", "C"))
34 # Internal structure to contain the recursive smoothed coefficients details in the smoothing algorithm
35
36
37 logger = getLogger("m5p")
38
39
40 class M5Base(BaseDecisionTree):
41 """
42 M5Base. Implements base routines for generating M5 PredictionModel trees and rules.
43
44 The original algorithm M5 was invented by Quinlan:
45
46 - Quinlan J. R. (1992). Learning with continuous classes. Proceedings of the
47 Australian Joint Conference on Artificial Intelligence. 343--348. World
48 Scientific, Singapore.
49
50 Yong Wang and Ian Witten made improvements and created M5':
51
52 - Wang, Y and Witten, I. H. (1997). Induction of model trees for predicting
53 continuous classes. Proceedings of the poster papers of the European
54 Conference on Machine Learning. University of Economics, Faculty of
55 Informatics and Statistics, Prague.
56
57 Pruning and Smoothing can be activated and deactivated on top of the base
58 model. TODO check if 'rules' should be supported too
59
60 Inspired by Weka (https://github.com/bnjmn/weka) M5Base class, from Mark Hall
61
62 Attributes
63 ----------
64 criterion : str, default="friedman_mse"
65 M5 suggests using the standard deviation (RMSE impurity) instead of the
66 variance (MSE impurity) or absolute deviation (MAE impurity) used in CART.
67 According to the M5' paper, all three criteria are equivalent.
68 splitter : default="best"
69 M5 suggests taking the feature leading to the best gain in the criterion.
70 max_depth : default None
71 This is not used in the original M5 article, hence default=None.
72 min_samples_split : int, default=4
73 M5 suggests a value of 4 so that each leaf will have at least 2 samples.
74 min_samples_leaf : int, default=2
75 M5' suggests adding this explicitly to avoid zero variance in leaves and
76 n <= p situations for constant models.
77 min_weight_fraction_leaf : float, default=0.0
78 This is not used in the original M5 article, hence default=0.0.
79 TODO this would actually maybe be better than min_sample_leaf ?
80 max_features : default None
81 This is not used in the original M5 article, hence default is None.
82 max_leaf_nodes : int, default None
83 This is not used in the original M5 article, hence default is None.
84 min_impurity_decrease : float, default=0.0
85 This is not used in the original M5 article, hence the default is 0.0.
86 class_weight : default None
87 This is not used (?)
88 leaf_model : RegressorMixin
89 The regression model used in the leaves. This instance will be cloned
90 for each leaf.
91 use_pruning : bool
92 If False, pruning will be disabled.
93 use_smoothing : bool or str {'installed', 'on_prediction'}, default None
94 None and True mean 'installed' by default, except if smoothing_constant
95 or smoothing_constant_ratio is 0.0.
96 smoothing_constant: float, default None
97 The smoothing constant k defined in the original M5 article, used as the
98 weight for each parent model in the recursive weighting process.
99 If None, the default value from the paper (k=15) will be used.
100 smoothing_constant_ratio: float, default None
101 An alternate way to define the smoothing constant, as a ratio of the
102 total number of training samples. The resulting smoothing constant will
103 be smoothing_constant_ratio * n where n is the number of samples.
104 Note that this may not be an integer.
105 debug_prints: bool, default False
106 A boolean to enable debug prints
107 ccp_alpha: float, default 0.0
108 TODO is this relevant ? does that conflict with "use_pruning" ?
109 random_state : None, int, or RandomState, default=None
110 See `sklearn.tree.DecisionTreeRegressor.random_state`.
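
Examples
--------
A minimal, illustrative usage sketch (assuming the concrete `M5Prime`
subclass defined in this module is exposed at the package root)::

    import numpy as np
    from m5py import M5Prime

    X = np.linspace(0, 10, 64).reshape(-1, 1)
    y = np.sin(X).ravel() + X.ravel()
    reg = M5Prime(use_smoothing=False)
    reg.fit(X, y)
    print(reg.as_pretty_text())
    y_pred = reg.predict(X[:5])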
111 """
112
113 def __init__(
114 self,
115 criterion="friedman_mse",
116 splitter="best",
117 max_depth=None,
118 min_samples_split=4,
119 min_samples_leaf=2,
120 min_weight_fraction_leaf=0.0,
121 max_features=None,
122 max_leaf_nodes=None,
123 min_impurity_decrease=0.0,
124 class_weight=None,
125 leaf_model=None,
126 use_pruning=True,
127 use_smoothing=None,
128 smoothing_constant=None,
129 smoothing_constant_ratio=None,
130 debug_prints=False,
131 ccp_alpha=0.0,
132 random_state=None,
133 ):
134
135 # TODO the paper suggests to do this with 5% of total std
136 # min_impurity_split = min_impurity_split_as_initial_ratio * initial_impurity
137
138 super(M5Base, self).__init__(
139 criterion=criterion,
140 splitter=splitter,
141 max_depth=max_depth,
142 min_samples_split=min_samples_split,
143 min_samples_leaf=min_samples_leaf,
144 min_weight_fraction_leaf=min_weight_fraction_leaf,
145 max_features=max_features,
146 max_leaf_nodes=max_leaf_nodes,
147 min_impurity_decrease=min_impurity_decrease,
148 random_state=random_state,
149 class_weight=class_weight,
150 ccp_alpha=ccp_alpha,
151 )
152
153 # warning : if the field names are different from constructor params,
154 # then clone(self) will not work.
155 if leaf_model is None:
156 # to handle case when the model is learnt on normalized data and
157 # we wish to be able to read the model equations.
158 leaf_model = DeNormalizableLinearRegression()
159 self.leaf_model = leaf_model
160
161 # smoothing related
162 if smoothing_constant_ratio is not None and smoothing_constant is not None:
163 raise ValueError("Only one of `smoothing_constant` and `smoothing_constant_ratio` should be provided")
164 elif (smoothing_constant_ratio == 0.0 or smoothing_constant == 0) and (
165 use_smoothing is True or use_smoothing == "installed"
166 ):
167 raise ValueError(
168 "`use_smoothing` was explicitly enabled with "
169 "pre-installed models, while smoothing "
170 "constant/ratio are explicitly set to zero"
171 )
172
173 self.use_pruning = use_pruning
174 self.use_smoothing = use_smoothing
175 self.smoothing_constant = smoothing_constant
176 self.smoothing_constant_ratio = smoothing_constant_ratio
177
178 self.debug_prints = debug_prints
179
180 def as_pretty_text(self, **kwargs):
181 """
182 Returns a multi-line representation of this decision tree, using
183 `export_text_m5`.
184
185 :return: a multi-line string representing this decision tree
186 """
187 from m5py.export import export_text_m5
188 return export_text_m5(self, out_file=None, **kwargs)
189
190 def fit(self, X, y: np.ndarray, sample_weight=None, check_input=True, X_idx_sorted="deprecated"):
191 """Fit a M5Prime model.
192
193 Parameters
194 ----------
195 X : numpy.ndarray of shape (n_samples, n_features), the training input samples
196 y : numpy.ndarray of shape (n_samples,), the target values
197 sample_weight : optional array of sample weights, passed to the underlying tree builder
198 check_input : bool (default True), whether to validate the input arrays
199 X_idx_sorted : deprecated and ignored, kept for backwards compatibility
200
201 Returns
202 -------
203 self : the fitted estimator
204 """
205 # (0) smoothing default values behaviour
206 if self.smoothing_constant_ratio == 0.0 or self.smoothing_constant == 0:
207 self.use_smoothing = False
208 elif self.use_smoothing is None or self.use_smoothing is True:
209 # default behaviour
210 if isinstance(self.leaf_model, LinearRegression):
211 self.use_smoothing = "installed"
212 else:
213 self.use_smoothing = "on_prediction"
214
215 # Finally make sure we are ok now
216 if self.use_smoothing not in [False, np.bool_(False), "installed", "on_prediction"]:
217 raise ValueError("use_smoothing: Unexpected value: %s, please report it as an issue." % self.use_smoothing)
218
219
220 # (1) Build the initial tree as usual
221
222 # Get the correct fit method name based on the sklearn version used
223 fit_method_name = "_fit" if SKLEARN13_OR_GREATER else "fit"
224
225 fit_method = getattr(super(M5Base, self), fit_method_name)
226 fit_method(X, y, sample_weight=sample_weight, check_input=check_input)
227
229 if self.debug_prints:
230 logger.debug("(debug_prints) Initial tree:")
231 logger.debug(self.as_pretty_text(node_ids=True))
232
233 # (1b) prune initial tree to take into account min impurity in splits
234 prune_on_min_impurity(self.tree_)
235
236 if self.debug_prints:
237 logger.debug("(debug_prints) Postprocessed tree:")
238 logger.debug(self.as_pretty_text(node_ids=True))
239
240 # (2) Now prune tree and replace pruned branches with linear models
241 # -- unfortunately we have to re-do this input validation
242 # step, because it also converts the input to float32.
243 if check_input:
244 X = check_array(X, dtype=DTYPE, accept_sparse="csc")
245 if issparse(X):
246 X.sort_indices()
247
248 if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
249 raise ValueError("No support for np.int64 index based " "sparse matrices")
250
251 # -- initialise the structure to contain the leaves and node models
252 self.node_models = np.empty((self.tree_.node_count,), dtype=object)
253
254 # -- Pruning requires to know the global deviation of the target
255 global_std_dev = np.nanstd(y)
256 # global_abs_dev = np.nanmean(np.abs(y))
257
258 # -- Pruning requires to know the samples that reached each node.
259 # From http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
260 # Retrieve the decision path of each sample.
261 samples_to_nodes = self.decision_path(X)
262 # * row i is to see the nodes (non-empty j) in which sample i appears.
263 # sparse row-first (CSR) format is OK
264 # * column j is to see the samples (non-empty i) that fall into that
265 # node. To do that, we need to make it CSC
266 nodes_to_samples = samples_to_nodes.tocsc()
267
268 # -- execute the pruning
269 # self.features_usage is a dict feature_idx -> nb times used, only
270 # for used features.
271 self.features_usage = build_models_and_get_pruning_info(
272 self.tree_,
273 X,
274 y,
275 nodes_to_samples,
276 self.leaf_model,
277 self.node_models,
278 global_std_dev,
279 use_pruning=self.use_pruning,
280 )
281
282 # -- cleanup to compress inner structures: only keep non-pruned ones
283 self._cleanup_tree()
284
285 if self.debug_prints:
286 logger.debug("(debug_prints) Pruned tree:")
287 logger.debug(self.as_pretty_text(node_ids=True))
288
289 if self.use_smoothing == "installed":
290 # Retrieve the NEW decision path of each sample.
291 samples_to_nodes = self.decision_path(X)
292 # * row i is to see the nodes (non-empty j) in which sample i
293 # appears. sparse row-first (CSR) format is OK
294 # * column j is to see the samples (non-empty i) that fall into
295 # that node. To do that, we need to make it CSC
296 nodes_to_samples = samples_to_nodes.tocsc()
297
298 # default behaviour for smoothing constant and ratio
299 smoothing_constant = self._get_smoothing_constant_to_use(X)
300
301 self.install_smoothing(X, y, nodes_to_samples, smoothing_constant=smoothing_constant)
302
303 if self.debug_prints:
304 logger.debug("(debug_prints) Pruned and smoothed tree:")
305 logger.debug(self.as_pretty_text(node_ids=True))
306
307 return self
308
309 def _get_smoothing_constant_to_use(self, X):
310 """
311 Returns the smoothing_constant to use for smoothing, based on current
312 settings and X data
313 """
314 if self.smoothing_constant_ratio is not None:
315 nb_training_samples = X.shape[0]
316 smoothing_cstt = self.smoothing_constant_ratio * nb_training_samples
317 if smoothing_cstt < 15:
318 warn(
319 "smoothing constant ratio %s is leading to an extremely "
320 "small smoothing constant %s because nb training samples"
321 " is %s. Clipping to 15." % (self.smoothing_constant_ratio, smoothing_cstt, nb_training_samples)
322 )
323 smoothing_cstt = 15
324 else:
325 smoothing_cstt = self.smoothing_constant
326 if smoothing_cstt is None:
327 smoothing_cstt = 15 # default from the original Quinlan paper
328
329 return smoothing_cstt
330
331 def _cleanup_tree(self):
332 """
333 Reduces the size of this object by removing from internal structures
334 all items that are not used any more (all leaves that have been pruned)
335 """
336 old_tree = self.tree_
337 old_node_modls = self.node_models
338
339 # Get all information to create a copy of the inner tree.
340 # Note: _tree.copy() is gone so we use the pickle way
341 # --- Base info: nb features, nb outputs, output classes
342 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
343 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
344 # So these remain, we are just interested in changing the node-related arrays
345 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
346
347 # --- Node info
348 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
349 dct = old_tree.__getstate__().copy()
350
351 # cleanup: only keep the nodes that are not undefined.
352 # note: this is identical to
353 # valid_nodes_indices = dct["nodes"]['left_child'] != TREE_UNDEFINED
354 valid_nodes_indices = old_tree.children_left != _tree.TREE_UNDEFINED
355 # valid_nodes_indices2 = old_tree.children_right != TREE_UNDEFINED
356 # assert np.all(valid_nodes_indices == valid_nodes_indices2)
357 new_node_count = sum(valid_nodes_indices)
358
359 # create empty new structures
360 n_shape = (new_node_count, *dct["nodes"].shape[1:])
361 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
362 v_shape = (new_node_count, *dct["values"].shape[1:])
363 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype)
364 m_shape = (new_node_count, *old_node_modls.shape[1:])
365 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype)
366
367 # Fill structures while reindexing the tree and remembering the depth
368 global next_free_id
369 next_free_id = 0
370
371 def _compress(old_node_id):
372 """
373
374 Parameters
375 ----------
376 old_node_id
377
378 Returns
379 -------
380 the depth of the subtree below this node, and the new index of this node
381
382 """
383 global next_free_id
384 new_node_id = next_free_id
385 next_free_id += 1
386
387 # use the old tree to walk
388 old_node = dct["nodes"][old_node_id]
389 left_id = old_node["left_child"]
390 right_id = old_node["right_child"]
391
392 # Create the new node with a copy of the old
393 new_nodes[new_node_id] = old_node # this is an entire row so it is probably copied already by doing so.
394 new_values[new_node_id] = dct["values"][old_node_id]
395 new_node_models[new_node_id] = copy(old_node_modls[old_node_id])
396
397 if left_id == _tree.TREE_LEAF:
398 # ... and right_id == _tree.TREE_LEAF
399 # => node_id is a leaf. Nothing to do
400 return 1, new_node_id
401 else:
402
403 left_depth, new_id_left = _compress(left_id)
404 right_depth, new_id_right = _compress(right_id)
405
406 # store the new indices
407 new_nodes[new_node_id]["left_child"] = new_id_left
408 new_nodes[new_node_id]["right_child"] = new_id_right
409
410 return 1 + max(left_depth, right_depth), new_node_id
411
412 # complete definition of the new tree
413 dct["max_depth"] = _compress(0)[0] - 1 # root node has depth 0, not 1
414 dct["node_count"] = new_node_count # new_nodes.shape[0]
415 dct["nodes"] = new_nodes
416 dct["values"] = new_values
417 new_tree.__setstate__(dct)
418
419 # Fix an issue on sklearn 0.17.1: setstate was not updating max_depth
420 # See https://github.com/scikit-learn/scikit-learn/blob/0.17.1/sklearn/tree/_tree.pyx#L623
421 new_tree.max_depth = dct["max_depth"]
422
423 # update self fields
424 self.tree_ = new_tree
425 self.node_models = new_node_models
426
427 def install_smoothing(self, X_train_all, y_train_all, nodes_to_samples, smoothing_constant):
428 """
429 Executes the smoothing procedure described in the M5 and M5P paper,
430 "once for all". This means that all models are modified so that after
431 this method has completed, each model in the tree is already a smoothed
432 model.
433
434 This has pros (prediction speed) and cons (the model equations are
435 harder to read - lots of redundancy)
436
437 It can only be done if leaf models are instances of `LinearRegression`
438 """
439 # check validity: leaf models have to support pre-computed smoothing
440 if not isinstance(self.leaf_model, LinearRegression):
441 raise TypeError(
442 "`install_smoothing` is only available if leaf "
443 "models are instances of `LinearRegression` or a "
444 "subclass"
445 )
446
447 # Select the Error metric to compute model errors (used to compare
448 # node with subtree for pruning)
449 # TODO in Weka they use RMSE, but in papers they use MAE. this should be a parameter.
450 # TODO Shall we go further and store the residuals, or a bunch of metrics? not sure
451 err_metric = root_mean_squared_error # mean_absolute_error
452
453 # --- Node info
454 # TODO we should probably not do this once in each method, but once or give access directly (no copy)
455 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
456 dct = self.tree_.__getstate__().copy()
457 old_node_models = self.node_models
458
459 m_shape = old_node_models.shape
460 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype)
461
462 def smooth__(
463 coefs_,
464 n_samples,
465 features=None,
466 parent=None, # type: _SmoothingDetails
467 parent_features=None,
468 k=smoothing_constant, #
469 ):
470 # type: (...) -> _SmoothingDetails
471 """
472 Smoothes the model coefficients or intercept `coefs_` (an array or
473 a scalar), using the smoothing results at parent node.
474
475 At each node we keep in memory three results A, B, C.
476 - the new coef at each node is A + B
477 - the recursion equations are
478 - B(n) = (k / (n_samples(n) + k)) * A(n-1) + B(n-1)
479 - C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1)
480 - A(n) = coef(n)*C(n)
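
Illustrative numbers (not from the paper): with k=15, a parent
coefficient of 2.0 and a child node with coefficient 4.0 reached by
n_samples=30, we get C(1) = 30/45, B(1) = (15/45)*2.0 and
A(1) = 4.0*(30/45), hence the smoothed coefficient
A(1) + B(1) = (30*4.0 + 15*2.0)/45, about 3.33, i.e. the usual
weighted form (n*child + k*parent) / (n + k).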
481 """
482 if parent is None:
483 # A0 = coef(0), C0 = 1, B0 = 0
484 if features is None:
485 # single scalar value
486 return _SmoothingDetails(A=coefs_, B=0, C=1)
487 else:
488 # vector
489 if len(coefs_) == 0:
490 coefs_ = np.asarray(coefs_)
491 if coefs_.shape[0] != len(features):
492 raise ValueError("nb features does not match the nb " "of coefficients")
493 return _SmoothingDetails(
494 A=coefs_,
495 B=np.zeros(coefs_.shape, dtype=coefs_.dtype),
496 C=np.ones(coefs_.shape, dtype=coefs_.dtype),
497 )
498 else:
499 # B(n) = k/(n_samples(n)+k)*A(n-1) + B(n-1)
500 # C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1)
501 Bn = (k / (n_samples + k)) * parent.A + parent.B
502 Cn = (n_samples / (n_samples + k)) * parent.C
503
504 # A(n) = coef(n) * C(n)
505 if features is None:
506 # single scalar value: easy
507 An = coefs_ * Cn
508 return _SmoothingDetails(A=An, B=Bn, C=Cn)
509 else:
510 # vector of coefs: we have to 'expand' the coefs array
511 # because the coefs at this node apply for (features)
512 # while coefs at parent node apply for (parent_features)
513 An = np.zeros(Cn.shape, dtype=Cn.dtype)
514 parent_features = np.array(parent_features)
515 features = np.array(features)
516
517 # Thanks https://stackoverflow.com/a/8251757/7262247 !
518 index = np.argsort(parent_features)
519 sorted_parents = parent_features[index]
520 sorted_index = np.searchsorted(sorted_parents, features)
521
522 features_index = np.take(index, sorted_index, mode="clip")
523 if np.any(parent_features[features_index] != features):
524 # result = np.ma.array(features_index, mask=mask)
525 raise ValueError(
526 "Internal error - please report this."
527 "One feature was found in the child "
528 "node, that was not in the parent "
529 "node."
530 )
531
532 if len(features_index) > 0:
533 An[features_index] = coefs_ * Cn[features_index]
534
535 return _SmoothingDetails(A=An, B=Bn, C=Cn)
536
537 def _smooth(
538 node_id,
539 parent_features=None,
540 parent_coefs: _SmoothingDetails = None,
541 parent_intercept: _SmoothingDetails = None,
542 ):
543 # Gather all info on this node
544 # --base regression tree
545 node_info = dct["nodes"][node_id]
546 left_id = node_info["left_child"]
547 right_id = node_info["right_child"]
548 # --additional model
549 node_model = old_node_models[node_id]
550 # --samples
551 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
552 n_samples_at_this_node = samples_at_this_node.shape[0]
553
554 # Note: should be equal to tree.n_node_samples[node_id]
555 if n_samples_at_this_node != self.tree_.n_node_samples[node_id]:
556 raise ValueError("n_samples_at_this_node: Unexpected value, please report it as an issue.")
557
558 y_true_this_node = y_train_all[samples_at_this_node]
559 X_this_node = X_train_all[samples_at_this_node, :]
560
561 # (1) smooth current node
562 parent_features = parent_features if parent_features is not None else None
563 is_constant_leaf = False
564 if left_id == _tree.TREE_LEAF and isinstance(node_model, ConstantLeafModel):
565 is_constant_leaf = True
566 node_features = ()
567 smoothed_features = parent_features if parent_features is not None else node_features
568 node_coefs = ()
569 node_intercept = dct["values"][node_id]
570
571 # Extract the unique scalar value
572 if len(node_intercept) != 1:
573 raise ValueError("node_intercept: Unexpected value: %s, please report it as an issue." % node_intercept)
574
575 node_intercept = node_intercept.item()
576
577 else:
578 # A leaf LinRegLeafModel or a split SplitNodeModel
579 node_features = node_model.features
580 smoothed_features = parent_features if parent_features is not None else node_features
581 node_coefs = node_model.model.coef_
582 node_intercept = node_model.model.intercept_
583
584 # Create a new linear regression model with smoothed coefficients
585 smoothed_sklearn_model = clone(self.leaf_model)
586 smoothed_coefs = smooth__(
587 node_coefs,
588 features=node_features,
589 n_samples=n_samples_at_this_node,
590 parent=parent_coefs,
591 parent_features=parent_features,
592 )
593 smoothed_intercept = smooth__(node_intercept, n_samples=n_samples_at_this_node, parent=parent_intercept)
594 smoothed_sklearn_model.coef_ = smoothed_coefs.A + smoothed_coefs.B
595 smoothed_sklearn_model.intercept_ = smoothed_intercept.A + smoothed_intercept.B
596
597 # Finally update the node
598 if is_constant_leaf:
599 smoothed_node_model = LinRegLeafModel(smoothed_features, smoothed_sklearn_model, None)
600 else:
601 smoothed_node_model = copy(node_model)
602 smoothed_node_model.features = smoothed_features
603 smoothed_node_model.model = smoothed_sklearn_model
604
605 # Remember the new smoothed model
606 new_node_models[node_id] = smoothed_node_model
607
608 if left_id == _tree.TREE_LEAF:
609 # If this is a leaf, update the prediction error on X
610 y_pred_this_node = smoothed_node_model.predict(X_this_node)
611 smoothed_node_model.error = err_metric(y_true_this_node, y_pred_this_node)
612
613 else:
614 # If this is a split node - recurse on each subtree
615 _smooth(
616 left_id,
617 parent_features=smoothed_features,
618 parent_coefs=smoothed_coefs,
619 parent_intercept=smoothed_intercept,
620 )
621 _smooth(
622 right_id,
623 parent_features=smoothed_features,
624 parent_coefs=smoothed_coefs,
625 parent_intercept=smoothed_intercept,
626 )
627
628 # Update the error using the same formula as the one we used
629 # in build_models_and_get_pruning_info
630 y_pred_children = predict_from_leaves_no_smoothing(self.tree_, new_node_models, X_this_node)
631 err_children = err_metric(y_true_this_node, y_pred_children)
632
633 # the total number of parameters is the sum of params in each
634 # branch PLUS 1 for the split
635 n_params_splitmodel = new_node_models[left_id].n_params + new_node_models[right_id].n_params + 1
636 smoothed_node_model.n_params = n_params_splitmodel
637 # do not adjust the error now, simply store the raw one
638 # smoothed_node_model.error = compute_adjusted_error(
639 # err_children, n_samples_at_this_node, n_params_splitmodel)
640 smoothed_node_model.error = err_children
641
642 return
643
644 # smooth the whole tree
645 _smooth(0)
646
647 # use the new node models now
648 self.node_models = new_node_models
649
650 # remember the smoothing constant installed
651 self.installed_smoothing_constant = smoothing_constant
652 return
653
654 def denormalize(self, x_scaler, y_scaler):
655 """
656 De-normalizes this model according to the provided x and y normalization scalers.
657 Currently only StandardScaler is supported.
658
659 :param x_scaler: a StandardScaler or None
660 :param y_scaler: a StandardScaler or None
661 :return:
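
Illustrative sketch (hypothetical variable names), assuming the model was
fitted on data scaled with StandardScaler::

    x_scaler = StandardScaler().fit(X_raw)
    y_scaler = StandardScaler().fit(y_raw.reshape(-1, 1))
    reg.fit(x_scaler.transform(X_raw), y_scaler.transform(y_raw.reshape(-1, 1)).ravel())
    reg.denormalize(x_scaler, y_scaler)  # model equations now in the original units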
662 """
663 # perform denormalization
664 self._denormalize_tree(x_scaler, y_scaler)
665
666 def _denormalize_tree(self, x_scaler, y_scaler):
667 """
668 De-normalizes all models in the tree
669 :return:
670 """
671 old_tree = self.tree_
672 old_node_models = self.node_models
673
674 # Get all information to create a copy of the inner tree.
675 # Note: _tree.copy() is gone so we use the pickle way
676 # --- Base info: nb features, nb outputs, output classes
677 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
678 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
679 # So these remain, we are just interested in changing the node-related arrays
680 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
681
682 # --- Node info
683 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
684 dct = old_tree.__getstate__().copy()
685
686 # create empty new structures for nodes and models
687 n_shape = dct["nodes"].shape
688 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
689 v_shape = dct["values"].shape
690 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype)
691 m_shape = old_node_models.shape
692 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype)
693
694 def _denormalize(node_id, x_scaler, y_scaler):
695 """
696 denormalizes the subtree below node `node_id`.
697
698 - `new_nodes[node_id]` is filled with a copy of the old node
699 (see old_node.dtype to see the various fields). If the node is
700 a split node, the split threshold
701 `new_nodes[node_id]['threshold']` is denormalized.
702
703 - `new_values[node_id]` is filled with a denormalized copy of the
704 old constant prediction at this node. Reminder: this constant
705 prediction is actually used only when on the leaves now, but
706 we keep it for ref.
707
708 - `new_node_models[node_id]` is filled with a copy of the old
709 model `old_node_models[node_id]`, that is denormalized if it
710 is not a constant leaf
711
712 :param node_id:
713 :return: (nothing)
714 """
715 # use the old tree to walk
716 old_node = dct["nodes"][node_id]
717 left_id = old_node['left_child']
718 right_id = old_node['right_child']
719
720 # Create the new node with a copy of the old
721 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so.
722 new_model = copy(old_node_models[node_id])
723 new_node_models[node_id] = new_model
724
725 # Create the new value by de-scaling y
726 if y_scaler is not None:
727 # Note: if this is a split node with a linear regression model
728 # the value will never be used. However to preserve consistency
729 # of the whole values structure and for debugging purposes, we
730 # choose this safe path of denormalizing ALL.
731 # TODO we could also do it at once outside of the recursive
732 # calls, but we should check for side effects
733 new_values[node_id] = y_scaler.inverse_transform(
734 dct["values"][node_id]
735 )
736 else:
737 # no denormalization: simply copy
738 new_values[node_id] = dct["values"][node_id]
739
740 if left_id == _tree.TREE_LEAF:
741 # ... and right_id == _tree.TREE_LEAF
742 # => node_id is a leaf
743 if isinstance(new_model, ConstantLeafModel):
744 # nothing to do: we already re-scaled the value
745 return
746 elif isinstance(new_model, LinRegLeafModel):
747 # denormalize model
748 new_model.denormalize(x_scaler, y_scaler)
749 return
750 else:
751 raise TypeError("Internal error - Leaves can only be "
752 "constant or linear regression")
753 else:
754 # this is a split node, denormalize each subtree
755 _denormalize(left_id, x_scaler, y_scaler)
756 _denormalize(right_id, x_scaler, y_scaler)
757
758 # denormalize the split value if needed
759 if x_scaler is not None:
760 split_feature = old_node['feature']
761 # The denormalizer requires a vector with all the features,
762 # even if we only want to denormalize one.
763 # -- put split value in a vector where it has pos 'feature'
764 old_threshold_and_zeros = np.zeros((1, self.n_features_in_), dtype=dct["nodes"]['threshold'].dtype)
765 old_threshold_and_zeros[0, split_feature] = old_node['threshold']
766 # -- denormalize the vector (2D, as the scaler expects) and retrieve value 'feature'
767 new_nodes[node_id]['threshold'] = x_scaler.inverse_transform(old_threshold_and_zeros)[0, split_feature]
768 else:
769 # no denormalization: simply copy
770 new_nodes[node_id]['threshold'] = old_node['threshold']
771
772 if isinstance(new_model, SplitNodeModel):
773 # denormalize model at split node too, even if it is not
774 # always used (depending on smoothing mode)
775 new_model.denormalize(x_scaler, y_scaler)
776 else:
777 raise TypeError("Internal error: all intermediate nodes "
778 "should be SplitNodeModel")
779
780 return
781
782 # denormalize the whole tree and put the result in (new_nodes,
783 # new_values, new_node_models) recursively
784 _denormalize(0, x_scaler, y_scaler)
785
786 # complete definition of the new tree
787 dct["nodes"] = new_nodes
788 dct["values"] = new_values
789 new_tree.__setstate__(dct)
790
791 # update the self fields
792 # self.features_usage
793 self.tree_ = new_tree
794 self.node_models = new_node_models
795
796 def compress_features(self):
797 """
798 Compresses the model and returns a vector of required feature indices.
799 This model input will then be X[:, features] instead of X.
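
Illustrative sketch (hypothetical names)::

    kept = reg.compress_features()
    y_pred = reg.predict(X[:, kept])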
800 """
801 used_features = sorted(self.features_usage.keys())
802 new_features_lookup_dct = {old_feature_idx: i for i, old_feature_idx in enumerate(used_features)}
803
804 if used_features == list(range(self.n_features_in_)):
805 # NOTHING TO DO: we need all features
806 return used_features
807
808 # Otherwise, We can compress. For this we have to create a copy of the
809 # tree because we will change its internals
810 old_tree = self.tree_
811 old_node_modls = self.node_models
812
813 # Get all information to create a copy of the inner tree.
814 # Note: _tree.copy() is gone so we use the pickle way
815 # --- Base info: nb features, nb outputs, output classes
816 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
817 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
818 # So these remain, we are just interested in changing the node-related arrays
819 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
820
821 # --- Node info
822 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
823 dct = old_tree.__getstate__().copy()
824
825 # create empty new structures for nodes and models
826 n_shape = dct["nodes"].shape
827 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
828 m_shape = old_node_modls.shape
829 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype)
830
831 def _compress_features(node_id):
832 """
833
834 Parameters
835 ----------
836 node_id
837
838 Returns
839 -------
840 nothing; the new structures are filled in place
841
842 """
843 # use the old tree to walk
844 old_node = dct["nodes"][node_id]
845 left_id = old_node["left_child"]
846 right_id = old_node["right_child"]
847
848 # Create the new node with a copy of the old
849 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so.
850 new_model = copy(old_node_modls[node_id])
851 new_node_models[node_id] = new_model
852
853 if left_id == _tree.TREE_LEAF:
854 # ... and right_id == _tree.TREE_LEAF
855 # => node_id is a leaf
856 if isinstance(new_model, ConstantLeafModel):
857 # no features used
858 return
859 elif isinstance(new_model, LinRegLeafModel):
860 # compress that model
861 new_model.reindex_features(new_features_lookup_dct)
862 return
863 else:
864 raise TypeError("Internal error - Leaves can only be constant or linear regression")
865 else:
866
867 _compress_features(left_id)
868 _compress_features(right_id)
869
870 # store the new split feature index in the node
871 new_nodes[node_id]["feature"] = new_features_lookup_dct[old_node["feature"]]
872
873 if not isinstance(new_model, SplitNodeModel):
874 raise TypeError("Internal error: all intermediate nodes should be SplitNodeModel")
875
876 # TODO now that split node models have a linear regression
877 # model too, we should update them here
878
879 return
880
881 _compress_features(0)
882
883 # complete definition of the new tree
884 dct["nodes"] = new_nodes
885 new_tree.__setstate__(dct)
886
887 # update the self fields
888 self.features_usage = {new_features_lookup_dct[k]: v for k, v in self.features_usage.items()}
889 self.tree_ = new_tree
890 self.node_models = new_node_models
891 self.n_features_in_ = len(self.features_usage)
892
893 # return the vector of used features
894 return used_features
895
896 @property
897 def feature_importances_(self):
898 """Return the feature importances (the higher, the more important).
899
900 Returns
901 -------
902 feature_importances_ : array, shape = [n_features]
903 """
904 check_is_fitted(self, "tree_")
905
906 # TODO adapt ?
907 # features = np.array([self.features_usage[k] for k in sorted(self.features_usage.keys())], dtype=int)
908 features = self.tree_.compute_feature_importances()
909
910 return features
911
912 def predict(self, X, check_input=True, smooth_predictions=None, smoothing_constant=None):
913 """Predict class or regression value for X.
914
915 For a classification model, the predicted class for each sample in X is
916 returned. For a regression model, the predicted value based on X is
917 returned.
918
919 Parameters
920 ----------
921 X : numpy.ndarray, shape (n_samples, n_features)
922 The input samples. Internally, it will be converted to
923 ``dtype=np.float32`` and if a sparse matrix is provided
924 to a sparse ``csr_matrix``.
925 check_input : boolean, (default=True)
926 Allow to bypass several input checking.
927 Don't use this parameter unless you know what you do.
928 smooth_predictions : boolean, (default=None)
929 None means "use self config"
930 smoothing_constant: int, (default=self.smoothing_constant)
931 Smoothing constant used when smooth_predictions is True. During
932 smoothing, child node models are recursively mixed with their
933 parent node models, and the mix is done using a weighted sum. The
934 weight given to the child model is the number of training samples
935 that reached its node, while the weight given to the parent model
936 is the smoothing constant. Therefore it can be seen as an
937 equivalent number of samples that parent models represent when
938 injected into the recursive weighted sum.
939
940 Returns
941 -------
942 y : array of shape = [n_samples] or [n_samples, n_outputs]
943 The predicted classes, or the predict values.
944 """
945 check_is_fitted(self, "tree_")
946
947 # If this is just a constant node only check the input's shape.
948 if self.n_features_in_ == 0:
949 # perform the input checking manually to set ensure_min_features=0
950 X = check_array(X, dtype=DTYPE, accept_sparse="csr", ensure_min_features=0)
951 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
952 raise ValueError("No support for np.int64 index based " "sparse matrices")
953 # skip it then
954 check_input = False
955
956 # validate and convert dtype
957 X = self._validate_X_predict(X, check_input)
958
959 # -------- This is the only change wrt parent class. TODO maybe rather replace self.tree_ with a proxy ---------
960 if smooth_predictions is None:
961 # Default: smooth prediction at prediction time if configured as
962 # such in the model. Note: if self.use_smoothing == 'installed',
963 # models are already smoothed models, so no need to smooth again
964 smooth_predictions = self.use_smoothing == "on_prediction"
965 else:
966 # user provided an explicit value for smooth_predictions
967 if not smooth_predictions and self.use_smoothing == "installed":
968 raise ValueError(
969 "Smoothing has been pre-installed on this "
970 "tree, it is not anymore possible to make "
971 "predictions without smoothing"
972 )
973
974 if smooth_predictions and smoothing_constant is None:
975 # default parameter for smoothing is the one defined in the model
976 # (with possible ratio)
977 smoothing_constant = self._get_smoothing_constant_to_use(X)
978
979 # Do not use the embedded tree (like in super): it has been pruned but
980 # still has constant nodes
981 # proba = self.tree_.predict(X)
982 if not smooth_predictions:
983 proba = predict_from_leaves_no_smoothing(self.tree_, self.node_models, X)
984 else:
985 proba = predict_from_leaves(self, X, smoothing=True, smoothing_constant=smoothing_constant)
986 if len(proba.shape) < 2:
987 proba = proba.reshape(-1, 1)
988 # ------------------------------------------
989
990 n_samples = X.shape[0]
991
992 # Classification
993 if is_classifier(self):
994 if self.n_outputs_ == 1:
995 return self.classes_.take(np.argmax(proba, axis=1), axis=0)
996 else:
997 predictions = np.zeros((n_samples, self.n_outputs_))
998 for k in range(self.n_outputs_):
999 predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)
1000 return predictions
1001
1002 # Regression
1003 else:
1004 if self.n_outputs_ == 1:
1005 return proba[:, 0]
1006 else:
1007 return proba[:, :, 0]
1008
1009
1010 class ConstantLeafModel:
1011 """
1012 Represents the additional information about a leaf node that is not pruned.
1013 It contains the error associated with the training samples at this node.
1014
1015 Note: the constant value is not stored here, as it is already available in
1016 the sklearn tree struct. So to know the prediction at this node, use
1017 `tree.value[node_id]`
1018 """
1019
1020 __slots__ = ("error",)
1021
1022 def __init__(self, error):
1023 self.error = error
1024
1025 @property
1026 def n_params(self) -> int:
1027 """
1028 Returns the number of parameters used by this model, including the
1029 constant (intercept) term
1030 """
1031 return 1
1032
1033 @staticmethod
1034 def predict_cstt(tree, node_id, n_samples):
1035 """
1036 This static method is a helper to get an array of constant predictions
1037 associated with this node.
1038
1039 Parameters
1040 ----------
1041 tree
1042 node_id
1043 n_samples
1044
1045 Returns
1046 -------
1047 an array with the constant prediction of this node repeated n_samples times
1048 """
1049 cstt_prediction = tree.value[node_id]
1050
1051 # cstt_prediction can be multioutput so it is an array. replicate it
1052 # note: np.tile is used instead of numpy.matlib.repmat, since numpy.matlib was removed in numpy 2.0
1053
1054 return np.tile(cstt_prediction, (n_samples, 1))
1055
1056
1057 class LinRegNodeModel(DeNormalizableMixIn):
1058 """
1059 Represents the additional information about a tree node that contains a
1060 linear regression model.
1061
1062 It contains
1063 - the features used by the model,
1064 - the scikit learn model object itself,
1065 - and the error for the training samples that reached this node
1066
1067 """
1068 __slots__ = ("features", "model", "error", "n_params")
1069
1070 def __init__(self, features, model, error) -> None:
1071 self.features = features
1072 self.model = model
1073 self.error = error
1074 self.n_params = len(self.features) + 1
1075
1076 def to_text(self, feature_names=None, target_name=None, precision=3,
1077 line_breaks=False):
1078 """ Returns a text representation of the linear regression model """
1079 return linreg_model_to_text(self.model, feature_names=feature_names,
1080 target_name=target_name,
1081 precision=precision,
1082 line_breaks=line_breaks)
1083
1084 def reindex_features(self, new_features_lookup_dct):
1085 """
1086 Reindexes the required features using the provided lookup dictionary.
1087
1088 Parameters
1089 ----------
1090 new_features_lookup_dct
1091
1092 Returns
1093 -------
1094
1095 """
1096 self.features = [new_features_lookup_dct[f] for f in self.features]
1097
1098 def denormalize(self,
1099 x_scaler: StandardScaler = None,
1100 y_scaler: StandardScaler = None
1101 ):
1102 """
1103 De-normalizes the linear model.
1104
1105 :param x_scaler:
1106 :param y_scaler:
1107 :return:
1108 """
1109 # create a clone of the x scaler with only the used features
1110 x_scaler = copy(x_scaler)
1111 if len(self.features) > 0:
1112 x_scaler.scale_ = x_scaler.scale_[self.features]
1113 x_scaler.mean_ = x_scaler.mean_[self.features]
1114 else:
1115 # in that particular case, the above expression is not working
1116 x_scaler.scale_ = x_scaler.scale_[0:0]
1117 x_scaler.mean_ = x_scaler.mean_[0:0]
1118
1119 # use the denormalize function on the internal model
1120 # TODO what if internal model is not able ?
1121 self.model.denormalize(x_scaler, y_scaler)
1122
1123 def predict(self, X: np.ndarray):
1124 """Performs a prediction for X, only using the required features."""
1125 if len(self.features) < 1 and isinstance(self.model, LinearRegression):
1126 # unfortunately LinearRegression models do not like it when no
1127 # features are needed: their input validation requires at least 1.
1128 # so we do it ourselves.
1129 return self.model.intercept_ * np.ones((X.shape[0], 1))
1130 else:
1131 return self.model.predict(X[:, self.features])
1132
1133
1134 class LinRegLeafModel(LinRegNodeModel):
1135 """
1136 Represents the additional information about a leaf node with a linear
1137 regression model
1138 """
1139 pass
1140
1141
1142 class SplitNodeModel(LinRegNodeModel):
1143 """
1144 Represents the additional information about a split node, with a linear
1145 regression model.
1146 """
1147 __slots__ = ("n_params",)
1148
1149 def __init__(self, n_params, error, features, model):
1150 super(SplitNodeModel, self).__init__(features=features, model=model, error=error)
1151 self.n_params = n_params  # set after super().__init__, which would otherwise overwrite it with len(features) + 1
1152
1153
1154 PRUNING_MULTIPLIER = 2
1155 # see https://github.com/bnjmn/weka/blob/master/weka/src/main/java/weka/classifiers/trees/m5/RuleNode.java#L124
1156 # TODO check why they use 2 instead of 1 (from the article) ??
1157
1158
1159 def root_mean_squared_error(*args, **kwargs):
1160 return np.sqrt(mean_squared_error(*args, **kwargs))
1161
1162
1163 def prune_on_min_impurity(tree):
1164 """
1165 Edits the given tree so as to prune subbranches that do not respect the min
1166 impurity criterion.
1167
1168 The paper suggests doing this with a 5% threshold, but min_impurity_split is not
1169 available in sklearn 0.17 (and the criterion is not std but MSE, so we have to square)
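
Illustrative numbers: if the root MSE impurity is 4.0 (i.e. a std of 2.0),
the threshold is 4.0 * 0.05**2 = 0.01, which corresponds to a std of 0.1,
that is 5% of the root std.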
1170
1171 Parameters
1172 ----------
1173 tree
1174
1175 Returns
1176 -------
1177
1178 """
1179 left_children = tree.children_left
1180 right_children = tree.children_right
1181 impurities = tree.impurity
1182
1183 # The paper suggests doing this with a 5% threshold, but min_impurity_split is not
1184 # available in sklearn 0.17 (and the criterion is not std but MSE, so we have to square)
1185 # TODO adapt the formula to criterion used and/or generalize with min_impurity_split_as_initial_ratio
1186 root_impurity = impurities[0]
1187 impurity_threshold = root_impurity * (0.05 ** 2)
1188
1189 def stop_on_min_impurity(node_id):
1190 # note: in the paper that is 0.05 but criterion is on std. Here
1191 # impurity is mse so a squared equivalent of std.
1192 left_id = left_children[node_id]
1193 right_id = right_children[node_id]
1194 if left_id != _tree.TREE_LEAF: # a split node
1195 if impurities[node_id] < impurity_threshold:
1196 # stop here, this will be a leaf
1197 prune_children(node_id, tree)
1198 else:
1199 stop_on_min_impurity(left_id)
1200 stop_on_min_impurity(right_id)
1201
1202 stop_on_min_impurity(0)
1203
1204
1205 def build_models_and_get_pruning_info(
1206 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model, node_models, global_std_dev, use_pruning, node_id=0
1207 ):
1208 """
1209
1210 Parameters
1211 ----------
1212 tree : Tree
1213 A tree that will be pruned on the way
1214 X_train_all
1215 y_train_all
1216 nodes_to_samples
1217 leaf_model
1218 node_models
1219 global_std_dev
1220 use_pruning
1221 node_id
1222
1223 Returns
1224 -------
1225 a dictionary where the key is the feature index and the value is
1226 the number of samples where this feature is used
1227 """
1228
1229 # Select the Error metric to compute model errors (used to compare node
1230 # with subtree for pruning)
1231 # TODO in Weka they use RMSE, but in papers they use MAE. could be a param
1232 # TODO Shall we go further and store the residuals, or a bunch of metrics?
1233 err_metric = root_mean_squared_error # mean_absolute_error
1234
1235 # Get the samples associated with this node
1236 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
1237 n_samples_at_this_node = samples_at_this_node.shape[0]
1238 y_true_this_node = y_train_all[samples_at_this_node]
1239
1240 # Is this a leaf node or a split node ?
1241 left_node = tree.children_left[node_id] # this way we do not have to query it again in the else.
1242 if left_node == _tree.TREE_LEAF:
1243 # --> Current node is a LEAF. See is_leaf(node_id, tree) if you have doubts <--
1244
1245 # -- create a linear model for this node
1246 # leaves do not have the right to use any features since they have no
1247 # subtree: keep the constant prediction
1248 y_pred_this_node = ConstantLeafModel.predict_cstt(tree, node_id, y_true_this_node.shape[0])
1249 err_this_node = err_metric(y_true_this_node, y_pred_this_node)
1250
1251 # TODO when use_pruning = False, should we allow all features to be
1252 # used instead of having to stick to the M5 rule of "only use a
1253 # feature if the subtree includes a split with this feature" ?
1254 # OR alternate proposal: should we transform the boolean use_pruning
1255 # into a use_pruning_max_nb integer to say for example "only 2 level
1256 # of pruning" ?
1257
1258 # -- store the model information
1259 node_models[node_id] = ConstantLeafModel(err_this_node)
1260
1261 # -- return an empty dict for features used
1262 return dict() # np.array([], dtype=int)
1263
1264 else:
1265 # --> Current node is a SPLIT <--
1266 right_node = tree.children_right[node_id]
1267
1268 # (1) prune left and right subtree and get some information
1269 features_l = build_models_and_get_pruning_info(
1270 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model,
1271 node_models, global_std_dev, use_pruning, node_id=left_node
1272 )
1273 features_r = build_models_and_get_pruning_info(
1274 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model,
1275 node_models, global_std_dev, use_pruning, node_id=right_node
1276 )
1277
1278 # (2) select only the samples that reach this node
1279 X_this_node = X_train_all[samples_at_this_node, :]
1280
1281 # (3) Create a model for this node
1282 # -- fit a linear regression model taking into account only the
1283 # features used in the subtrees + the split one
1284 # TODO should we normalize=True (variance scaling) or use a whole
1285 # pipeline here ? Not sure..
1286 # estimators = [('scale', StandardScaler()), ('clf', model_type())];
1287 # pipe = Pipeline(estimators)
1288 # skmodel_this_node = leaf_model_type(**leaf_model_params)
1289 skmodel_this_node = clone(leaf_model)
1290
1291 # -- old - we used to only store the array of features
1292 # selected_features = np.union1d(features_l, features_r)
1293 # selected_features = np.union1d(selected_features, tree.feature[node_id])
1294 # -- new - we also gather the nb samples where this feature is used
1295 selected_features_dct = features_l
1296 for feature_id, n_samples_feat_used in features_r.items():
1297 if feature_id in selected_features_dct.keys():
1298 selected_features_dct[feature_id] += n_samples_feat_used
1299 else:
1300 selected_features_dct[feature_id] = n_samples_feat_used
1301 selected_features_dct[tree.feature[node_id]] = n_samples_at_this_node
1302 # -- use only the selected features, in the natural integer order
1303 selected_features = sorted(selected_features_dct.keys())
1304
1305 X_train_this_node = X_this_node[:, selected_features]
1306 skmodel_this_node.fit(X_train_this_node, y_true_this_node)
1307 # -- predict and compute error
1308 y_pred_this_node = skmodel_this_node.predict(X_train_this_node)
1309 err_this_node = err_metric(y_true_this_node, y_pred_this_node)
1310 # -- create the object
1311 # TODO the paper suggest to perform recursive feature elimination in
1312 # this model until adjusted_err_model is minimal, is it same in Weka ?
1313 model_this_node = LinRegLeafModel(selected_features, skmodel_this_node, err_this_node)
1314
1315 # (4) compute adj error criterion = ERR * (n+v)/(n-v) for both models
1316 adjusted_err_model = compute_adjusted_error(err_this_node, n_samples_at_this_node, model_this_node.n_params)
1317
1318 # (5) predict and compute adj error for the combination of child models
1319 # -- Note: this is recursive so the leaves may contain linear models
1320 # already
1321 y_pred_children = predict_from_leaves_no_smoothing(tree, node_models, X_this_node)
1322 err_children = err_metric(y_true_this_node, y_pred_children)
1323
1324 # TODO the Weka implem (below) differs from the paper that suggests a
1325 # weighted sum of adjusted errors. This may be an equivalent formulation, to check.
1326 # the total number of parameters if we do not prune, is the sum of
1327 # params in each branch PLUS 1 for the split
1328 n_params_splitmodel = node_models[left_node].n_params + node_models[right_node].n_params + 1
1329 adjusted_err_children = compute_adjusted_error(err_children, n_samples_at_this_node, n_params_splitmodel)
1330
1331 # (6) compare and either prune at this node or keep the subtrees
1332 std_dev_this_node = np.nanstd(y_true_this_node)
1333 # note: the first criterion is now already checked before that
1334 # function call, in `prune_on_min_impurity`
1335 if use_pruning and (
1336 # TODO these parameters should be in the constructor
1337 # see also 2 comments about min_impurity_split_as_initial_ratio
1338 std_dev_this_node < (global_std_dev * 0.05)
1339 or (adjusted_err_model <= adjusted_err_children)
1340 or (adjusted_err_model < (global_std_dev * 0.00001)) # so this means very good R² already
1341 ):
1342 # Choose model for this node rather than subtree model
1343 # -- prune from this node on
1344 removed_nodes = prune_children(node_id, tree)
1345 node_models[removed_nodes] = _tree.TREE_UNDEFINED
1346
1347 # store the model information
1348 node_models[node_id] = model_this_node
1349
1350 # update and return the features used
1351 selected_features_dct = {k: n_samples_at_this_node for k in selected_features_dct.keys()}
1352 return selected_features_dct
1353
1354 else:
1355 # The subtrees are good or we do not want pruning: keep them.
1356 # This node will remain a split, and will only contain the digest
1357 # about the subtree
1358 node_models[node_id] = SplitNodeModel(
1359 n_params_splitmodel, err_children, selected_features, skmodel_this_node
1360 )
1361
1362 # return the features used
1363 return selected_features_dct
1364
1365
1366 def compute_adjusted_error(err, n_samples, n_parameters, multiplier=PRUNING_MULTIPLIER):
1367 """
1368 Return the penalized error obtained from `err`, the prediction error at a
1369 given tree node, by multiplying it by ``(n + multiplier * p) / (n - p)``. `n` is the number of
1370 samples at this node, `p` is the number of parameters of the model, and `multiplier` is a penalty weight (2 by default, as in the Weka implementation).
1371
1372 According to the original M5 paper, this is to penalize complex models
1373 used for few samples.
1374
1375 Note that if ``n_samples <= n_parameters`` the denominator is zero or negative.
1376 In this case an arbitrarily high penalization factor is used: ``10 * err``.
1377
1378 Parameters
1379 ----------
1380 err : float
1381 The model error for a given tree node. Typically the MAE.
1382
1383 n_samples : int
1384 The number of samples at this node.
1385
1386 n_parameters : int
1387 The number of parameters of the model at this node.
1388
1389 Returns
1390 -------
1391 penalized_mae : float
1392 The penalized error ``err * (n + multiplier * p) / (n - p)``, or ``10 * err`` if ``n <= p``.
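
For example (illustrative numbers): with ``err=1.0``, ``n_samples=20``,
``n_parameters=3`` and the default ``multiplier=2``, the factor is
``(20 + 2*3) / (20 - 3) = 26/17``, roughly 1.53.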
1393 """
1394 #
1395 if n_samples <= n_parameters:
1396 # denominator is zero or negative: use a large factor so as to penalize
1397 # a lot this overly complex model.
1398 factor = 10.0 # Caution says Yong in his code
1399 else:
1400 factor = (n_samples + multiplier * n_parameters) / (n_samples - n_parameters)
1401
1402 return err * factor
1403
1404
1405 def predict_from_leaves_no_smoothing(tree, node_models, X):
1406 """
1407 Returns the prediction for all samples in X, based on using the appropriate
1408 leaf model.
1409
1410 This function
1411 - uses tree.apply(X) to know in which tree leaf each sample in X goes
1412 - then for each of the leaves that are actually touched, uses
1413 node_models[leaf_id] to predict, for the X reaching that leaf
1414
1415 This function assumes that node_models contains non-empty LinRegLeafModel
1416 entries for all leaf nodes that will be reached by the samples X.
1417
1418 Parameters
1419 ----------
1420 tree : Tree
1421 The tree object.
1422 node_models : array-like
1423 The array containing node models
1424 X : 2D array-like
1425 The sample data.
1426
1427 Returns
1428 -------
1429 y_predicted : 1D array-like
1430 The prediction vector.
1431 """
1432 # **** This does the job, but we have one execution of model.predict() per
1433 # sample: probably not efficient
1434 # sample_ids_to_leaf_node_ids = tree.apply(X)
1435 # model_and_x = np.concatenate((node_models[leaf_node_ids].reshape(-1, 1), X), axis=1)
1436 # def pred(m_and_x):
1437 # return m_and_x[0].model.predict(m_and_x[1:].reshape(1,-1))[0]
1438 # y_predicted = np.array(list(map(pred, model_and_x)))
1439
1440 # **** This should be faster because the number of calls to predict() is
1441 # equal to the number of leaves touched
1442 sample_ids_to_leaf_node_ids = tree.apply(X)
1443 y_predicted = -np.ones(sample_ids_to_leaf_node_ids.shape, dtype=DOUBLE)
1444
1445 # -- find the unique list of leaves touched
1446 leaf_node_ids, inverse_idx = np.unique(sample_ids_to_leaf_node_ids, return_inverse=True)
1447
1448 # -- for each leaf perform the prediction for samples reaching that leaf
1449 for leaf_node_id in leaf_node_ids:
1450 # get the indices of the samples reaching that leaf
1451 sample_indices = np.nonzero(sample_ids_to_leaf_node_ids == leaf_node_id)[0]
1452
1453 # predict
1454 node_model = node_models[leaf_node_id]
1455 if isinstance(node_model, LinRegLeafModel):
1456 y_predicted[sample_indices] = np.ravel(node_model.predict(X[sample_indices, :]))
1457 else:
1458 # isinstance(node_model, ConstantLeafModel)
1459 y_predicted[sample_indices] = tree.value[leaf_node_id]
1460
1461 # **** Recursive strategy: not used anymore
1462 # left_node = tree.children_left[node_id] # this way we do not have to query it again in the else.
1463 # if left_node == _tree.TREE_LEAF:
1464 # # --> Current node is a LEAF. See is_leaf(node_id, tree) <--
1465 # y_predicted = node_models[node_id].model.predict()
1466 # else:
1467 # # --> Current node is a SPLIT <--
1468 # right_node = tree.children_right[node_id]
1469 #
1470 #
1471 # samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
1472 # y_true_this_node = y_train_all[samples_at_this_node]
1473 # # As surprising as it may seem, in numpy [samples_at_this_node, selected_features] does something else.
1474 # X_train_this_node = X_train_all[samples_at_this_node, :][:, selected_features]
1475 #
1476 # X_left
1477
1478 return y_predicted
1479
1480
1481 def predict_from_leaves(m5p, X, smoothing=True, smoothing_constant=15):
1482 """
1483 Predicts using the M5P tree, without using the compiled sklearn
1484 `tree.apply` subroutine.
1485
1486 The main purpose of this function is to apply smoothing to a M5P model tree
1487 where smoothing has not been pre-installed on the models. For examples to
1488 enable a model to be used both without and with smoothing for comparisons
1489 purposes, or for models whose leaves are not Linear Models and therefore
1490 for which no pre-installation method exist.
1491
1492 Note: this method is slower than `predict_from_leaves_no_smoothing` when
1493 `smoothing=False`.
1494
1495 Parameters
1496 ----------
1497 m5p : M5Prime
1498 The model to use for prediction
1499 X : array-like
1500 The input data
1501 smoothing : bool
1502 Whether to apply smoothing
1503 smoothing_constant : int
1504 The smoothing constant `k`, used as the weight given to the parent node
1505 model's prediction (k=15 in the original articles).
1506
1507
1508 Returns
1509 -------
1510 y_predicted : 2D array-like of shape (n_samples, 1). The prediction vector (smoothed if `smoothing=True`).
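
Notes
-----
Smoothing blends the child prediction with the parent model's prediction
recursively along the path from the leaf up to the root. A small numeric
illustration of the update rule applied at each step (here `n` is the number
of training samples at the child node and `k` the smoothing constant):

>>> n, k = 30, 15
>>> y_child, y_parent = 2.0, 5.0
>>> (n * y_child + k * y_parent) / (n + k)
3.0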
1511 """
1512 # validate and converts dtype just in case this was directly called
1513 # e.g. in unit tests
1514 X = m5p._validate_X_predict(X, check_input=True)
1515
1516 tree = m5p.tree_
1517 node_models = m5p.node_models
1518 nb_samples = X.shape[0]
1519 y_predicted = -np.ones((nb_samples, 1), dtype=DOUBLE)
1520
1521 # sample_ids_to_leaf_node_ids = tree.apply(X)
1522 def smooth_predictions(ancestor_nodes, X_at_node, y_pred_at_node):
1523 # note: y_pred_at_node can be a constant (e.g. for ConstantLeafModel leaves)
1524 current_node_model_id = ancestor_nodes[-1]
1525 for _i, parent_model_id in enumerate(reversed(ancestor_nodes[:-1])):
1526 # warning: this is the nb of TRAINING samples at the current (child) node
1527 node_nb_train_samples = tree.n_node_samples[current_node_model_id]
1528 parent_model = node_models[parent_model_id]
1529 parent_predictions = parent_model.predict(X_at_node)
1530 y_pred_at_node = (node_nb_train_samples * y_pred_at_node + smoothing_constant * parent_predictions) / (
1531 node_nb_train_samples + smoothing_constant
1532 )
1533 current_node_model_id = parent_model_id
1534
1535 return y_pred_at_node
1536
1537 def apply_prediction(node_id, ids=None, ancestor_nodes=None):
1538 first_call = False
1539 if ids is None:
1540 ids = slice(None)
1541 first_call = True
1542 if smoothing:
1543 if ancestor_nodes is None:
1544 ancestor_nodes = [node_id]
1545 else:
1546 ancestor_nodes.append(node_id)
1547
1548 left_id = tree.children_left[node_id]
1549 if left_id == _tree.TREE_LEAF:
1550 # ... and tree.children_right[node_id] == _tree.TREE_LEAF
1551 # LEAF node: predict using this node's model
1553 node_model = node_models[node_id]
1554 # assert (ids == (sample_ids_to_leaf_node_ids == node_id)).all()
1555 if isinstance(node_model, LinRegLeafModel):
1556 X_at_node = X[ids, :]
1557 predictions = node_model.predict(X_at_node)
1558 else:
1559 # isinstance(node_model, ConstantLeafModel)
1560 predictions = tree.value[node_id]
1561 if smoothing:
1562 X_at_node = X[ids, :]
1563
1564 # finally apply smoothing
1565 if smoothing:
1566 y_predicted[ids] = smooth_predictions(ancestor_nodes, X_at_node, predictions)
1567 else:
1568 y_predicted[ids] = predictions
1569 else:
1570 right_id = tree.children_right[node_id]
1571 # non-leaf node: split samples and recurse
1572 left_group = np.zeros(nb_samples, dtype=bool)
1573 left_group[ids] = X[ids, tree.feature[node_id]] <= tree.threshold[node_id]
1574 right_group = (~left_group) if first_call else (ids & (~left_group))
1575
1576 # important: copy ancestor_nodes BEFORE calling anything, otherwise
1577 # it will be modified
1578 apply_prediction(
1579 left_id, ids=left_group, ancestor_nodes=(ancestor_nodes.copy() if ancestor_nodes is not None else None)
1580 )
1581 apply_prediction(right_id, ids=right_group, ancestor_nodes=ancestor_nodes)
1582
1583 # recurse to fill all predictions
1584 apply_prediction(0)
1585
1586 return y_predicted
1587
1588
1589 def prune_children(node_id, tree):
1590 """
1591 Prunes the children of node_id in the given `tree`.
1592
1593 Inspired by https://github.com/shenwanxiang/sklearn-post-prune-tree/blob/master/tree_prune.py#L122
1594
1595 IMPORTANT: this relies on the fact that `children_left` and `children_right`
1596 are modifiable (and are the only attributes we need to modify to fix the
1597 tree). This seems to be the case as of now.
1598
1599 Parameters
1600 ----------
1601 node_id : int
1602 tree : Tree
1603
1604 Returns
1605 -------
1606 removed_nodes : List[int]
1607 A list of removed nodes
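
Examples
--------
A usage sketch (assuming `reg` is an already-fitted M5Prime estimator and
node id 3 is an internal node of `reg.tree_`; both are illustrative only):

>>> removed = prune_children(3, reg.tree_)  # doctest: +SKIP
>>> # node 3 has now become a leaf; `removed` lists its former descendants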
1608 """
1609
1610 def _prune_below(_node_id):
1611 left_child = tree.children_left[_node_id]
1612 right_child = tree.children_right[_node_id]
1613 if left_child == _tree.TREE_LEAF:
1614 # _node_id is a leaf: left_child & right_child say "leaf". Nothing to do
1615 return []
1616 else:
1617 # Make sure everything is pruned below: children should be leaves
1618 removed_l = _prune_below(left_child)
1619 removed_r = _prune_below(right_child)
1620
1621 # -- First mark the children not as leaves but as non-existent (TREE_UNDEFINED)
1622 for child in [left_child, right_child]:
1623
1624 if tree.children_left[child] != _tree.TREE_LEAF:
1625 raise ValueError("Unexpected children_left: %s, please report it as issue." % child)
1626
1627 if tree.children_right[child] != _tree.TREE_LEAF:
1628 raise ValueError("Unexpected children_right: %s, please report it as issue." % child)
1629
1630 tree.children_left[child] = _tree.TREE_UNDEFINED
1631 tree.children_right[child] = _tree.TREE_UNDEFINED
1632
1633 # -- Then declare that current node is a leaf
1634 tree.children_left[_node_id] = _tree.TREE_LEAF
1635 tree.children_right[_node_id] = _tree.TREE_LEAF
1636
1637 # Return the list of removed nodes
1638 return removed_l + removed_r + [left_child, right_child]
1639
1640 # Note: we do not change the node count here, as we'll clean all later.
1641 # true_node_count = tree.node_count - sum(tree.children_left == _tree.TREE_UNDEFINED)
1642 # tree.node_count -= 2*len(nodes_to_remove)
1643
1644 return _prune_below(node_id)
1645
1646
1647 def is_leaf(node_id, tree):
1648 """
1649 Returns true if node with id `node_id` is a leaf in tree `tree`.
1650 It is not actually used in this file because we always need the left child
1651 node id in our code.
1652
1653 But it is kept here as an easy way to remember how it works.
1654
1655 Parameters
1656 ----------
1657 node_id : int
1658 tree : Tree
1659
1660 Returns
1661 -------
1662 _is_leaf : bool
1663 A boolean flag, True if this node is a leaf.
1664 """
1665 if node_id == _tree.TREE_LEAF or node_id == _tree.TREE_UNDEFINED:
1666 raise ValueError("Invalid node_id %s" % node_id)
1667
1668 return tree.children_left[node_id] == _tree.TREE_LEAF
1669
1670
1671 def get_samples_at_node(node_id, nodes_to_samples):
1672 """
1673 Return an array containing the ids of the samples for node `node_id`.
1674
1675 This method requires the user to
1676 - first compute the decision path for the sample matrix X
1677 - then convert it to CSC format
1678
1679 >>> samples_to_nodes = estimator.decision_path(X) # returns a Scipy compressed sparse row matrix (CSR)
1680 >>> nodes_to_samples = samples_to_nodes.tocsc() # we need the column equivalent (CSC)
1681 >>> samples = get_samples_at_node(node_id, nodes_to_samples)
1682
1683 Parameters
1684 ----------
1685 node_id : int
1686 The node for which the list of samples is queried.
1687 nodes_to_samples : csc_matrix
1688 A boolean matrix in Compressed Sparse Column (CSC) format, of shape
1689 (n_samples, n_nodes): it contains a 1 at (sample, node) when that sample
1690 passes through that node along its decision path.
1691
1692 Returns
1693 -------
1694 samples : array of int. The indices of the samples that reach node `node_id`.
1695 """
1696 return nodes_to_samples.indices[nodes_to_samples.indptr[node_id] : nodes_to_samples.indptr[node_id + 1]]
1697
1698
1699 class M5Prime(M5Base, RegressorMixin):
1700 """An M5' (M five prime) model tree regressor.
1701
1702 The original M5 algorithm was invented by R. Quinlan [1]_. Y. Wang [2]_ made
1703 improvements and named the resulting algorithm M5 Prime.
1704 This implementation was inspired by Weka (https://github.com/bnjmn/weka)
1705 M5Prime class, from Mark Hall.
1706
1707 See also
1708 --------
1709 M5Base
1710
1711 References
1712 ----------
1713 .. [1] Ross J. Quinlan, "Learning with Continuous Classes", 5th Australian
1714 Joint Conference on Artificial Intelligence, pp343-348, 1992.
1715 .. [2] Y. Wang and I. H. Witten, "Induction of model trees for predicting
1716 continuous classes", Poster papers of the 9th European Conference
1717 on Machine Learning, 1997.
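
Examples
--------
A minimal usage sketch on synthetic data (this assumes the package exposes
M5Prime at the top level; the data and variable names are illustrative only):

>>> import numpy as np
>>> from m5py import M5Prime
>>> rng = np.random.RandomState(0)
>>> X = rng.rand(100, 3)
>>> y = 2 * X[:, 0] + X[:, 1]
>>> reg = M5Prime().fit(X, y)  # doctest: +SKIP
>>> y_hat = reg.predict(X)  # doctest: +SKIP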
1718 """
1719
1720
1721 def _empty_contig_ar(shape, dtype):
1722 """Return an empty contiguous array with given shape and dtype."""
1723 return np.ascontiguousarray(np.empty(shape, dtype=dtype))