Coverage for src/m5py/main.py: 69%
534 statements
coverage.py v7.4.0, created at 2024-01-22 17:10 +0000
1 from collections import namedtuple
2 from copy import copy
3 from logging import getLogger
4 from warnings import warn
6 import numpy as np
8 from scipy.sparse import issparse
10 from sklearn import clone
11 from sklearn.base import RegressorMixin, is_classifier
12 from sklearn.linear_model import LinearRegression
13 from sklearn.metrics import mean_squared_error
14 from sklearn.preprocessing import StandardScaler
15 from sklearn.tree import BaseDecisionTree, _tree
16 from sklearn.tree._classes import DTYPE
17 from sklearn.tree._tree import DOUBLE
18 from sklearn.utils import check_array
19 from sklearn.utils.validation import check_is_fitted
20 from sklearn import __version__ as sklearn_version
22 from m5py.linreg_utils import linreg_model_to_text, DeNormalizableMixIn, DeNormalizableLinearRegression
24 from packaging.version import Version
26 SKLEARN_VERSION = Version(sklearn_version)
27 SKLEARN13_OR_GREATER = SKLEARN_VERSION >= Version("1.3.0")
30 __all__ = ["M5Base", "M5Prime"]
33 _SmoothingDetails = namedtuple("_SmoothingDetails", ("A", "B", "C"))
34 # Internal structure to contain the recursive smoothed coefficients details in the smoothing algorithm
37 logger = getLogger("m5p")
40 class M5Base(BaseDecisionTree):
41 """
42 M5Base. Implements base routines for generating M5 PredictionModel trees and rules.
44 The original algorithm M5 was invented by Quinlan:
46 - Quinlan J. R. (1992). Learning with continuous classes. Proceedings of the
47 Australian Joint Conference on Artificial Intelligence. 343--348. World
48 Scientific, Singapore.
50 Yong Wang and Ian Witten made improvements and created M5':
52 - Wang, Y and Witten, I. H. (1997). Induction of model trees for predicting
53 continuous classes. Proceedings of the poster papers of the European
54 Conference on Machine Learning. University of Economics, Faculty of
55 Informatics and Statistics, Prague.
57 Pruning and Smoothing can be activated and deactivated on top of the base
58 model. TODO check if 'rules' should be supported too
60 Inspired by the Weka (https://github.com/bnjmn/weka) M5Base class by Mark Hall.
62 Attributes
63 ----------
64 criterion : str, default="friedman_mse"
65 M5 suggests using the standard deviation (RMSE impurity) instead of the
66 variance (MSE impurity) or absolute deviation (MAE impurity) used in CART.
67 According to the M5' paper, all three criteria are equivalent.
68 splitter : default="best"
69 M5 suggests taking the feature leading to the best gain in the criterion.
70 max_depth : default None
71 This is not used in the original M5 article, hence default=None.
72 min_samples_split : int, default=4
73 M5 suggests a value of 4 so that each leaf will have at least 2 samples.
74 min_samples_leaf : int, default=2
75 M5' suggests adding this explicitly to avoid zero variance in leaves, and
76 n <= p for constant models.
77 min_weight_fraction_leaf : float, default=0.0
78 This is not used in the original M5 article, hence default=0.0.
79 TODO this would actually maybe be better than min_sample_leaf ?
80 max_features : default None
81 This is not used in the original M5 article, hence default is None.
82 max_leaf_nodes : int, default None
83 This is not used in the original M5 article, hence default is None.
84 min_impurity_decrease : float, default=0.0
85 This is not used in the original M5 article, hence default=0.0.
86 class_weight : default None
87 This is not used (?)
88 leaf_model : RegressorMixin
89 The regression model used in the leaves. This instance will be cloned
90 for each leaf.
91 use_pruning : bool
92 If False, pruning will be disabled.
93 use_smoothing : bool or str {'installed', 'on_prediction'}, default None
94 None and True mean 'installed' by default, except if smoothing_constant
95 or smoothing_constant_ratio is 0.0.
96 smoothing_constant: float, default None
97 The smoothing constant k defined in the original M5 article, used as the
98 weight for each parent model in the recursive weighting process.
99 If None, the default value from the paper (k=15) will be used.
100 smoothing_constant_ratio: float, default None
101 An alternate way to define the smoothing constant, as a ratio of the
102 total number of training samples. The resulting smoothing constant will
103 be smoothing_constant_ratio * n where n is the number of samples.
104 Note that this may not be an integer.
105 debug_prints: bool, default False
106 A boolean to enable debug prints
107 ccp_alpha: float, default 0.0
108 TODO is this relevant ? does that conflict with "use_pruning" ?
109 random_state : None, int, or RandomState, default=None
110 See `RegressionTree.random_state`.
111 """
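# Illustrative usage sketch (a minimal example, assuming the concrete M5Prime
# estimator exported by this module is importable from the m5py package):
# >>> import numpy as np
# >>> from m5py import M5Prime
# >>> rng = np.random.RandomState(0)
# >>> X = rng.rand(200, 3)
# >>> y = 2.0 * X[:, 0] + X[:, 1] + rng.normal(scale=0.05, size=200)
# >>> model = M5Prime(use_smoothing="installed").fit(X, y)
# >>> y_pred = model.predict(X)
# >>> print(model.as_pretty_text())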
113 def __init__(
114 self,
115 criterion="friedman_mse",
116 splitter="best",
117 max_depth=None,
118 min_samples_split=4,
119 min_samples_leaf=2,
120 min_weight_fraction_leaf=0.0,
121 max_features=None,
122 max_leaf_nodes=None,
123 min_impurity_decrease=0.0,
124 class_weight=None,
125 leaf_model=None,
126 use_pruning=True,
127 use_smoothing=None,
128 smoothing_constant=None,
129 smoothing_constant_ratio=None,
130 debug_prints=False,
131 ccp_alpha=0.0,
132 random_state=None,
133 ):
135 # TODO the paper suggests to do this with 5% of total std
136 # min_impurity_split = min_impurity_split_as_initial_ratio * initial_impurity
138 super(M5Base, self).__init__(
139 criterion=criterion,
140 splitter=splitter,
141 max_depth=max_depth,
142 min_samples_split=min_samples_split,
143 min_samples_leaf=min_samples_leaf,
144 min_weight_fraction_leaf=min_weight_fraction_leaf,
145 max_features=max_features,
146 max_leaf_nodes=max_leaf_nodes,
147 min_impurity_decrease=min_impurity_decrease,
148 random_state=random_state,
149 class_weight=class_weight,
150 ccp_alpha=ccp_alpha,
151 )
153 # warning : if the field names are different from constructor params,
154 # then clone(self) will not work.
155 if leaf_model is None:
156 # to handle case when the model is learnt on normalized data and
157 # we wish to be able to read the model equations.
158 leaf_model = DeNormalizableLinearRegression()
159 self.leaf_model = leaf_model
161 # smoothing related
162 if smoothing_constant_ratio is not None and smoothing_constant is not None:
163 raise ValueError("Only one of `smoothing_constant` and `smoothing_constant_ratio` should be provided")
164 elif (smoothing_constant_ratio == 0.0 or smoothing_constant == 0) and (
165 use_smoothing is True or use_smoothing == "installed"
166 ):
167 raise ValueError(
168 "`use_smoothing` was explicitly enabled with "
169 "pre-installed models, while smoothing "
170 "constant/ratio are explicitly set to zero"
171 )
173 self.use_pruning = use_pruning
174 self.use_smoothing = use_smoothing
175 self.smoothing_constant = smoothing_constant
176 self.smoothing_constant_ratio = smoothing_constant_ratio
178 self.debug_prints = debug_prints
180 def as_pretty_text(self, **kwargs):
181 """
182 Returns a multi-line representation of this decision tree, using
183 `export_text_m5`.
185 :return: a multi-line string representing this decision tree
186 """
187 from m5py.export import export_text_m5
188 return export_text_m5(self, out_file=None, **kwargs)
190 def fit(self, X, y: np.ndarray, sample_weight=None, check_input=True, X_idx_sorted="deprecated"):
191 """Fit a M5Prime model.
193 Parameters
194 ----------
195 X : numpy.ndarray
196 y : numpy.ndarray
197 sample_weight
198 check_input
199 X_idx_sorted
201 Returns
202 -------
self : this estimator, fitted.
204 """
205 # (0) smoothing default values behaviour
206 if self.smoothing_constant_ratio == 0.0 or self.smoothing_constant == 0:
207 self.use_smoothing = False
208 elif self.use_smoothing is None or self.use_smoothing is True:
209 # default behaviour
210 if isinstance(self.leaf_model, LinearRegression):
211 self.use_smoothing = "installed"
212 else:
213 self.use_smoothing = "on_prediction"
215 # Finally make sure we are ok now
216 if self.use_smoothing not in [False, np.bool_(False), "installed", "on_prediction"]:
217 raise ValueError("use_smoothing: Unexpected value: %s, please report it as issue." % self.use_smoothing)
220 # (1) Build the initial tree as usual
222 # Get the correct fit method name based on the sklearn version used
223 fit_method_name = "_fit" if SKLEARN13_OR_GREATER else "fit"
225 fit_method = getattr(super(M5Base, self), fit_method_name)
226 fit_method(X, y, sample_weight=sample_weight, check_input=check_input)
229 if self.debug_prints:
230 logger.debug("(debug_prints) Initial tree:")
231 logger.debug(self.as_pretty_text(node_ids=True))
233 # (1b) prune initial tree to take into account min impurity in splits
234 prune_on_min_impurity(self.tree_)
236 if self.debug_prints:
237 logger.debug("(debug_prints) Postprocessed tree:")
238 logger.debug(self.as_pretty_text(node_ids=True))
240 # (2) Now prune tree and replace pruned branches with linear models
241 # -- unfortunately we have to re-do this input validation
242 # step, because it also converts the input to float32.
243 if check_input:
244 X = check_array(X, dtype=DTYPE, accept_sparse="csc")
245 if issparse(X):
246 X.sort_indices()
248 if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
249 raise ValueError("No support for np.int64 index based " "sparse matrices")
251 # -- initialise the structure to contain the leaves and node models
252 self.node_models = np.empty((self.tree_.node_count,), dtype=object)
254 # -- Pruning requires to know the global deviation of the target
255 global_std_dev = np.nanstd(y)
256 # global_abs_dev = np.nanmean(np.abs(y))
258 # -- Pruning requires to know the samples that reached each node.
259 # From http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
260 # Retrieve the decision path of each sample.
261 samples_to_nodes = self.decision_path(X)
262 # * row i is to see the nodes (non-empty j) in which sample i appears.
263 # sparse row-first (CSR) format is OK
264 # * column j is to see the samples (non-empty i) that fall into that
265 # node. To do that, we need to make it CSC
266 nodes_to_samples = samples_to_nodes.tocsc()
268 # -- execute the pruning
269 # self.features_usage is a dict feature_idx -> nb times used, only
270 # for used features.
271 self.features_usage = build_models_and_get_pruning_info(
272 self.tree_,
273 X,
274 y,
275 nodes_to_samples,
276 self.leaf_model,
277 self.node_models,
278 global_std_dev,
279 use_pruning=self.use_pruning,
280 )
282 # -- cleanup to compress inner structures: only keep non-pruned ones
283 self._cleanup_tree()
285 if self.debug_prints:
286 logger.debug("(debug_prints) Pruned tree:")
287 logger.debug(self.as_pretty_text(node_ids=True))
289 if self.use_smoothing == "installed":
290 # Retrieve the NEW decision path of each sample.
291 samples_to_nodes = self.decision_path(X)
292 # * row i is to see the nodes (non-empty j) in which sample i
293 # appears. sparse row-first (CSR) format is OK
294 # * column j is to see the samples (non-empty i) that fall into
295 # that node. To do that, we need to make it CSC
296 nodes_to_samples = samples_to_nodes.tocsc()
298 # default behaviour for smoothing constant and ratio
299 smoothing_constant = self._get_smoothing_constant_to_use(X)
301 self.install_smoothing(X, y, nodes_to_samples, smoothing_constant=smoothing_constant)
303 if self.debug_prints:
304 logger.debug("(debug_prints) Pruned and smoothed tree:")
305 logger.debug(self.as_pretty_text(node_ids=True))
307 return self
309 def _get_smoothing_constant_to_use(self, X):
310 """
311 Returns the smoothing_constant to use for smoothing, based on current
312 settings and X data
313 """
314 if self.smoothing_constant_ratio is not None:
315 nb_training_samples = X.shape[0]
316 smoothing_cstt = self.smoothing_constant_ratio * nb_training_samples
317 if smoothing_cstt < 15:
318 warn(
319 "smoothing constant ratio %s is leading to an extremely "
320 "small smoothing constant %s because nb training samples"
321 " is %s. Clipping to 15." % (self.smoothing_constant_ratio, smoothing_cstt, nb_training_samples)
322 )
323 smoothing_cstt = 15
324 else:
325 smoothing_cstt = self.smoothing_constant
326 if smoothing_cstt is None:
327 smoothing_cstt = 15 # default from the original Quinlan paper
329 return smoothing_cstt
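# Worked example (illustrative): with smoothing_constant_ratio=0.1 and 300 training
# samples, the smoothing constant is 0.1 * 300 = 30. With only 60 samples the raw
# value would be 6, which is clipped to 15 (with a warning). When no ratio is given,
# self.smoothing_constant is used directly, falling back to k=15 from the paper when None.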
331 def _cleanup_tree(self):
332 """
333 Reduces the size of this object by removing from internal structures
334 all items that are not used any more (all leaves that have been pruned)
335 """
336 old_tree = self.tree_
337 old_node_modls = self.node_models
339 # Get all information to create a copy of the inner tree.
340 # Note: _tree.copy() is gone so we use the pickle way
341 # --- Base info: nb features, nb outputs, output classes
342 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
343 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
344 # So these remain, we are just interested in changing the node-related arrays
345 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
347 # --- Node info
348 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
349 dct = old_tree.__getstate__().copy()
351 # cleanup: only keep the nodes that are not undefined.
352 # note: this is identical to
353 # valid_nodes_indices = dct["nodes"]['left_child'] != TREE_UNDEFINED
354 valid_nodes_indices = old_tree.children_left != _tree.TREE_UNDEFINED
355 # valid_nodes_indices2 = old_tree.children_right != TREE_UNDEFINED
356 # assert np.all(valid_nodes_indices == valid_nodes_indices2)
357 new_node_count = sum(valid_nodes_indices)
359 # create empty new structures
360 n_shape = (new_node_count, *dct["nodes"].shape[1:])
361 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
362 v_shape = (new_node_count, *dct["values"].shape[1:])
363 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype)
364 m_shape = (new_node_count, *old_node_modls.shape[1:])
365 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype)
367 # Fill structures while reindexing the tree and remembering the depth
368 global next_free_id
369 next_free_id = 0
371 def _compress(old_node_id):
372 """
374 Parameters
375 ----------
376 old_node_id
378 Returns
379 -------
380 the depth and new indices of left and right children
382 """
383 global next_free_id
384 new_node_id = next_free_id
385 next_free_id += 1
387 # use the old tree to walk
388 old_node = dct["nodes"][old_node_id]
389 left_id = old_node["left_child"]
390 right_id = old_node["right_child"]
392 # Create the new node with a copy of the old
393 new_nodes[new_node_id] = old_node # this is an entire row so it is probably copied already by doing so.
394 new_values[new_node_id] = dct["values"][old_node_id]
395 new_node_models[new_node_id] = copy(old_node_modls[old_node_id])
397 if left_id == _tree.TREE_LEAF:
398 # ... and right_id == _tree.TREE_LEAF
399 # => node_id is a leaf. Nothing to do
400 return 1, new_node_id
401 else:
403 left_depth, new_id_left = _compress(left_id)
404 right_depth, new_id_right = _compress(right_id)
406 # store the new indices
407 new_nodes[new_node_id]["left_child"] = new_id_left
408 new_nodes[new_node_id]["right_child"] = new_id_right
410 return 1 + max(left_depth, right_depth), new_node_id
412 # complete definition of the new tree
413 dct["max_depth"] = _compress(0)[0] - 1 # root node has depth 0, not 1
414 dct["node_count"] = new_node_count # new_nodes.shape[0]
415 dct["nodes"] = new_nodes
416 dct["values"] = new_values
417 new_tree.__setstate__(dct)
419 # Fix an issue on sklearn 0.17.1: setstate was not updating max_depth
420 # See https://github.com/scikit-learn/scikit-learn/blob/0.17.1/sklearn/tree/_tree.pyx#L623
421 new_tree.max_depth = dct["max_depth"]
423 # update self fields
424 self.tree_ = new_tree
425 self.node_models = new_node_models
427 def install_smoothing(self, X_train_all, y_train_all, nodes_to_samples, smoothing_constant):
428 """
429 Executes the smoothing procedure described in the M5 and M5P paper,
430 "once for all". This means that all models are modified so that after
431 this method has completed, each model in the tree is already a smoothed
432 model.
434 This has pros (prediction speed) and cons (the model equations are
435 harder to read - lots of redundancy)
437 It can only be done if leaf models are instances of `LinearRegression`
438 """
439 # check validity: leaf models have to support pre-computed smoothing
440 if not isinstance(self.leaf_model, LinearRegression):
441 raise TypeError(
442 "`install_smoothing` is only available if leaf "
443 "models are instances of `LinearRegression` or a "
444 "subclass"
445 )
447 # Select the Error metric to compute model errors (used to compare
448 # node with subtree for pruning)
449 # TODO in Weka they use RMSE, but in papers they use MAE. this should be a parameter.
450 # TODO Shall we go further and store the residuals, or a bunch of metrics? not sure
451 err_metric = root_mean_squared_error # mean_absolute_error
453 # --- Node info
454 # TODO we should probably not do this once in each method, but once or give access directly (no copy)
455 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
456 dct = self.tree_.__getstate__().copy()
457 old_node_models = self.node_models
459 m_shape = old_node_models.shape
460 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype)
462 def smooth__(
463 coefs_,
464 n_samples,
465 features=None,
466 parent=None, # type: _SmoothingDetails
467 parent_features=None,
468 k=smoothing_constant, #
469 ):
470 # type: (...) -> _SmoothingDetails
471 """
472 Smoothes the model coefficients or intercept `coefs_` (an array or
473 a scalar), using the smoothing results at parent node.
475 At each node we keep in memory three results A, B, C.
476 - the new coef at each node is A + B
477 - the recursion equations are
478 - B(n) = (k / (n_samples(n) + k)) * A(n-1) + B(n-1)
479 - C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1)
480 - A(n) = coef(n)*C(n)
481 """
482 if parent is None:
483 # A0 = coef(0), C0 = 1, B0 = 0
484 if features is None:
485 # single scalar value
486 return _SmoothingDetails(A=coefs_, B=0, C=1)
487 else:
488 # vector
489 if len(coefs_) == 0:
490 coefs_ = np.asarray(coefs_)
491 if coefs_.shape[0] != len(features):
492 raise ValueError("nb features does not match the nb " "of coefficients")
493 return _SmoothingDetails(
494 A=coefs_,
495 B=np.zeros(coefs_.shape, dtype=coefs_.dtype),
496 C=np.ones(coefs_.shape, dtype=coefs_.dtype),
497 )
498 else:
499 # B(n) = k/(n_samples(n)+k)*A(n-1) + B(n-1)
500 # C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1)
501 Bn = (k / (n_samples + k)) * parent.A + parent.B
502 Cn = (n_samples / (n_samples + k)) * parent.C
504 # A(n) = coef(n) * C(n)
505 if features is None:
506 # single scalar value: easy
507 An = coefs_ * Cn
508 return _SmoothingDetails(A=An, B=Bn, C=Cn)
509 else:
510 # vector of coefs: we have to 'expand' the coefs array
511 # because the coefs at this node apply for (features)
512 # while coefs at parent node apply for (parent_features)
513 An = np.zeros(Cn.shape, dtype=Cn.dtype)
514 parent_features = np.array(parent_features)
515 features = np.array(features)
517 # Thanks https://stackoverflow.com/a/8251757/7262247 !
518 index = np.argsort(parent_features)
519 sorted_parents = parent_features[index]
520 sorted_index = np.searchsorted(sorted_parents, features)
522 features_index = np.take(index, sorted_index, mode="clip")
523 if np.any(parent_features[features_index] != features):
524 # result = np.ma.array(features_index, mask=mask)
525 raise ValueError(
526 "Internal error - please report this."
527 "One feature was found in the child "
528 "node, that was not in the parent "
529 "node."
530 )
532 if len(features_index) > 0:
533 An[features_index] = coefs_ * Cn[features_index]
535 return _SmoothingDetails(A=An, B=Bn, C=Cn)
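# Worked example (illustrative, single coefficient): at the root, A0 = coef_root,
# B0 = 0, C0 = 1. For its direct child reached by n samples:
#   B1 = k/(n+k) * A0,   C1 = n/(n+k),   A1 = coef_child * C1
# so the smoothed coefficient A1 + B1 = (n*coef_child + k*coef_root) / (n + k),
# i.e. the weighted average of the M5 smoothing formula. For instance with
# coef_root=2.0, coef_child=5.0, n=30 and k=15: (30*5.0 + 15*2.0) / 45 = 4.0.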
537 def _smooth(
538 node_id,
539 parent_features=None,
540 parent_coefs: _SmoothingDetails = None,
541 parent_intercept: _SmoothingDetails = None,
542 ):
543 # Gather all info on this node
544 # --base regression tree
545 node_info = dct["nodes"][node_id]
546 left_id = node_info["left_child"]
547 right_id = node_info["right_child"]
548 # --additional model
549 node_model = old_node_models[node_id]
550 # --samples
551 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
552 n_samples_at_this_node = samples_at_this_node.shape[0]
554 # Note: should be equal to tree.n_node_samples[node_id]
555 if n_samples_at_this_node != self.tree_.n_node_samples[node_id]:
556 raise ValueError("n_samples_at_this_node: Unexpected value, please report it as issue.")
558 y_true_this_node = y_train_all[samples_at_this_node]
559 X_this_node = X_train_all[samples_at_this_node, :]
561 # (1) smooth current node
562 parent_features = parent_features if parent_features is not None else None
563 is_constant_leaf = False
564 if left_id == _tree.TREE_LEAF and isinstance(node_model, ConstantLeafModel):
565 is_constant_leaf = True
566 node_features = ()
567 smoothed_features = parent_features if parent_features is not None else node_features
568 node_coefs = ()
569 node_intercept = dct["values"][node_id]
571 # Extract the unique scalar value
572 if len(node_intercept) != 1:
573 raise ValueError("node_intercept: Unexpected value: %s, please report it as issue." % node_intercept)
575 node_intercept = node_intercept.item()
577 else:
578 # A leaf LinRegLeafModel or a split SplitNodeModel
579 node_features = node_model.features
580 smoothed_features = parent_features if parent_features is not None else node_features
581 node_coefs = node_model.model.coef_
582 node_intercept = node_model.model.intercept_
584 # Create a new linear regression model with smoothed coefficients
585 smoothed_sklearn_model = clone(self.leaf_model)
586 smoothed_coefs = smooth__(
587 node_coefs,
588 features=node_features,
589 n_samples=n_samples_at_this_node,
590 parent=parent_coefs,
591 parent_features=parent_features,
592 )
593 smoothed_intercept = smooth__(node_intercept, n_samples=n_samples_at_this_node, parent=parent_intercept)
594 smoothed_sklearn_model.coef_ = smoothed_coefs.A + smoothed_coefs.B
595 smoothed_sklearn_model.intercept_ = smoothed_intercept.A + smoothed_intercept.B
597 # Finally update the node
598 if is_constant_leaf:
599 smoothed_node_model = LinRegLeafModel(smoothed_features, smoothed_sklearn_model, None)
600 else:
601 smoothed_node_model = copy(node_model)
602 smoothed_node_model.features = smoothed_features
603 smoothed_node_model.model = smoothed_sklearn_model
605 # Remember the new smoothed model
606 new_node_models[node_id] = smoothed_node_model
608 if left_id == _tree.TREE_LEAF:
609 # If this is a leaf, update the prediction error on X
610 y_pred_this_node = smoothed_node_model.predict(X_this_node)
611 smoothed_node_model.error = err_metric(y_true_this_node, y_pred_this_node)
613 else:
614 # If this is a split node - recurse on each subtree
615 _smooth(
616 left_id,
617 parent_features=smoothed_features,
618 parent_coefs=smoothed_coefs,
619 parent_intercept=smoothed_intercept,
620 )
621 _smooth(
622 right_id,
623 parent_features=smoothed_features,
624 parent_coefs=smoothed_coefs,
625 parent_intercept=smoothed_intercept,
626 )
628 # Update the error using the same formula than the one we used
629 # in build_models_and_get_pruning_info
630 y_pred_children = predict_from_leaves_no_smoothing(self.tree_, new_node_models, X_this_node)
631 err_children = err_metric(y_true_this_node, y_pred_children)
633 # the total number of parameters is the sum of params in each
634 # branch PLUS 1 for the split
635 n_params_splitmodel = new_node_models[left_id].n_params + new_node_models[right_id].n_params + 1
636 smoothed_node_model.n_params = n_params_splitmodel
637 # do not adjust the error now, simply store the raw one
638 # smoothed_node_model.error = compute_adjusted_error(
639 # err_children, n_samples_at_this_node, n_params_splitmodel)
640 smoothed_node_model.error = err_children
642 return
644 # smooth the whole tree
645 _smooth(0)
647 # use the new node models now
648 self.node_models = new_node_models
650 # remember the smoothing constant installed
651 self.installed_smoothing_constant = smoothing_constant
652 return
654 def denormalize(self, x_scaler, y_scaler):
655 """
656 De-normalizes this model according to the provided x and y normalization scalers.
657 Currently only StandardScaler is supported.
659 :param x_scaler: a StandardScaler or None
660 :param y_scaler: a StandardScaler or None
661 :return:
662 """
663 # perform denormalization
664 self._denormalize_tree(x_scaler, y_scaler)
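# Illustrative sketch (assumes the model was trained on StandardScaler-normalized data,
# and that M5Prime is importable from the m5py package):
# >>> from sklearn.preprocessing import StandardScaler
# >>> x_scaler = StandardScaler().fit(X)
# >>> y_scaler = StandardScaler().fit(y.reshape(-1, 1))
# >>> Xs = x_scaler.transform(X)
# >>> ys = y_scaler.transform(y.reshape(-1, 1)).ravel()
# >>> model = M5Prime().fit(Xs, ys)
# >>> model.denormalize(x_scaler, y_scaler)  # model equations now in original units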
666 def _denormalize_tree(self, x_scaler, y_scaler):
667 """
668 De-normalizes all models in the tree
669 :return:
670 """
671 old_tree = self.tree_
672 old_node_models = self.node_models
674 # Get all information to create a copy of the inner tree.
675 # Note: _tree.copy() is gone so we use the pickle way
676 # --- Base info: nb features, nb outputs, output classes
677 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
678 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
679 # So these remain, we are just interested in changing the node-related arrays
680 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
682 # --- Node info
683 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
684 dct = old_tree.__getstate__().copy()
686 # create empty new structures for nodes and models
687 n_shape = dct["nodes"].shape
688 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
689 v_shape = dct["values"].shape
690 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype)
691 m_shape = old_node_models.shape
692 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype)
694 def _denormalize(node_id, x_scaler, y_scaler):
695 """
696 denormalizes the subtree below node `node_id`.
698 - `new_nodes[node_id]` is filled with a copy of the old node
699 (see old_node.dtype to see the various fields). If the node is
700 a split node, the split threshold
701 `new_nodes[node_id]['threshold']` is denormalized.
703 - `new_values[node_id]` is filled with a denormalized copy of the
704 old constant prediction at this node. Reminder: this constant
705 prediction is actually used only when on the leaves now, but
706 we keep it for ref.
708 - `new_node_models[node_id]` is filled with a copy of the old
709 model `old_node_models[node_id]`, that is denormalized if it
710 is not a constant leaf
712 :param node_id:
713 :return: (nothing)
714 """
715 # use the old tree to walk
716 old_node = dct["nodes"][node_id]
717 left_id = old_node['left_child']
718 right_id = old_node['right_child']
720 # Create the new node with a copy of the old
721 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so.
722 new_model = copy(old_node_models[node_id])
723 new_node_models[node_id] = new_model
725 # Create the new value by de-scaling y
726 if y_scaler is not None:
727 # Note: if this is a split node with a linear regression model
728 # the value will never be used. However to preserve consistency
729 # of the whole values structure and for debugging purposes, we
730 # choose this safe path of denormalizing ALL.
731 # TODO we could also do it at once outside of the recursive
732 # calls, but we should check for side effects
733 new_values[node_id] = y_scaler.inverse_transform(
734 dct["values"][node_id]
735 )
736 else:
737 # no denormalization: simply copy
738 new_values[node_id] = dct["values"][node_id]
740 if left_id == _tree.TREE_LEAF:
741 # ... and right_id == _tree.TREE_LEAF
742 # => node_id is a leaf
743 if isinstance(new_model, ConstantLeafModel):
744 # nothing to do: we already re-scaled the value
745 return
746 elif isinstance(new_model, LinRegLeafModel):
747 # denormalize model
748 new_model.denormalize(x_scaler, y_scaler)
749 return
750 else:
751 raise TypeError("Internal error - Leaves can only be "
752 "constant or linear regression")
753 else:
754 # this is a split node, denormalize each subtree
755 _denormalize(left_id, x_scaler, y_scaler)
756 _denormalize(right_id, x_scaler, y_scaler)
758 # denormalize the split value if needed
759 if x_scaler is not None:
760 split_feature = old_node['feature']
761 # The denormalizer requires a vector with all the features,
762 # even if we only want to denormalize one.
763 # -- put split value in a vector where it has pos 'feature'
764 old_threshold_and_zeros = np.zeros((self.n_features_, ), dtype=dct["nodes"]['threshold'].dtype)
765 old_threshold_and_zeros[split_feature] = old_node['threshold']
766 # -- denormalize the vector and retrieve value 'feature'
767 new_nodes[node_id]['threshold'] = x_scaler.inverse_transform(old_threshold_and_zeros)[split_feature]
768 else:
769 # no denormalization: simply copy
770 new_nodes[node_id]['threshold'] = old_node['threshold']
772 if isinstance(new_model, SplitNodeModel):
773 # denormalize model at split node too, even if it is not
774 # always used (depending on smoothing mode)
775 new_model.denormalize(x_scaler, y_scaler)
776 else:
777 raise TypeError("Internal error: all intermediate nodes "
778 "should be SplitNodeModel")
780 return
782 # denormalize the whole tree and put the result in (new_nodes,
783 # new_values, new_node_models) recursively
784 _denormalize(0, x_scaler, y_scaler)
786 # complete definition of the new tree
787 dct["nodes"] = new_nodes
788 dct["values"] = new_values
789 new_tree.__setstate__(dct)
791 # update the self fields
792 # self.features_usage
793 self.tree_ = new_tree
794 self.node_models = new_node_models
796 def compress_features(self):
797 """
798 Compresses the model and returns a vector of required feature indices.
799 This model input will then be X[:, features] instead of X.
800 """
801 used_features = sorted(self.features_usage.keys())
802 new_features_lookup_dct = {old_feature_idx: i for i, old_feature_idx in enumerate(used_features)}
804 if used_features == list(range(self.n_features_in_)):
805 # NOTHING TO DO: we need all features
806 return used_features
808 # Otherwise, We can compress. For this we have to create a copy of the
809 # tree because we will change its internals
810 old_tree = self.tree_
811 old_node_modls = self.node_models
813 # Get all information to create a copy of the inner tree.
814 # Note: _tree.copy() is gone so we use the pickle way
815 # --- Base info: nb features, nb outputs, output classes
816 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631
817 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs)
818 # So these remain, we are just interested in changing the node-related arrays
819 new_tree = _tree.Tree(*old_tree.__reduce__()[1])
821 # --- Node info
822 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637
823 dct = old_tree.__getstate__().copy()
825 # create empty new structures for nodes and models
826 n_shape = dct["nodes"].shape
827 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype)
828 m_shape = old_node_modls.shape
829 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype)
831 def _compress_features(node_id):
832 """
834 Parameters
835 ----------
836 node_id
838 Returns
839 -------
840 the depth and new indices of left and right children
842 """
843 # use the old tree to walk
844 old_node = dct["nodes"][node_id]
845 left_id = old_node["left_child"]
846 right_id = old_node["right_child"]
848 # Create the new node with a copy of the old
849 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so.
850 new_model = copy(old_node_modls[node_id])
851 new_node_models[node_id] = new_model
853 if left_id == _tree.TREE_LEAF:
854 # ... and right_id == _tree.TREE_LEAF
855 # => node_id is a leaf
856 if isinstance(new_model, ConstantLeafModel):
857 # no features used
858 return
859 elif isinstance(new_model, LinRegLeafModel):
860 # compress that model
861 new_model.reindex_features(new_features_lookup_dct)
862 return
863 else:
864 raise TypeError("Internal error - Leaves can only be " "constant or linear regression")
865 else:
867 _compress_features(left_id)
868 _compress_features(right_id)
870 # store the new split feature index in the node
871 new_nodes[node_id]["feature"] = new_features_lookup_dct[old_node["feature"]]
873 if not isinstance(new_model, SplitNodeModel):
874 raise TypeError("Internal error: all intermediate nodes " "should be SplitNodeModel")
876 # TODO now that split node models have a linear regression
877 # model too, we should update them here
879 return
881 _compress_features(0)
883 # complete definition of the new tree
884 dct["nodes"] = new_nodes
885 new_tree.__setstate__(dct)
887 # update the self fields
888 self.features_usage = {new_features_lookup_dct[k]: v for k, v in self.features_usage.items()}
889 self.tree_ = new_tree
890 self.node_models = new_node_models
891 self.n_features_in_ = len(self.features_usage)
893 # return the vector of used features
894 return used_features
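# Illustrative sketch: after compression, the model expects only the kept columns.
# >>> kept = model.compress_features()
# >>> y_pred = model.predict(X[:, kept])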
896 @property
897 def feature_importances_(self):
898 """Return the feature importances (the higher, the more important).
900 Returns
901 -------
902 feature_importances_ : array, shape = [n_features]
903 """
904 check_is_fitted(self, "tree_")
906 # TODO adapt ?
907 # features = np.array([self.features_usage[k] for k in sorted(self.features_usage.keys())], dtype=int)
908 features = self.tree_.compute_feature_importances()
910 return features
912 def predict(self, X, check_input=True, smooth_predictions=None, smoothing_constant=None):
913 """Predict class or regression value for X.
915 For a classification model, the predicted class for each sample in X is
916 returned. For a regression model, the predicted value based on X is
917 returned.
919 Parameters
920 ----------
921 X : numpy.ndarray, shape (n_samples, n_features)
922 The input samples. Internally, it will be converted to
923 ``dtype=np.float32`` and if a sparse matrix is provided
924 to a sparse ``csr_matrix``.
925 check_input : boolean, (default=True)
926 Allows bypassing several input checks.
927 Don't use this parameter unless you know what you are doing.
928 smooth_predictions : boolean, (default=None)
929 None means "use self config"
930 smoothing_constant: int, (default=self.smoothing_constant)
931 Smoothing constant used when smooth_predictions is True. During
932 smoothing, child node models are recursively mixed with their
933 parent node models, and the mix is done using a weighted sum. The
934 weight given to the child model is the number of training samples
935 that reached its node, while the weight given to the parent model
936 is the smoothing constant. Therefore it can be seen as an
937 equivalent number of samples that parent models represent when
938 injected into the recursive weighted sum.
940 Returns
941 -------
942 y : array of shape = [n_samples] or [n_samples, n_outputs]
943 The predicted classes, or the predicted values.
944 """
945 check_is_fitted(self, "tree_")
947 # If this is just a constant node only check the input's shape.
948 if self.n_features_in_ == 0:
949 # perform the input checking manually to set ensure_min_features=0
950 X = check_array(X, dtype=DTYPE, accept_sparse="csr", ensure_min_features=0)
951 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
952 raise ValueError("No support for np.int64 index based " "sparse matrices")
953 # skip it then
954 check_input = False
956 # validate and convert dtype
957 X = self._validate_X_predict(X, check_input)
959 # -------- This is the only change wrt parent class. TODO maybe rather replace self.tree_ with a proxy ---------
960 if smooth_predictions is None:
961 # Default: smooth prediction at prediction time if configured as
962 # such in the model. Note: if self.use_smoothing == 'installed',
963 # models are already smoothed models, so no need to smooth again
964 smooth_predictions = self.use_smoothing == "on_prediction"
965 else:
966 # user provided an explicit value for smooth_predictions
967 if not smooth_predictions and self.use_smoothing == "installed":
968 raise ValueError(
969 "Smoothing has been pre-installed on this "
970 "tree, it is not anymore possible to make "
971 "predictions without smoothing"
972 )
974 if smooth_predictions and smoothing_constant is None:
975 # default parameter for smoothing is the one defined in the model
976 # (with possible ratio)
977 smoothing_constant = self._get_smoothing_constant_to_use(X)
979 # Do not use the embedded tree (like in super): it has been pruned but
980 # still has constant nodes
981 # proba = self.tree_.predict(X)
982 if not smooth_predictions:
983 proba = predict_from_leaves_no_smoothing(self.tree_, self.node_models, X)
984 else:
985 proba = predict_from_leaves(self, X, smoothing=True, smoothing_constant=smoothing_constant)
986 if len(proba.shape) < 2:
987 proba = proba.reshape(-1, 1)
988 # ------------------------------------------
990 n_samples = X.shape[0]
992 # Classification
993 if is_classifier(self):
994 if self.n_outputs_ == 1:
995 return self.classes_.take(np.argmax(proba, axis=1), axis=0)
996 else:
997 predictions = np.zeros((n_samples, self.n_outputs_))
998 for k in range(self.n_outputs_):
999 predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)
1000 return predictions
1002 # Regression
1003 else:
1004 if self.n_outputs_ == 1:
1005 return proba[:, 0]
1006 else:
1007 return proba[:, :, 0]
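# Illustrative sketch (assuming `model` was fitted with use_smoothing=False, so that
# smoothing can still be applied at prediction time):
# >>> y_raw = model.predict(X)
# >>> y_smooth = model.predict(X, smooth_predictions=True, smoothing_constant=15)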
1010 class ConstantLeafModel:
1011 """
1012 Represents the additional information about a leaf node that is not pruned.
1013 It contains the error associated with the training samples at this node.
1015 Note: the constant value is not stored here, as it is already available in
1016 the sklearn tree struct. So to know the prediction at this node, use
1017 `tree.value[node_id]`
1018 """
1020 __slots__ = ("error",)
1022 def __init__(self, error):
1023 self.error = error
1025 @property
1026 def n_params(self) -> int:
1027 """
1028 Returns the number of parameters used by this model, including the
1029 constant driver
1030 """
1031 return 1
1033 @staticmethod
1034 def predict_cstt(tree, node_id, n_samples):
1035 """
1036 This static method is a helper to get an array of constant predictions
1037 associated with this node.
1039 Parameters
1040 ----------
1041 tree
1042 node_id
1043 n_samples
1045 Returns
1046 -------
1048 """
1049 cstt_prediction = tree.value[node_id]
1051 # cstt_prediction can be multioutput so it is an array. replicate it
1052 from numpy import matlib # note: np.matlib not available in 1.10.x
1054 return matlib.repmat(cstt_prediction, n_samples, 1)
1057 class LinRegNodeModel(DeNormalizableMixIn):
1058 """
1059 Represents the additional information about a tree node that contains a
1060 linear regression model.
1062 It contains
1063 - the features used by the model,
1064 - the scikit learn model object itself,
1065 - and the error for the training samples that reached this node
1067 """
1068 __slots__ = ("features", "model", "error", "n_params")
1070 def __init__(self, features, model, error) -> None:
1071 self.features = features
1072 self.model = model
1073 self.error = error
1074 self.n_params = len(self.features) + 1
1076 def to_text(self, feature_names=None, target_name=None, precision=3,
1077 line_breaks=False):
1078 """ Returns a text representation of the linear regression model """
1079 return linreg_model_to_text(self.model, feature_names=feature_names,
1080 target_name=target_name,
1081 precision=precision,
1082 line_breaks=line_breaks)
1084 def reindex_features(self, new_features_lookup_dct):
1085 """
1086 Reindexes the required features using the provided lookup dictionary.
1088 Parameters
1089 ----------
1090 new_features_lookup_dct
1092 Returns
1093 -------
1095 """
1096 self.features = [new_features_lookup_dct[f] for f in self.features]
1098 def denormalize(self,
1099 x_scaler: StandardScaler = None,
1100 y_scaler: StandardScaler = None
1101 ):
1102 """
1103 De-normalizes the linear model.
1105 :param x_scaler:
1106 :param y_scaler:
1107 :return:
1108 """
1109 # create a clone of the x scaler with only the used features
1110 x_scaler = copy(x_scaler)
1111 if len(self.features) > 0:
1112 x_scaler.scale_ = x_scaler.scale_[self.features]
1113 x_scaler.mean_ = x_scaler.mean_[self.features]
1114 else:
1115 # in that particular case, the above expression is not working
1116 x_scaler.scale_ = x_scaler.scale_[0:0]
1117 x_scaler.mean_ = x_scaler.mean_[0:0]
1119 # use the denormalize function on the internal model
1120 # TODO what if internal model is not able ?
1121 self.model.denormalize(x_scaler, y_scaler)
1123 def predict(self, X: np.ndarray):
1124 """Performs a prediction for X, only using the required features."""
1125 if len(self.features) < 1 and isinstance(self.model, LinearRegression):
1126 # unfortunately LinearRegression models do not like it when no
1127 # features are needed: their input validation requires at least 1.
1128 # so we do it ourselves.
1129 return self.model.intercept_ * np.ones((X.shape[0], 1))
1130 else:
1131 return self.model.predict(X[:, self.features])
1134 class LinRegLeafModel(LinRegNodeModel):
1135 """
1136 Represents the additional information about a leaf node with a linear
1137 regression model
1138 """
1139 pass
1142 class SplitNodeModel(LinRegNodeModel):
1143 """
1144 Represents the additional information about a split node, with a linear
1145 regression model.
1146 """
1147 __slots__ = ("n_params",)
1149 def __init__(self, n_params, error, features, model):
1150 self.n_params = n_params
1151 super(SplitNodeModel, self).__init__(features=features, model=model, error=error)
1154 PRUNING_MULTIPLIER = 2
1155 # see https://github.com/bnjmn/weka/blob/master/weka/src/main/java/weka/classifiers/trees/m5/RuleNode.java#L124
1156 # TODO check why they use 2 instead of 1 (from the article) ??
1159 def root_mean_squared_error(*args, **kwargs):
1160 return np.sqrt(mean_squared_error(*args, **kwargs))
1163 def prune_on_min_impurity(tree):
1164 """
1165 Edits the given tree so as to prune sub-branches that do not respect the min
1166 impurity criterion.
1168 The paper suggests doing this with 5% of the initial std, but min_impurity_split is not
1169 available in sklearn 0.17 (and the criterion is MSE, not std, so we have to square).
1171 Parameters
1172 ----------
1173 tree
1175 Returns
1176 -------
1178 """
1179 left_children = tree.children_left
1180 right_children = tree.children_right
1181 impurities = tree.impurity
1183 # The paper suggests to do this with 5% but min_impurity_split is not
1184 # available in 0.17 (and criterion is not std but mse so we have to square)
1185 # TODO adapt the formula to criterion used and/or generalize with min_impurity_split_as_initial_ratio
1186 root_impurity = impurities[0]
1187 impurity_threshold = root_impurity * (0.05 ** 2)
1189 def stop_on_min_impurity(node_id):
1190 # note: in the paper that is 0.05 but criterion is on std. Here
1191 # impurity is mse so a squared equivalent of std.
1192 left_id = left_children[node_id]
1193 right_id = right_children[node_id]
1194 if left_id != _tree.TREE_LEAF: # a split node
1195 if impurities[node_id] < impurity_threshold:
1196 # stop here, this will be a leaf
1197 prune_children(node_id, tree)
1198 else:
1199 stop_on_min_impurity(left_id)
1200 stop_on_min_impurity(right_id)
1202 stop_on_min_impurity(0)
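# Worked example (illustrative): if the root impurity (MSE) is 4.0, the threshold is
# 4.0 * 0.05**2 = 0.01. Any split node whose MSE falls below 0.01 (i.e. whose std is
# below 5% of the root std of 2.0) is turned into a leaf by prune_children.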
1205 def build_models_and_get_pruning_info(
1206 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model, node_models, global_std_dev, use_pruning, node_id=0
1207 ):
1208 """
1210 Parameters
1211 ----------
1212 tree : Tree
1213 A tree that will be pruned on the way
1214 X_train_all
1215 y_train_all
1216 nodes_to_samples
1217 leaf_model
1218 node_models
1219 global_std_dev
1220 use_pruning
1221 node_id
1223 Returns
1224 -------
1225 a dictionary where the key is the feature index and the value is
1226 the number of samples where this feature is used
1227 """
1229 # Select the Error metric to compute model errors (used to compare node
1230 # with subtree for pruning)
1231 # TODO in Weka they use RMSE, but in papers they use MAE. could be a param
1232 # TODO Shall we go further and store the residuals, or a bunch of metrics?
1233 err_metric = root_mean_squared_error # mean_absolute_error
1235 # Get the samples associated with this node
1236 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
1237 n_samples_at_this_node = samples_at_this_node.shape[0]
1238 y_true_this_node = y_train_all[samples_at_this_node]
1240 # Is this a leaf node or a split node ?
1241 left_node = tree.children_left[node_id] # this way we do not have to query it again in the else.
1242 if left_node == _tree.TREE_LEAF:
1243 # --> Current node is a LEAF. See is_leaf(node_id, tree) if you have doubts <--
1245 # -- create a linear model for this node
1246 # leaves do not have the right to use any features since they have no
1247 # subtree: keep the constant prediction
1248 y_pred_this_node = ConstantLeafModel.predict_cstt(tree, node_id, y_true_this_node.shape[0])
1249 err_this_node = err_metric(y_true_this_node, y_pred_this_node)
1251 # TODO when use_pruning = False, should we allow all features to be
1252 # used instead of having to stick to the M5 rule of "only use a
1253 # feature if the subtree includes a split with this feature" ?
1254 # OR alternate proposal: should we transform the boolean use_pruning
1255 # into a use_pruning_max_nb integer to say for example "only 2 level
1256 # of pruning" ?
1258 # -- store the model information
1259 node_models[node_id] = ConstantLeafModel(err_this_node)
1261 # -- return an empty dict for features used
1262 return dict() # np.array([], dtype=int)
1264 else:
1265 # --> Current node is a SPLIT <--
1266 right_node = tree.children_right[node_id]
1268 # (1) prune left and right subtree and get some information
1269 features_l = build_models_and_get_pruning_info(
1270 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model,
1271 node_models, global_std_dev, use_pruning, node_id=left_node
1272 )
1273 features_r = build_models_and_get_pruning_info(
1274 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model,
1275 node_models, global_std_dev, use_pruning, node_id=right_node
1276 )
1278 # (2) select only the samples that reach this node
1279 X_this_node = X_train_all[samples_at_this_node, :]
1281 # (3) Create a model for this node
1282 # -- fit a linear regression model taking into account only the
1283 # features used in the subtrees + the split one
1284 # TODO should we normalize=True (variance scaling) or use a whole
1285 # pipeline here ? Not sure..
1286 # estimators = [('scale', StandardScaler()), ('clf', model_type())];
1287 # pipe = Pipeline(estimators)
1288 # skmodel_this_node = leaf_model_type(**leaf_model_params)
1289 skmodel_this_node = clone(leaf_model)
1291 # -- old - we used to only store the array of features
1292 # selected_features = np.union1d(features_l, features_r)
1293 # selected_features = np.union1d(selected_features, tree.feature[node_id])
1294 # -- new - we also gather the nb samples where this feature is used
1295 selected_features_dct = features_l
1296 for feature_id, n_samples_feat_used in features_r.items():
1297 if feature_id in selected_features_dct.keys():
1298 selected_features_dct[feature_id] += n_samples_feat_used
1299 else:
1300 selected_features_dct[feature_id] = n_samples_feat_used
1301 selected_features_dct[tree.feature[node_id]] = n_samples_at_this_node
1302 # -- use only the selected features, in the natural integer order
1303 selected_features = sorted(selected_features_dct.keys())
1305 X_train_this_node = X_this_node[:, selected_features]
1306 skmodel_this_node.fit(X_train_this_node, y_true_this_node)
1307 # -- predict and compute error
1308 y_pred_this_node = skmodel_this_node.predict(X_train_this_node)
1309 err_this_node = err_metric(y_true_this_node, y_pred_this_node)
1310 # -- create the object
1311 # TODO the paper suggest to perform recursive feature elimination in
1312 # this model until adjusted_err_model is minimal, is it same in Weka ?
1313 model_this_node = LinRegLeafModel(selected_features, skmodel_this_node, err_this_node)
1315 # (4) compute adj error criterion = ERR * (n+v)/(n-v) for both models
1316 adjusted_err_model = compute_adjusted_error(err_this_node, n_samples_at_this_node, model_this_node.n_params)
1318 # (5) predict and compute adj error for the combination of child models
1319 # -- Note: this is recursive so the leaves may contain linear models
1320 # already
1321 y_pred_children = predict_from_leaves_no_smoothing(tree, node_models, X_this_node)
1322 err_children = err_metric(y_true_this_node, y_pred_children)
1324 # TODO the Weka implementation (below) differs from the paper, which suggests a
1325 # weighted sum of adjusted errors. This is maybe an equivalent formulation, to check.
1326 # the total number of parameters if we do not prune, is the sum of
1327 # params in each branch PLUS 1 for the split
1328 n_params_splitmodel = node_models[left_node].n_params + node_models[right_node].n_params + 1
1329 adjusted_err_children = compute_adjusted_error(err_children, n_samples_at_this_node, n_params_splitmodel)
1331 # (6) compare and either prune at this node or keep the subtrees
1332 std_dev_this_node = np.nanstd(y_true_this_node)
1333 # note: the first criterion is now already checked before that
1334 # function call, in `prune_on_min_impurity`
1335 if use_pruning and (
1336 # TODO these parameters should be in the constructor
1337 # see also 2 comments about min_impurity_split_as_initial_ratio
1338 std_dev_this_node < (global_std_dev * 0.05)
1339 or (adjusted_err_model <= adjusted_err_children)
1340 or (adjusted_err_model < (global_std_dev * 0.00001)) # so this means very good R² already
1341 ):
1342 # Choose model for this node rather than subtree model
1343 # -- prune from this node on
1344 removed_nodes = prune_children(node_id, tree)
1345 node_models[removed_nodes] = _tree.TREE_UNDEFINED
1347 # store the model information
1348 node_models[node_id] = model_this_node
1350 # update and return the features used
1351 selected_features_dct = {k: n_samples_at_this_node for k in selected_features_dct.keys()}
1352 return selected_features_dct
1354 else:
1355 # The subtrees are good or we do not want pruning: keep them.
1356 # This node will remain a split, and will only contain the digest
1357 # about the subtree
1358 node_models[node_id] = SplitNodeModel(
1359 n_params_splitmodel, err_children, selected_features, skmodel_this_node
1360 )
1362 # return the features used
1363 return selected_features_dct
1366 def compute_adjusted_error(err, n_samples, n_parameters, multiplier=PRUNING_MULTIPLIER):
1367 """
1368 Return the penalized error obtained from `err`, the prediction error at a
1369 given tree node, by multiplying it by ``(n + multiplier*p)/(n - p)``. `n` is the number of
1370 samples at this node, `p` is the number of parameters of the model, and `multiplier`
defaults to ``PRUNING_MULTIPLIER = 2`` (the Weka implementation uses 2 where the article uses 1).
1372 According to the original M5 paper, this is to penalize complex models
1373 used for few samples.
1375 Note that if ``n_samples <= n_parameters`` the denominator is zero or negative.
1376 In this case an arbitrarily high penalization factor is used: ``10 * err``.
1378 Parameters
1379 ----------
1380 err : float
1381 The model error for a given tree node. Typically the MAE.
1383 n_samples : int
1384 The number of samples at this node.
1386 n_parameters : int
1387 The number of parameters of the model at this node.
1389 Returns
1390 -------
1391 penalized_mae : float
1392 The penalized error ``err * (n + multiplier*p)/(n - p)``, or ``10 * err`` if ``n <= p``.
1393 """
1394 #
1395 if n_samples <= n_parameters:
1396 # denominator is zero or negative: use a large factor so as to penalize
1397 # a lot this overly complex model.
1398 factor = 10.0 # Caution says Yong in his code
1399 else:
1400 factor = (n_samples + multiplier * n_parameters) / (n_samples - n_parameters)
1402 return err * factor
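# Worked example (illustrative): err=1.0 at a node with n_samples=20 and a linear model
# using 2 features (n_parameters=3 including the intercept) gives
#   factor = (20 + 2*3) / (20 - 3) = 26/17 ≈ 1.53, hence an adjusted error of ~1.53.
# With only n_samples=3 for the same model, n <= p and the returned error is 10.0.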
1405 def predict_from_leaves_no_smoothing(tree, node_models, X):
1406 """
1407 Returns the prediction for all samples in X, based on using the appropriate
1408 model leaf.
1410 This function
1411 - uses tree.apply(X) to know in which tree leaf each sample in X goes
1412 - then for each of the leaves that are actually touched, uses
1413 node_models[leaf_id] to predict, for the X reaching that leaf
1415 This function assumes that node_models contains non-empty LinRegLeafModel
1416 entries for all leaf nodes that will be reached by the samples X.
1418 Parameters
1419 ----------
1420 tree : Tree
1421 The tree object.
1422 node_models : array-like
1423 The array containing node models
1424 X : 2D array-like
1425 The sample data.
1427 Returns
1428 -------
1429 y_predicted : 1D array-like
1430 The prediction vector.
1431 """
1432 # **** This does the job, but we have one execution of model.predict() per
1433 # sample: probably not efficient
1434 # sample_ids_to_leaf_node_ids = tree.apply(X)
1435 # model_and_x = np.concatenate((node_models[leaf_node_ids].reshape(-1, 1), X), axis=1)
1436 # def pred(m_and_x):
1437 # return m_and_x[0].model.predict(m_and_x[1:].reshape(1,-1))[0]
1438 # y_predicted = np.array(list(map(pred, model_and_x)))
1440 # **** This should be faster because the number of calls to predict() is
1441 # equal to the number of leaves touched
1442 sample_ids_to_leaf_node_ids = tree.apply(X)
1443 y_predicted = -np.ones(sample_ids_to_leaf_node_ids.shape, dtype=DOUBLE)
1445 # -- find the unique list of leaves touched
1446 leaf_node_ids, inverse_idx = np.unique(sample_ids_to_leaf_node_ids, return_inverse=True)
1448 # -- for each leaf perform the prediction for samples reaching that leaf
1449 for leaf_node_id in leaf_node_ids:
1450 # get the indices of the samples reaching that leaf
1451 sample_indices = np.nonzero(sample_ids_to_leaf_node_ids == leaf_node_id)[0]
1453 # predict
1454 node_model = node_models[leaf_node_id]
1455 if isinstance(node_model, LinRegLeafModel):
1456 y_predicted[sample_indices] = np.ravel(node_model.predict(X[sample_indices, :]))
1457 else:
1458 # isinstance(node_model, ConstantLeafModel)
1459 y_predicted[sample_indices] = tree.value[leaf_node_id]
1461 # **** Recursive strategy: not used anymore
1462 # left_node = tree.children_left[node_id] # this way we do not have to query it again in the else.
1463 # if left_node == _tree.TREE_LEAF:
1464 # # --> Current node is a LEAF. See is_leaf(node_id, tree) <--
1465 # y_predicted = node_models[node_id].model.predict()
1466 # else:
1467 # # --> Current node is a SPLIT <--
1468 # right_node = tree.children_right[node_id]
1469 #
1470 #
1471 # samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples)
1472 # y_true_this_node = y_train_all[samples_at_this_node]
1473 # # As surprising as it may seem, in numpy [samples_at_this_node, selected_features] does something else.
1474 # X_train_this_node = X_train_all[samples_at_this_node, :][:, selected_features]
1475 #
1476 # X_left
1478 return y_predicted
1481def predict_from_leaves(m5p, X, smoothing=True, smoothing_constant=15):
1482 """
1483 Predict using the M5P tree, without relying on the compiled sklearn
1484 `tree.apply` subroutine.
1486 The main purpose of this function is to apply smoothing to an M5P model tree
1487 where smoothing has not been pre-installed on the models, for example to
1488 allow a model to be used both with and without smoothing for comparison
1489 purposes, or for models whose leaves are not linear models and for which no
1490 pre-installation method therefore exists.
1492 Note: this method is slower than `predict_from_leaves_no_smoothing` when
1493 `smoothing=False`.
1495 Parameters
1496 ----------
1497 m5p : M5Prime
1498 The model to use for prediction
1499 X : array-like
1500 The input data
1501 smoothing : bool
1502 Whether to apply smoothing
1503 smoothing_constant : int
1504 The smoothing constant `k`, used as the weight of the parent node model's
1505 prediction (k=15 in the original articles).
1508 Returns
1509 -------
y_predicted : 2D array-like of shape (n_samples, 1)
The predictions, one row per sample in X.
1511 """
1512 # validate and converts dtype just in case this was directly called
1513 # e.g. in unit tests
1514 X = m5p._validate_X_predict(X, check_input=True)
1516 tree = m5p.tree_
1517 node_models = m5p.node_models
1518 nb_samples = X.shape[0]
1519 y_predicted = -np.ones((nb_samples, 1), dtype=DOUBLE)
1521 # sample_ids_to_leaf_node_ids = tree.apply(X)
1522 def smooth_predictions(ancestor_nodes, X_at_node, y_pred_at_node):
1523 # note: y_pred_at_node can be a constant
1524 current_node_model_id = ancestor_nodes[-1]
1525 for _i, parent_model_id in enumerate(reversed(ancestor_nodes[:-1])):
1526 # warning: this is the nb of TRAINING samples at this node
1527 node_nb_train_samples = tree.n_node_samples[current_node_model_id]
1528 parent_model = node_models[parent_model_id]
1529 parent_predictions = parent_model.predict(X_at_node)
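# M5 smoothing: blend the prediction coming from below with the parent
# model's prediction, weighting the former by the number of training samples
# `n` that reached the child node and the latter by the smoothing constant
# `k`:  y_smoothed = (n * y_child + k * y_parent) / (n + k)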
1530 y_pred_at_node = (node_nb_train_samples * y_pred_at_node + smoothing_constant * parent_predictions) / (
1531 node_nb_train_samples + smoothing_constant
1532 )
1533 current_node_model_id = parent_model_id
1535 return y_pred_at_node
1537 def apply_prediction(node_id, ids=None, ancestor_nodes=None):
1538 first_call = False
1539 if ids is None:
1540 ids = slice(None)
1541 first_call = True
1542 if smoothing:
1543 if ancestor_nodes is None:
1544 ancestor_nodes = [node_id]
1545 else:
1546 ancestor_nodes.append(node_id)
1548 left_id = tree.children_left[node_id]
1549 if left_id == _tree.TREE_LEAF:
1550 # ... and tree.children_right[node_id] == _tree.TREE_LEAF
1551 # LEAF node: predict
1553 node_model = node_models[node_id]
1554 # assert (ids == (sample_ids_to_leaf_node_ids == node_id)).all()
1555 if isinstance(node_model, LinRegLeafModel):
1556 X_at_node = X[ids, :]
1557 predictions = node_model.predict(X_at_node)
1558 else:
1559 # isinstance(node_model, ConstantLeafModel)
1560 predictions = tree.value[node_id]
1561 if smoothing:
1562 X_at_node = X[ids, :]
1564 # finally apply smoothing
1565 if smoothing:
1566 y_predicted[ids] = smooth_predictions(ancestor_nodes, X_at_node, predictions)
1567 else:
1568 y_predicted[ids] = predictions
1569 else:
1570 right_id = tree.children_right[node_id]
1571 # non-leaf node: split samples and recurse
1572 left_group = np.zeros(nb_samples, dtype=bool)
1573 left_group[ids] = X[ids, tree.feature[node_id]] <= tree.threshold[node_id]
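# On the first call `ids` is `slice(None)` (all samples), which cannot be
# combined with a boolean mask, hence the special case below; on recursive
# calls `ids` is a boolean mask and is intersected with the complement of
# `left_group`.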
1574 right_group = (~left_group) if first_call else (ids & (~left_group))
1576 # important: copy ancestor_nodes BEFORE calling anything, otherwise
1577 # it will be modified
1578 apply_prediction(
1579 left_id, ids=left_group, ancestor_nodes=(ancestor_nodes.copy() if ancestor_nodes is not None else None)
1580 )
1581 apply_prediction(right_id, ids=right_group, ancestor_nodes=ancestor_nodes)
1583 # recurse to fill all predictions
1584 apply_prediction(0)
1586 return y_predicted
1589def prune_children(node_id, tree):
1590 """
1591 Prunes the children of node_id in the given `tree`.
1593 Inspired by https://github.com/shenwanxiang/sklearn-post-prune-tree/blob/master/tree_prune.py#L122
1595 IMPORTANT: this relies on the fact that `children_left` and `children_right`
1596 are modifiable (and are the only things we need to modify to fix the
1597 tree). This seems to be the case as of now.
1599 Parameters
1600 ----------
1601 node_id : int
1602 tree : Tree
1604 Returns
1605 -------
1606 removed_nodes : List[int]
1607 A list of removed nodes
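Examples
--------
Illustrative only (`est` is assumed to be a fitted M5Prime; node 3 is arbitrary):

>>> removed_nodes = prune_children(3, est.tree_)

Node 3 then becomes a leaf (its children entries are set to `_tree.TREE_LEAF`)
and all its former descendants are returned in `removed_nodes`.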
1608 """
1610 def _prune_below(_node_id):
1611 left_child = tree.children_left[_node_id]
1612 right_child = tree.children_right[_node_id]
1613 if left_child == _tree.TREE_LEAF:
1614 # _node_id is a leaf: left_child & right_child say "leaf". Nothing to do.
1615 return []
1616 else:
1617 # Make sure everything is pruned below: children should be leaves
1618 removed_l = _prune_below(left_child)
1619 removed_r = _prune_below(right_child)
1621 # -- First, declare that the children are not just leaves but do not exist at all
1622 for child in [left_child, right_child]:
1624 if tree.children_left[child] != _tree.TREE_LEAF:
1625 raise ValueError("Unexpected children_left: %s, please report it as issue." % child)
1627 if tree.children_right[child] != _tree.TREE_LEAF:
1628 raise ValueError("Unexpected children_right: %s, please report it as issue." % child)
1630 tree.children_left[child] = _tree.TREE_UNDEFINED
1631 tree.children_right[child] = _tree.TREE_UNDEFINED
1633 # -- Then declare that current node is a leaf
1634 tree.children_left[_node_id] = _tree.TREE_LEAF
1635 tree.children_right[_node_id] = _tree.TREE_LEAF
1637 # Return the list of removed nodes
1638 return removed_l + removed_r + [left_child, right_child]
1640 # Note: we do not change the node count here, as we'll clean all later.
1641 # true_node_count = tree.node_count - sum(tree.children_left == _tree.TREE_UNDEFINED)
1642 # tree.node_count -= 2*len(nodes_to_remove)
1644 return _prune_below(node_id)
1647def is_leaf(node_id, tree):
1648 """
1649 Return True if the node with id `node_id` is a leaf in tree `tree`.
1650 It is not actually used in this file because we always need the left child
1651 node id in our code.
1653 But it is kept here as an easy reminder of how the check works.
1655 Parameters
1656 ----------
1657 node_id : int
1658 tree : Tree
1660 Returns
1661 -------
1662 _is_leaf : bool
1663 A boolean flag, True if this node is a leaf.
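Examples
--------
Illustrative only (`est` is assumed to be a fitted M5Prime):

>>> leaf_ids = [n for n in range(est.tree_.node_count) if is_leaf(n, est.tree_)]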
1664 """
1665 if node_id == _tree.TREE_LEAF or node_id == _tree.TREE_UNDEFINED:
1666 raise ValueError("Invalid node_id %s" % node_id)
1668 return tree.children_left[node_id] == _tree.TREE_LEAF
1671def get_samples_at_node(node_id, nodes_to_samples):
1672 """
1673 Return an array containing the ids of the samples for node `node_id`.
1675 This method requires the user to
1676 - first compute the decision path for the sample matrix X,
1677 - then convert it to CSC format:
1679 >>> samples_to_nodes = estimator.decision_path(X) # returns a Scipy compressed sparse row matrix (CSR)
1680 >>> nodes_to_samples = samples_to_nodes.tocsc() # we need the column equivalent (CSC)
1681 >>> samples = get_samples_at_node(node_id, nodes_to_samples)
1683 Parameters
1684 ----------
1685 node_id : int
1686 The node for which the list of samples is queried.
1687 nodes_to_samples : csc_matrix
1688 The decision-path indicator matrix from `decision_path(X)` converted to
1689 Compressed Sparse Column (CSC) format: rows are samples, columns are tree
1690 nodes, and entry (i, j) is 1 when sample i goes through node j.
1692 Returns
1693 -------
samples : array-like of int
The indices of the samples that reach node `node_id`.
1695 """
1696 return nodes_to_samples.indices[nodes_to_samples.indptr[node_id] : nodes_to_samples.indptr[node_id + 1]]
1699class M5Prime(M5Base, RegressorMixin):
1700 """An M5' (M five prime) model tree regressor.
1702 The original M5 algorithm was invented by R. Quinlan [1]_. Y. Wang [2]_ made
1703 improvements and named the resulting algorithm M5 Prime.
1704 This implementation was inspired by Weka (https://github.com/bnjmn/weka)
1705 M5Prime class, from Mark Hall.
1707 See also
1708 --------
1709 M5Base
1711 References
1712 ----------
1713 .. [1] Ross J. Quinlan, "Learning with Continuous Classes", 5th Australian
1714 Joint Conference on Artificial Intelligence, pp343-348, 1992.
1715 .. [2] Y. Wang and I. H. Witten, "Induction of model trees for predicting
1716 continuous classes", Poster papers of the 9th European Conference
1717 on Machine Learning, 1997.
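Examples
--------
A minimal, illustrative sketch (the data below is arbitrary, and `M5Prime` is
assumed to be importable from the top-level `m5py` package)::

    import numpy as np
    from m5py import M5Prime

    X = np.arange(20, dtype=float).reshape(-1, 1)
    y = 2.0 * X.ravel() + 1.0
    reg = M5Prime().fit(X, y)
    y_pred = reg.predict(X)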
1718 """
1721def _empty_contig_ar(shape, dtype):
1722 """Return an empty contiguous array with given shape and dtype."""
1723 return np.ascontiguousarray(np.empty(shape, dtype=dtype))