Coverage for src/m5py/main.py: 69%

534 statements  

coverage.py v7.4.0, created at 2024-01-22 17:10 +0000

1from collections import namedtuple 

2from copy import copy 

3from logging import getLogger 

4from warnings import warn 

5 

6import numpy as np 

7 

8from scipy.sparse import issparse 

9 

10from sklearn import clone 

11from sklearn.base import RegressorMixin, is_classifier 

12from sklearn.linear_model import LinearRegression 

13from sklearn.metrics import mean_squared_error 

14from sklearn.preprocessing import StandardScaler 

15from sklearn.tree import BaseDecisionTree, _tree 

16from sklearn.tree._classes import DTYPE 

17from sklearn.tree._tree import DOUBLE 

18from sklearn.utils import check_array 

19from sklearn.utils.validation import check_is_fitted 

20from sklearn import __version__ as sklearn_version 

21 

22from m5py.linreg_utils import linreg_model_to_text, DeNormalizableMixIn, DeNormalizableLinearRegression 

23 

24from packaging.version import Version 

25 

26SKLEARN_VERSION = Version(sklearn_version) 

27SKLEARN13_OR_GREATER = SKLEARN_VERSION >= Version("1.3.0") 

28 

29 

30__all__ = ["M5Base", "M5Prime"] 

31 

32 

33_SmoothingDetails = namedtuple("_SmoothingDetails", ("A", "B", "C")) 

34# Internal structure to contain the recursive smoothed coefficients details in the smoothing algorithm 

35 

36 

37logger = getLogger("m5p") 

38 

39 

40class M5Base(BaseDecisionTree): 

41 """ 

42 M5Base. Implements base routines for generating M5 PredictionModel trees and rules. 

43 

44 The original algorithm M5 was invented by Quinlan: 

45 

46 - Quinlan J. R. (1992). Learning with continuous classes. Proceedings of the 

47 Australian Joint Conference on Artificial Intelligence. 343--348. World 

48 Scientific, Singapore. 

49 

50 Yong Wang and Ian Witten made improvements and created M5': 

51 

52 - Wang, Y and Witten, I. H. (1997). Induction of model trees for predicting 

53 continuous classes. Proceedings of the poster papers of the European 

54 Conference on Machine Learning. University of Economics, Faculty of 

55 Informatics and Statistics, Prague. 

56 

57 Pruning and Smoothing can be activated and deactivated on top of the base 

58 model. TODO check if 'rules' should be supported too 

59 

60 Inspired by Weka (https://github.com/bnjmn/weka) M5Base class, from Mark Hall 

61 

62 Attributes 

63 ---------- 

64 criterion : str, default="friedman_mse" 

65 M5 suggests using the standard deviation (RMSE impurity) instead of

66 variance (MSE impurity) or absolute deviation (MAE impurity) as in CART. 

67 According to the M5' paper, all three criteria are equivalent.

68 splitter : default="best" 

69 M5 suggests taking the feature that leads to the best criterion gain.

70 max_depth : default None 

71 This is not used in the original M5 article, hence default=None. 

72 min_samples_split : int, default=4 

73 M5 suggests a value of 4 so that each leaf will have at least 2 samples. 

74 min_samples_leaf : int, default=2 

75 M5' suggests adding this explicitly to avoid zero variance in leaves, and

76 n<=p for constant models. 

77 min_weight_fraction_leaf : float, default=0.0 

78 This is not used in the original M5 article, hence default=0.0. 

79 TODO this might actually be better than min_samples_leaf?

80 max_features : default None 

81 This is not used in the original M5 article, hence default is None. 

82 max_leaf_nodes : int, default None 

83 This is not used in the original M5 article, hence default is None. 

84 min_impurity_decrease : float, default=0.0 

85 This is not used in the original M5 article, hence default is 0.0.

86 class_weight : default None 

87 This is not used (?) 

88 leaf_model : RegressorMixin 

89 The regression model used in the leaves. This instance will be cloned 

90 for each leaf. 

91 use_pruning : bool 

92 If False, pruning will be disabled. 

93 use_smoothing : bool or str {'installed', 'on_prediction'}, default None 

94 None and True both mean 'installed' by default, except if smoothing_constant

95 or smoothing_constant_ratio is explicitly set to 0.0, in which case smoothing is disabled.

96 smoothing_constant: float, default None 

97 The smoothing constant k defined in the original M5 article, used as the 

98 weight for each parent model in the recursive weighting process. 

99 If None, the default value from the paper (k=15) will be used. 

100 smoothing_constant_ratio: float, default None 

101 An alternate way to define the smoothing constant, as a ratio of the 

102 total number of training samples. The resulting smoothing constant will 

103 be smoothing_constant_ratio * n where n is the number of samples. 

104 Note that this may not be an integer. 

105 debug_prints: bool, default False 

106 A boolean to enable debug prints 

107 ccp_alpha: float, default 0.0 

108 TODO is this relevant? Does it conflict with `use_pruning`?

109 random_state : None, int, or RandomState, default=None 

110 See `DecisionTreeRegressor.random_state`.
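Examples
--------
A minimal usage sketch (illustrative only, on made-up data; it assumes the
concrete subclass `M5Prime` exported by this module is importable from the
`m5py` package)::

    import numpy as np
    from m5py import M5Prime

    X = np.random.rand(100, 3)
    y = 2 * X[:, 0] + X[:, 1]
    model = M5Prime(use_smoothing="installed").fit(X, y)
    print(model.as_pretty_text())
    y_pred = model.predict(X)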

111 """ 

112 

113 def __init__( 

114 self, 

115 criterion="friedman_mse", 

116 splitter="best", 

117 max_depth=None, 

118 min_samples_split=4, 

119 min_samples_leaf=2, 

120 min_weight_fraction_leaf=0.0, 

121 max_features=None, 

122 max_leaf_nodes=None, 

123 min_impurity_decrease=0.0, 

124 class_weight=None, 

125 leaf_model=None, 

126 use_pruning=True, 

127 use_smoothing=None, 

128 smoothing_constant=None, 

129 smoothing_constant_ratio=None, 

130 debug_prints=False, 

131 ccp_alpha=0.0, 

132 random_state=None, 

133 ): 

134 

135 # TODO the paper suggests doing this with 5% of the total std

136 # min_impurity_split = min_impurity_split_as_initial_ratio * initial_impurity 

137 

138 super(M5Base, self).__init__( 

139 criterion=criterion, 

140 splitter=splitter, 

141 max_depth=max_depth, 

142 min_samples_split=min_samples_split, 

143 min_samples_leaf=min_samples_leaf, 

144 min_weight_fraction_leaf=min_weight_fraction_leaf, 

145 max_features=max_features, 

146 max_leaf_nodes=max_leaf_nodes, 

147 min_impurity_decrease=min_impurity_decrease, 

148 random_state=random_state, 

149 class_weight=class_weight, 

150 ccp_alpha=ccp_alpha, 

151 ) 

152 

153 # warning : if the field names are different from constructor params, 

154 # then clone(self) will not work. 

155 if leaf_model is None: 155 ↛ 159line 155 didn't jump to line 159, because the condition on line 155 was never false

156 # to handle case when the model is learnt on normalized data and 

157 # we wish to be able to read the model equations. 

158 leaf_model = DeNormalizableLinearRegression() 

159 self.leaf_model = leaf_model 

160 

161 # smoothing related 

162 if smoothing_constant_ratio is not None and smoothing_constant is not None: 162 ↛ 163line 162 didn't jump to line 163, because the condition on line 162 was never true

163 raise ValueError("Only one of `smoothing_constant` and `smoothing_constant_ratio` should be provided") 

164 elif (smoothing_constant_ratio == 0.0 or smoothing_constant == 0) and ( 164 ↛ 167line 164 didn't jump to line 167, because the condition on line 164 was never true

165 use_smoothing is True or use_smoothing == "installed" 

166 ): 

167 raise ValueError( 

168 "`use_smoothing` was explicitly enabled with " 

169 "pre-installed models, while smoothing " 

170 "constant/ratio are explicitly set to zero" 

171 ) 

172 

173 self.use_pruning = use_pruning 

174 self.use_smoothing = use_smoothing 

175 self.smoothing_constant = smoothing_constant 

176 self.smoothing_constant_ratio = smoothing_constant_ratio 

177 

178 self.debug_prints = debug_prints 

179 

180 def as_pretty_text(self, **kwargs): 

181 """ 

182 Returns a multi-line representation of this decision tree, using 

183 `export_text_m5`.

184 

185 :return: a multi-line string representing this decision tree 

186 """ 

187 from m5py.export import export_text_m5 

188 return export_text_m5(self, out_file=None, **kwargs) 

189 

190 def fit(self, X, y: np.ndarray, sample_weight=None, check_input=True, X_idx_sorted="deprecated"): 

191 """Fit a M5Prime model. 

192 

193 Parameters 

194 ---------- 

195 X : numpy.ndarray 

196 y : numpy.ndarray 

197 sample_weight 

198 check_input 

199 X_idx_sorted 

200 

201 Returns 

202 ------- 

203 self : the fitted estimator (this object).

204 """ 

205 # (0) smoothing default values behaviour 

206 if self.smoothing_constant_ratio == 0.0 or self.smoothing_constant == 0: 206 ↛ 207line 206 didn't jump to line 207, because the condition on line 206 was never true

207 self.use_smoothing = False 

208 elif self.use_smoothing is None or self.use_smoothing is True: 

209 # default behaviour 

210 if isinstance(self.leaf_model, LinearRegression): 210 ↛ 213line 210 didn't jump to line 213, because the condition on line 210 was never false

211 self.use_smoothing = "installed" 

212 else: 

213 self.use_smoothing = "on_prediction" 

214 

215 # Finally make sure we are ok now 

216 if self.use_smoothing not in [False, np.bool_(False), "installed", "on_prediction"]: 216 ↛ 217line 216 didn't jump to line 217, because the condition on line 216 was never true

217 raise ValueError("use_smoothing: Unexpected value: %s, please report it as an issue." % self.use_smoothing) 

218 

219 

220 # (1) Build the initial tree as usual 

221 

222 # Get the correct fit method name based on the sklearn version used 

223 fit_method_name = "_fit" if SKLEARN13_OR_GREATER else "fit" 

224 

225 fit_method = getattr(super(M5Base, self), fit_method_name) 

226 fit_method(X, y, sample_weight=sample_weight, check_input=check_input) 

227 

228 

229 if self.debug_prints: 229 ↛ 230line 229 didn't jump to line 230, because the condition on line 229 was never true

230 logger.debug("(debug_prints) Initial tree:") 

231 logger.debug(self.as_pretty_text(node_ids=True)) 

232 

233 # (1b) prune initial tree to take into account min impurity in splits 

234 prune_on_min_impurity(self.tree_) 

235 

236 if self.debug_prints: 236 ↛ 237line 236 didn't jump to line 237, because the condition on line 236 was never true

237 logger.debug("(debug_prints) Postprocessed tree:") 

238 logger.debug(self.as_pretty_text(node_ids=True)) 

239 

240 # (2) Now prune tree and replace pruned branches with linear models 

241 # -- unfortunately we have to re-do this input validation 

242 # step, because it also converts the input to float32. 

243 if check_input: 243 ↛ 252line 243 didn't jump to line 252, because the condition on line 243 was never false

244 X = check_array(X, dtype=DTYPE, accept_sparse="csc") 

245 if issparse(X): 245 ↛ 246line 245 didn't jump to line 246, because the condition on line 245 was never true

246 X.sort_indices() 

247 

248 if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: 

249 raise ValueError("No support for np.int64 index based " "sparse matrices") 

250 

251 # -- initialise the structure to contain the leaves and node models 

252 self.node_models = np.empty((self.tree_.node_count,), dtype=object) 

253 

254 # -- Pruning requires to know the global deviation of the target 

255 global_std_dev = np.nanstd(y) 

256 # global_abs_dev = np.nanmean(np.abs(y)) 

257 

258 # -- Pruning requires to know the samples that reached each node. 

259 # From http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html 

260 # Retrieve the decision path of each sample. 

261 samples_to_nodes = self.decision_path(X) 

262 # * row i is to see the nodes (non-empty j) in which sample i appears. 

263 # sparse row-first (CSR) format is OK 

264 # * column j is to see the samples (non-empty i) that fall into that 

265 # node. To do that, we need to make it CSC 

266 nodes_to_samples = samples_to_nodes.tocsc() 
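# For illustration (hypothetical node index j): with this CSC layout,
# nodes_to_samples[:, j].nonzero()[0] gives the indices of the training
# samples that reached node j, which is what the pruning step below needs.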

267 

268 # -- execute the pruning 

269 # self.features_usage is a dict feature_idx -> nb times used, only 

270 # for used features. 

271 self.features_usage = build_models_and_get_pruning_info( 

272 self.tree_, 

273 X, 

274 y, 

275 nodes_to_samples, 

276 self.leaf_model, 

277 self.node_models, 

278 global_std_dev, 

279 use_pruning=self.use_pruning, 

280 ) 

281 

282 # -- cleanup to compress inner structures: only keep non-pruned ones 

283 self._cleanup_tree() 

284 

285 if self.debug_prints: 285 ↛ 286line 285 didn't jump to line 286, because the condition on line 285 was never true

286 logger.debug("(debug_prints) Pruned tree:") 

287 logger.debug(self.as_pretty_text(node_ids=True)) 

288 

289 if self.use_smoothing == "installed": 

290 # Retrieve the NEW decision path of each sample. 

291 samples_to_nodes = self.decision_path(X) 

292 # * row i is to see the nodes (non-empty j) in which sample i 

293 # appears. sparse row-first (CSR) format is OK 

294 # * column j is to see the samples (non-empty i) that fall into 

295 # that node. To do that, we need to make it CSC 

296 nodes_to_samples = samples_to_nodes.tocsc() 

297 

298 # default behaviour for smoothing constant and ratio 

299 smoothing_constant = self._get_smoothing_constant_to_use(X) 

300 

301 self.install_smoothing(X, y, nodes_to_samples, smoothing_constant=smoothing_constant) 

302 

303 if self.debug_prints: 303 ↛ 304line 303 didn't jump to line 304, because the condition on line 303 was never true

304 logger.debug("(debug_prints) Pruned and smoothed tree:") 

305 logger.debug(self.as_pretty_text(node_ids=True)) 

306 

307 return self 

308 

309 def _get_smoothing_constant_to_use(self, X): 

310 """ 

311 Returns the smoothing_constant to use for smoothing, based on current 

312 settings and X data 

313 """ 

314 if self.smoothing_constant_ratio is not None: 314 ↛ 315line 314 didn't jump to line 315, because the condition on line 314 was never true

315 nb_training_samples = X.shape[0] 

316 smoothing_cstt = self.smoothing_constant_ratio * nb_training_samples 

317 if smoothing_cstt < 15: 

318 warn( 

319 "smoothing constant ratio %s is leading to an extremely " 

320 "small smoothing constant %s because nb training samples" 

321 " is %s. Clipping to 15." % (self.smoothing_constant_ratio, smoothing_cstt, nb_training_samples) 

322 ) 

323 smoothing_cstt = 15 

324 else: 

325 smoothing_cstt = self.smoothing_constant 

326 if smoothing_cstt is None: 326 ↛ 327line 326 didn't jump to line 327, because the condition on line 326 was never true

327 smoothing_cstt = 15 # default from the original Quinlan paper 

328 

329 return smoothing_cstt 

330 

331 def _cleanup_tree(self): 

332 """ 

333 Reduces the size of this object by removing from internal structures 

334 all items that are not used any more (all leaves that have been pruned) 

335 """ 

336 old_tree = self.tree_ 

337 old_node_modls = self.node_models 

338 

339 # Get all information to create a copy of the inner tree. 

340 # Note: _tree.copy() is gone so we use the pickle way 

341 # --- Base info: nb features, nb outputs, output classes 

342 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631 

343 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs) 

344 # So these remain, we are just interested in changing the node-related arrays 

345 new_tree = _tree.Tree(*old_tree.__reduce__()[1]) 

346 

347 # --- Node info 

348 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637 

349 dct = old_tree.__getstate__().copy() 

350 

351 # cleanup: only keep the nodes that are not undefined. 

352 # note: this is identical to 

353 # valid_nodes_indices = dct["nodes"]['left_child'] != TREE_UNDEFINED 

354 valid_nodes_indices = old_tree.children_left != _tree.TREE_UNDEFINED 

355 # valid_nodes_indices2 = old_tree.children_right != TREE_UNDEFINED 

356 # assert np.all(valid_nodes_indices == valid_nodes_indices2) 

357 new_node_count = sum(valid_nodes_indices) 

358 

359 # create empty new structures 

360 n_shape = (new_node_count, *dct["nodes"].shape[1:]) 

361 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype) 

362 v_shape = (new_node_count, *dct["values"].shape[1:]) 

363 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype) 

364 m_shape = (new_node_count, *old_node_modls.shape[1:]) 

365 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype) 

366 

367 # Fill structures while reindexing the tree and remembering the depth 

368 global next_free_id 

369 next_free_id = 0 

370 

371 def _compress(old_node_id): 

372 """ 

373 

374 Parameters 

375 ---------- 

376 old_node_id 

377 

378 Returns 

379 ------- 

380 the depth of this subtree and the new index of this node

381 

382 """ 

383 global next_free_id 

384 new_node_id = next_free_id 

385 next_free_id += 1 

386 

387 # use the old tree to walk 

388 old_node = dct["nodes"][old_node_id] 

389 left_id = old_node["left_child"] 

390 right_id = old_node["right_child"] 

391 

392 # Create the new node with a copy of the old 

393 new_nodes[new_node_id] = old_node # this is an entire row so it is probably copied already by doing so. 

394 new_values[new_node_id] = dct["values"][old_node_id] 

395 new_node_models[new_node_id] = copy(old_node_modls[old_node_id]) 

396 

397 if left_id == _tree.TREE_LEAF: 

398 # ... and right_id == _tree.TREE_LEAF 

399 # => node_id is a leaf. Nothing to do 

400 return 1, new_node_id 

401 else: 

402 

403 left_depth, new_id_left = _compress(left_id) 

404 right_depth, new_id_right = _compress(right_id) 

405 

406 # store the new indices 

407 new_nodes[new_node_id]["left_child"] = new_id_left 

408 new_nodes[new_node_id]["right_child"] = new_id_right 

409 

410 return 1 + max(left_depth, right_depth), new_node_id 

411 

412 # complete definition of the new tree 

413 dct["max_depth"] = _compress(0)[0] - 1 # root node has depth 0, not 1 

414 dct["node_count"] = new_node_count # new_nodes.shape[0] 

415 dct["nodes"] = new_nodes 

416 dct["values"] = new_values 

417 new_tree.__setstate__(dct) 

418 

419 # Fix an issue on sklearn 0.17.1: setstate was not updating max_depth 

420 # See https://github.com/scikit-learn/scikit-learn/blob/0.17.1/sklearn/tree/_tree.pyx#L623 

421 new_tree.max_depth = dct["max_depth"] 

422 

423 # update self fields 

424 self.tree_ = new_tree 

425 self.node_models = new_node_models 

426 

427 def install_smoothing(self, X_train_all, y_train_all, nodes_to_samples, smoothing_constant): 

428 """ 

429 Executes the smoothing procedure described in the M5 and M5P paper, 

430 "once for all". This means that all models are modified so that after 

431 this method has completed, each model in the tree is already a smoothed 

432 model. 

433 

434 This has pros (prediction speed) and cons (the model equations are 

435 harder to read - lots of redundancy) 

436 

437 It can only be done if leaf models are instances of `LinearRegression` 

438 """ 

439 # check validity: leaf models have to support pre-computed smoothing 

440 if not isinstance(self.leaf_model, LinearRegression): 440 ↛ 441line 440 didn't jump to line 441, because the condition on line 440 was never true

441 raise TypeError( 

442 "`install_smoothing` is only available if leaf " 

443 "models are instances of `LinearRegression` or a " 

444 "subclass" 

445 ) 

446 

447 # Select the Error metric to compute model errors (used to compare 

448 # node with subtree for pruning) 

449 # TODO in Weka they use RMSE, but in papers they use MAE. this should be a parameter. 

450 # TODO Shall we go further and store the residuals, or a bunch of metrics? not sure 

451 err_metric = root_mean_squared_error # mean_absolute_error 

452 

453 # --- Node info 

454 # TODO we should probably not do this once in each method, but once or give access directly (no copy) 

455 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637 

456 dct = self.tree_.__getstate__().copy() 

457 old_node_models = self.node_models 

458 

459 m_shape = old_node_models.shape 

460 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype) 

461 

462 def smooth__( 

463 coefs_, 

464 n_samples, 

465 features=None, 

466 parent=None, # type: _SmoothingDetails 

467 parent_features=None, 

468 k=smoothing_constant, # 

469 ): 

470 # type: (...) -> _SmoothingDetails 

471 """ 

472 Smooths the model coefficients or intercept `coefs_` (an array or

473 a scalar), using the smoothing results at the parent node.

474 

475 At each node we keep in memory three results A, B, C. 

476 - the new coef at each node is A + B 

477 - the recursion equations are 

478 - B(n) = (k / (n_samples(n) + k)) * A(n-1) + B(n-1) 

479 - C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1) 

480 - A(n) = coef(n)*C(n) 

481 """ 

482 if parent is None: 

483 # A0 = coef(0), C0 = 1, B0 = 0 

484 if features is None: 

485 # single scalar value 

486 return _SmoothingDetails(A=coefs_, B=0, C=1) 

487 else: 

488 # vector 

489 if len(coefs_) == 0: 489 ↛ 490line 489 didn't jump to line 490, because the condition on line 489 was never true

490 coefs_ = np.asarray(coefs_) 

491 if coefs_.shape[0] != len(features): 491 ↛ 492line 491 didn't jump to line 492, because the condition on line 491 was never true

492 raise ValueError("nb features does not match the nb " "of coefficients") 

493 return _SmoothingDetails( 

494 A=coefs_, 

495 B=np.zeros(coefs_.shape, dtype=coefs_.dtype), 

496 C=np.ones(coefs_.shape, dtype=coefs_.dtype), 

497 ) 

498 else: 

499 # B(n) = k/(n_samples(n)+k)*A(n-1) + B(n-1) 

500 # C(n) = (n_samples(n) / (n_samples(n) + k)) * C(n-1) 

501 Bn = (k / (n_samples + k)) * parent.A + parent.B 

502 Cn = (n_samples / (n_samples + k)) * parent.C 

503 

504 # A(n) = coef(n) * C(n) 

505 if features is None: 

506 # single scalar value: easy 

507 An = coefs_ * Cn 

508 return _SmoothingDetails(A=An, B=Bn, C=Cn) 

509 else: 

510 # vector of coefs: we have to 'expand' the coefs array 

511 # because the coefs at this node apply for (features) 

512 # while coefs at parent node apply for (parent_features) 

513 An = np.zeros(Cn.shape, dtype=Cn.dtype) 

514 parent_features = np.array(parent_features) 

515 features = np.array(features) 

516 

517 # Thanks https://stackoverflow.com/a/8251757/7262247 ! 

518 index = np.argsort(parent_features) 

519 sorted_parents = parent_features[index] 

520 sorted_index = np.searchsorted(sorted_parents, features) 

521 

522 features_index = np.take(index, sorted_index, mode="clip") 
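# Illustration (made-up values): parent_features=[0, 2, 5] and
# features=[2, 5] give features_index=[1, 2], i.e. the positions of the
# child's features within the parent's feature vector, so the child coefs
# are scattered into slots 1 and 2 of An below.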

523 if np.any(parent_features[features_index] != features): 523 ↛ 525line 523 didn't jump to line 525, because the condition on line 523 was never true

524 # result = np.ma.array(features_index, mask=mask) 

525 raise ValueError( 

526 "Internal error - please report this." 

527 "One feature was found in the child " 

528 "node, that was not in the parent " 

529 "node." 

530 ) 

531 

532 if len(features_index) > 0: 

533 An[features_index] = coefs_ * Cn[features_index] 

534 

535 return _SmoothingDetails(A=An, B=Bn, C=Cn) 

536 

537 def _smooth( 

538 node_id, 

539 parent_features=None, 

540 parent_coefs: _SmoothingDetails = None, 

541 parent_intercept: _SmoothingDetails = None, 

542 ): 

543 # Gather all info on this node 

544 # --base regression tree 

545 node_info = dct["nodes"][node_id] 

546 left_id = node_info["left_child"] 

547 right_id = node_info["right_child"] 

548 # --additional model 

549 node_model = old_node_models[node_id] 

550 # --samples 

551 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples) 

552 n_samples_at_this_node = samples_at_this_node.shape[0] 

553 

554 # Note: should be equal to tree.n_node_samples[node_id] 

555 if n_samples_at_this_node != self.tree_.n_node_samples[node_id]: 555 ↛ 556line 555 didn't jump to line 556, because the condition on line 555 was never true

556 raise ValueError("n_samples_at_this_node: Unexpected value, please report it as an issue.") 

557 

558 y_true_this_node = y_train_all[samples_at_this_node] 

559 X_this_node = X_train_all[samples_at_this_node, :] 

560 

561 # (1) smooth current node 

562 parent_features = parent_features if parent_features is not None else None 

563 is_constant_leaf = False 

564 if left_id == _tree.TREE_LEAF and isinstance(node_model, ConstantLeafModel): 

565 is_constant_leaf = True 

566 node_features = () 

567 smoothed_features = parent_features if parent_features is not None else node_features 

568 node_coefs = () 

569 node_intercept = dct["values"][node_id] 

570 

571 # Extract the unique scalar value 

572 if len(node_intercept) != 1: 572 ↛ 573line 572 didn't jump to line 573, because the condition on line 572 was never true

573 raise ValueError("node_intercept: Unexpected value: %s, please report it as an issue." % node_intercept) 

574 

575 node_intercept = node_intercept.item() 

576 

577 else: 

578 # A leaf LinRegLeafModel or a split SplitNodeModel 

579 node_features = node_model.features 

580 smoothed_features = parent_features if parent_features is not None else node_features 

581 node_coefs = node_model.model.coef_ 

582 node_intercept = node_model.model.intercept_ 

583 

584 # Create a new linear regression model with smoothed coefficients 

585 smoothed_sklearn_model = clone(self.leaf_model) 

586 smoothed_coefs = smooth__( 

587 node_coefs, 

588 features=node_features, 

589 n_samples=n_samples_at_this_node, 

590 parent=parent_coefs, 

591 parent_features=parent_features, 

592 ) 

593 smoothed_intercept = smooth__(node_intercept, n_samples=n_samples_at_this_node, parent=parent_intercept) 

594 smoothed_sklearn_model.coef_ = smoothed_coefs.A + smoothed_coefs.B 

595 smoothed_sklearn_model.intercept_ = smoothed_intercept.A + smoothed_intercept.B 

596 

597 # Finally update the node 

598 if is_constant_leaf: 

599 smoothed_node_model = LinRegLeafModel(smoothed_features, smoothed_sklearn_model, None) 

600 else: 

601 smoothed_node_model = copy(node_model) 

602 smoothed_node_model.features = smoothed_features 

603 smoothed_node_model.model = smoothed_sklearn_model 

604 

605 # Remember the new smoothed model 

606 new_node_models[node_id] = smoothed_node_model 

607 

608 if left_id == _tree.TREE_LEAF: 

609 # If this is a leaf, update the prediction error on X 

610 y_pred_this_node = smoothed_node_model.predict(X_this_node) 

611 smoothed_node_model.error = err_metric(y_true_this_node, y_pred_this_node) 

612 

613 else: 

614 # If this is a split node - recurse on each subtree 

615 _smooth( 

616 left_id, 

617 parent_features=smoothed_features, 

618 parent_coefs=smoothed_coefs, 

619 parent_intercept=smoothed_intercept, 

620 ) 

621 _smooth( 

622 right_id, 

623 parent_features=smoothed_features, 

624 parent_coefs=smoothed_coefs, 

625 parent_intercept=smoothed_intercept, 

626 ) 

627 

628 # Update the error using the same formula as the one we used

629 # in build_models_and_get_pruning_info 

630 y_pred_children = predict_from_leaves_no_smoothing(self.tree_, new_node_models, X_this_node) 

631 err_children = err_metric(y_true_this_node, y_pred_children) 

632 

633 # the total number of parameters is the sum of params in each 

634 # branch PLUS 1 for the split 

635 n_params_splitmodel = new_node_models[left_id].n_params + new_node_models[right_id].n_params + 1 

636 smoothed_node_model.n_params = n_params_splitmodel 

637 # do not adjust the error now, simply store the raw one 

638 # smoothed_node_model.error = compute_adjusted_error( 

639 # err_children, n_samples_at_this_node, n_params_splitmodel) 

640 smoothed_node_model.error = err_children 

641 

642 return 

643 

644 # smooth the whole tree 

645 _smooth(0) 

646 

647 # use the new node models now 

648 self.node_models = new_node_models 

649 

650 # remember the smoothing constant installed 

651 self.installed_smoothing_constant = smoothing_constant 

652 return 

653 

654 def denormalize(self, x_scaler, y_scaler): 

655 """ 

656 De-normalizes this model according to the provided x and y normalization scalers. 

657 Currently only StandardScaler is supported.

658 

659 :param x_scaler: a StandardScaler or None 

660 :param y_scaler: a StandardScaler or None 

661 :return: 

662 """ 

663 # perform denormalization 

664 self._denormalize_tree(x_scaler, y_scaler) 

665 

666 def _denormalize_tree(self, x_scaler, y_scaler): 

667 """ 

668 De-normalizes all models in the tree 

669 :return: 

670 """ 

671 old_tree = self.tree_ 

672 old_node_models = self.node_models 

673 

674 # Get all information to create a copy of the inner tree. 

675 # Note: _tree.copy() is gone so we use the pickle way 

676 # --- Base info: nb features, nb outputs, output classes 

677 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631 

678 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs) 

679 # So these remain, we are just interested in changing the node-related arrays 

680 new_tree = _tree.Tree(*old_tree.__reduce__()[1]) 

681 

682 # --- Node info 

683 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637 

684 dct = old_tree.__getstate__().copy() 

685 

686 # create empty new structures for nodes and models 

687 n_shape = dct["nodes"].shape 

688 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype) 

689 v_shape = dct["values"].shape 

690 new_values = _empty_contig_ar(v_shape, dtype=dct["values"].dtype) 

691 m_shape = old_node_models.shape 

692 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_models.dtype) 

693 

694 def _denormalize(node_id, x_scaler, y_scaler): 

695 """ 

696 denormalizes the subtree below node `node_id`. 

697 

698 - `new_nodes[node_id]` is filled with a copy of the old node 

699 (see old_node.dtype to see the various fields). If the node is 

700 a split node, the split threshold 

701 `new_nodes[node_id]['threshold']` is denormalized. 

702 

703 - `new_values[node_id]` is filled with a denormalized copy of the 

704 old constant prediction at this node. Reminder: this constant 

705 prediction is actually used only when on the leaves now, but 

706 we keep it for ref. 

707 

708 - `new_node_models[node_id]` is filled with a copy of the old 

709 model `old_node_models[node_id]`, that is denormalized if it 

710 is not a constant leaf 

711 

712 :param node_id: 

713 :return: (nothing) 

714 """ 

715 # use the old tree to walk 

716 old_node = dct["nodes"][node_id] 

717 left_id = old_node['left_child'] 

718 right_id = old_node['right_child'] 

719 

720 # Create the new node with a copy of the old 

721 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so. 

722 new_model = copy(old_node_models[node_id]) 

723 new_node_models[node_id] = new_model 

724 

725 # Create the new value by de-scaling y 

726 if y_scaler is not None: 

727 # Note: if this is a split node with a linear regression model 

728 # the value will never be used. However to preserve consistency 

729 # of the whole values structure and for debugging purposes, we 

730 # choose this safe path of denormalizing ALL. 

731 # TODO we could also do it at once outside of the recursive 

732 # calls, but we should check for side effects 

733 new_values[node_id] = y_scaler.inverse_transform( 

734 dct["values"][node_id] 

735 ) 

736 else: 

737 # no denormalization: simply copy 

738 new_values[node_id] = dct["values"][node_id] 

739 

740 if left_id == _tree.TREE_LEAF: 

741 # ... and right_id == _tree.TREE_LEAF 

742 # => node_id is a leaf 

743 if isinstance(new_model, ConstantLeafModel): 

744 # nothing to do: we already re-scaled the value 

745 return 

746 elif isinstance(new_model, LinRegLeafModel): 

747 # denormalize model 

748 new_model.denormalize(x_scaler, y_scaler) 

749 return 

750 else: 

751 raise TypeError("Internal error - Leaves can only be "

752 "constant or linear regression") 

753 else: 

754 # this is a split node, denormalize each subtree 

755 _denormalize(left_id, x_scaler, y_scaler) 

756 _denormalize(right_id, x_scaler, y_scaler) 

757 

758 # denormalize the split value if needed 

759 if x_scaler is not None: 

760 split_feature = old_node['feature'] 

761 # The denormalizer requires a vector with all the features, 

762 # even if we only want to denormalize one. 

763 # -- put split value in a vector where it has pos 'feature' 

764 old_threshold_and_zeros = np.zeros((self.n_features_, ), dtype=dct["nodes"]['threshold'].dtype) 

765 old_threshold_and_zeros[split_feature] = old_node['threshold'] 

766 # -- denormalize the vector and retrieve value 'feature' 

767 new_nodes[node_id]['threshold'] = x_scaler.inverse_transform(old_threshold_and_zeros)[split_feature] 

768 else: 

769 # no denormalization: simply copy 

770 new_nodes[node_id]['threshold'] = old_node['threshold'] 

771 

772 if isinstance(new_model, SplitNodeModel): 

773 # denormalize model at split node too, even if it is not 

774 # always used (depending on smoothing mode) 

775 new_model.denormalize(x_scaler, y_scaler) 

776 else: 

777 raise TypeError("Internal error: all intermediate nodes "

778 "should be SplitNodeModel") 

779 

780 return 

781 

782 # denormalize the whole tree and put the result in (new_nodes, 

783 # new_values, new_node_models) recursively 

784 _denormalize(0, x_scaler, y_scaler) 

785 

786 # complete definition of the new tree 

787 dct["nodes"] = new_nodes 

788 dct["values"] = new_values 

789 new_tree.__setstate__(dct) 

790 

791 # update the self fields 

792 # self.features_usage 

793 self.tree_ = new_tree 

794 self.node_models = new_node_models 

795 

796 def compress_features(self): 

797 """ 

798 Compresses the model and returns a vector of required feature indices. 

799 This model input will then be X[:, features] instead of X. 

800 """ 

801 used_features = sorted(self.features_usage.keys()) 

802 new_features_lookup_dct = {old_feature_idx: i for i, old_feature_idx in enumerate(used_features)} 

803 

804 if used_features == list(range(self.n_features_in_)): 

805 # NOTHING TO DO: we need all features 

806 return used_features 

807 

808 # Otherwise, we can compress. For this we have to create a copy of the

809 # tree because we will change its internals 

810 old_tree = self.tree_ 

811 old_node_modls = self.node_models 

812 

813 # Get all information to create a copy of the inner tree. 

814 # Note: _tree.copy() is gone so we use the pickle way 

815 # --- Base info: nb features, nb outputs, output classes 

816 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L631 

817 # [1] = (self.n_features, sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), self.n_outputs) 

818 # So these remain, we are just interested in changing the node-related arrays 

819 new_tree = _tree.Tree(*old_tree.__reduce__()[1]) 

820 

821 # --- Node info 

822 # see https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L637 

823 dct = old_tree.__getstate__().copy() 

824 

825 # create empty new structures for nodes and models 

826 n_shape = dct["nodes"].shape 

827 new_nodes = _empty_contig_ar(n_shape, dtype=dct["nodes"].dtype) 

828 m_shape = old_node_modls.shape 

829 new_node_models = _empty_contig_ar(m_shape, dtype=old_node_modls.dtype) 

830 

831 def _compress_features(node_id): 

832 """ 

833 

834 Parameters 

835 ---------- 

836 node_id 

837 

838 Returns 

839 ------- 

840 nothing; the new tree structures are updated in place

841 

842 """ 

843 # use the old tree to walk 

844 old_node = dct["nodes"][node_id] 

845 left_id = old_node["left_child"] 

846 right_id = old_node["right_child"] 

847 

848 # Create the new node with a copy of the old 

849 new_nodes[node_id] = old_node # this is an entire row so it is probably copied already by doing so. 

850 new_model = copy(old_node_modls[node_id]) 

851 new_node_models[node_id] = new_model 

852 

853 if left_id == _tree.TREE_LEAF: 

854 # ... and right_id == _tree.TREE_LEAF 

855 # => node_id is a leaf 

856 if isinstance(new_model, ConstantLeafModel): 

857 # no features used 

858 return 

859 elif isinstance(new_model, LinRegLeafModel): 

860 # compress that model 

861 new_model.reindex_features(new_features_lookup_dct) 

862 return 

863 else: 

864 raise TypeError("Internal error - Leaves can only be " "constant or linear regression") 

865 else: 

866 

867 _compress_features(left_id) 

868 _compress_features(right_id) 

869 

870 # store the new split feature index in the node 

871 new_nodes[node_id]["feature"] = new_features_lookup_dct[old_node["feature"]] 

872 

873 if not isinstance(new_model, SplitNodeModel): 

874 raise TypeError("Internal error: all intermediate nodes " "should be SplitNodeModel") 

875 

876 # TODO now that split node models have a linear regression 

877 # model too, we should update them here 

878 

879 return 

880 

881 _compress_features(0) 

882 

883 # complete definition of the new tree 

884 dct["nodes"] = new_nodes 

885 new_tree.__setstate__(dct) 

886 

887 # update the self fields 

888 self.features_usage = {new_features_lookup_dct[k]: v for k, v in self.features_usage.items()} 

889 self.tree_ = new_tree 

890 self.node_models = new_node_models 

891 self.n_features_in_ = len(self.features_usage) 

892 

893 # return the vector of used features 

894 return used_features 

895 

896 @property 

897 def feature_importances_(self): 

898 """Return the feature importances (the higher, the more important). 

899 

900 Returns 

901 ------- 

902 feature_importances_ : array, shape = [n_features] 

903 """ 

904 check_is_fitted(self, "tree_") 

905 

906 # TODO adapt ? 

907 # features = np.array([self.features_usage[k] for k in sorted(self.features_usage.keys())], dtype=int) 

908 features = self.tree_.compute_feature_importances() 

909 

910 return features 

911 

912 def predict(self, X, check_input=True, smooth_predictions=None, smoothing_constant=None): 

913 """Predict class or regression value for X. 

914 

915 For a classification model, the predicted class for each sample in X is 

916 returned. For a regression model, the predicted value based on X is 

917 returned. 

918 

919 Parameters 

920 ---------- 

921 X : numpy.ndarray, shape (n_samples, n_features) 

922 The input samples. Internally, it will be converted to 

923 ``dtype=np.float32`` and if a sparse matrix is provided 

924 to a sparse ``csr_matrix``. 

925 check_input : boolean, (default=True) 

926 Allows bypassing several input checks.

927 Don't use this parameter unless you know what you are doing.

928 smooth_predictions : boolean, (default=None) 

929 None means "use the model's own use_smoothing configuration"

930 smoothing_constant: int, (default=self.smoothing_constant) 

931 Smoothing constant used when smooth_predictions is True. During 

932 smoothing, child node models are recursively mixed with their 

933 parent node models, and the mix is done using a weighted sum. The 

934 weight given to the child model is the number of training samples 

935 that reached its node, while the weight given to the parent model 

936 is the smoothing constant. Therefore it can be seen as an 

937 equivalent number of samples that parent models represent when 

938 injected into the recursive weighted sum. 
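For illustration (made-up numbers): with smoothing_constant k=15, a leaf
prediction of 3.0 built from n=10 samples and a parent prediction of 1.0
are mixed as (10*3.0 + 15*1.0)/(10 + 15) = 1.8.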

939 

940 Returns 

941 ------- 

942 y : array of shape = [n_samples] or [n_samples, n_outputs] 

943 The predicted classes, or the predict values. 

944 """ 

945 check_is_fitted(self, "tree_") 

946 

947 # If this is just a constant node only check the input's shape. 

948 if self.n_features_in_ == 0: 948 ↛ 950line 948 didn't jump to line 950, because the condition on line 948 was never true

949 # perform the input checking manually to set ensure_min_features=0 

950 X = check_array(X, dtype=DTYPE, accept_sparse="csr", ensure_min_features=0) 

951 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): 

952 raise ValueError("No support for np.int64 index based " "sparse matrices") 

953 # skip it then 

954 check_input = False 

955 

956 # validate and convert dtype 

957 X = self._validate_X_predict(X, check_input) 

958 

959 # -------- This is the only change wrt parent class. TODO maybe rather replace self.tree_ with a proxy --------- 

960 if smooth_predictions is None: 

961 # Default: smooth prediction at prediction time if configured as 

962 # such in the model. Note: if self.use_smoothing == 'installed', 

963 # models are already smoothed models, so no need to smooth again 

964 smooth_predictions = self.use_smoothing == "on_prediction" 

965 else: 

966 # user provided an explicit value for smooth_predictions 

967 if not smooth_predictions and self.use_smoothing == "installed": 967 ↛ 968line 967 didn't jump to line 968, because the condition on line 967 was never true

968 raise ValueError( 

969 "Smoothing has been pre-installed on this " 

970 "tree, it is not anymore possible to make " 

971 "predictions without smoothing" 

972 ) 

973 

974 if smooth_predictions and smoothing_constant is None: 974 ↛ 977line 974 didn't jump to line 977, because the condition on line 974 was never true

975 # default parameter for smoothing is the one defined in the model 

976 # (with possible ratio) 

977 smoothing_constant = self._get_smoothing_constant_to_use(X) 

978 

979 # Do not use the embedded tree (like in super): it has been pruned but 

980 # still has constant nodes 

981 # proba = self.tree_.predict(X) 

982 if not smooth_predictions: 

983 proba = predict_from_leaves_no_smoothing(self.tree_, self.node_models, X) 

984 else: 

985 proba = predict_from_leaves(self, X, smoothing=True, smoothing_constant=smoothing_constant) 

986 if len(proba.shape) < 2: 

987 proba = proba.reshape(-1, 1) 

988 # ------------------------------------------ 

989 

990 n_samples = X.shape[0] 

991 

992 # Classification 

993 if is_classifier(self): 993 ↛ 994line 993 didn't jump to line 994, because the condition on line 993 was never true

994 if self.n_outputs_ == 1: 

995 return self.classes_.take(np.argmax(proba, axis=1), axis=0) 

996 else: 

997 predictions = np.zeros((n_samples, self.n_outputs_)) 

998 for k in range(self.n_outputs_): 

999 predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0) 

1000 return predictions 

1001 

1002 # Regression 

1003 else: 

1004 if self.n_outputs_ == 1: 1004 ↛ 1007line 1004 didn't jump to line 1007, because the condition on line 1004 was never false

1005 return proba[:, 0] 

1006 else: 

1007 return proba[:, :, 0] 

1008 

1009 

1010class ConstantLeafModel: 

1011 """ 

1012 Represents the additional information about a leaf node that is not pruned. 

1013 It contains the error associated with the training samples at this node. 

1014 

1015 Note: the constant value is not stored here, as it is already available in 

1016 the sklearn tree struct. So to know the prediction at this node, use 

1017 `tree.value[node_id]` 

1018 """ 

1019 

1020 __slots__ = ("error",) 

1021 

1022 def __init__(self, error): 

1023 self.error = error 

1024 

1025 @property 

1026 def n_params(self) -> int: 

1027 """ 

1028 Returns the number of parameters used by this model, including the 

1029 constant driver 

1030 """ 

1031 return 1 

1032 

1033 @staticmethod 

1034 def predict_cstt(tree, node_id, n_samples): 

1035 """ 

1036 This static method is a helper to get an array of constant predictions 

1037 associated with this node. 

1038 

1039 Parameters 

1040 ---------- 

1041 tree 

1042 node_id 

1043 n_samples 

1044 

1045 Returns 

1046 ------- 

1047 

1048 """ 

1049 cstt_prediction = tree.value[node_id] 

1050 

1051 # cstt_prediction can be multioutput so it is an array. replicate it 

1052 from numpy import matlib # note: np.matlib not available in 1.10.x 
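# (an assumed equivalent without matlib: np.tile(cstt_prediction, (n_samples, 1)))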

1053 

1054 return matlib.repmat(cstt_prediction, n_samples, 1) 

1055 

1056 

1057class LinRegNodeModel(DeNormalizableMixIn): 

1058 """ 

1059 Represents the additional information about a tree node that contains a 

1060 linear regression model. 

1061 

1062 It contains 

1063 - the features used by the model, 

1064 - the scikit learn model object itself, 

1065 - and the error for the training samples that reached this node 

1066 

1067 """ 

1068 __slots__ = ("features", "model", "error", "n_params") 

1069 

1070 def __init__(self, features, model, error) -> None: 

1071 self.features = features 

1072 self.model = model 

1073 self.error = error 

1074 self.n_params = len(self.features) + 1 

1075 

1076 def to_text(self, feature_names=None, target_name=None, precision=3, 

1077 line_breaks=False): 

1078 """ Returns a text representation of the linear regression model """ 

1079 return linreg_model_to_text(self.model, feature_names=feature_names, 

1080 target_name=target_name, 

1081 precision=precision, 

1082 line_breaks=line_breaks) 

1083 

1084 def reindex_features(self, new_features_lookup_dct): 

1085 """ 

1086 Reindexes the required features using the provided lookup dictionary. 

1087 

1088 Parameters 

1089 ---------- 

1090 new_features_lookup_dct 

1091 

1092 Returns 

1093 ------- 

1094 

1095 """ 

1096 self.features = [new_features_lookup_dct[f] for f in self.features] 

1097 

1098 def denormalize(self, 

1099 x_scaler: StandardScaler = None, 

1100 y_scaler: StandardScaler = None 

1101 ): 

1102 """ 

1103 De-normalizes the linear model. 

1104 

1105 :param x_scaler: 

1106 :param y_scaler: 

1107 :return: 

1108 """ 

1109 # create a clone of the x scaler with only the used features 

1110 x_scaler = copy(x_scaler) 

1111 if len(self.features) > 0: 

1112 x_scaler.scale_ = x_scaler.scale_[self.features] 

1113 x_scaler.mean_ = x_scaler.mean_[self.features] 

1114 else: 

1115 # in that particular case, the above expression is not working 

1116 x_scaler.scale_ = x_scaler.scale_[0:0] 

1117 x_scaler.mean_ = x_scaler.mean_[0:0] 

1118 

1119 # use the denormalize function on the internal model 

1120 # TODO what if internal model is not able ? 

1121 self.model.denormalize(x_scaler, y_scaler) 

1122 

1123 def predict(self, X: np.ndarray): 

1124 """Performs a prediction for X, only using the required features.""" 

1125 if len(self.features) < 1 and isinstance(self.model, LinearRegression): 1125 ↛ 1129line 1125 didn't jump to line 1129, because the condition on line 1125 was never true

1126 # unfortunately LinearRegression models do not like it when no 

1127 # features are needed: their input validation requires at least 1. 

1128 # so we do it ourselves. 

1129 return self.model.intercept_ * np.ones((X.shape[0], 1)) 

1130 else: 

1131 return self.model.predict(X[:, self.features]) 

1132 

1133 

1134class LinRegLeafModel(LinRegNodeModel): 

1135 """ 

1136 Represents the additional information about a leaf node with a linear 

1137 regression model 

1138 """ 

1139 pass 

1140 

1141 

1142class SplitNodeModel(LinRegNodeModel): 

1143 """ 

1144 Represents the additional information about a split node, with a linear 

1145 regression model. 

1146 """ 

1147 __slots__ = ("n_params",) 

1148 

1149 def __init__(self, n_params, error, features, model): 

1150 self.n_params = n_params 

1151 super(SplitNodeModel, self).__init__(features=features, model=model, error=error) 

1152 

1153 

1154PRUNING_MULTIPLIER = 2 

1155# see https://github.com/bnjmn/weka/blob/master/weka/src/main/java/weka/classifiers/trees/m5/RuleNode.java#L124 

1156# TODO check why they use 2 instead of 1 (from the article) ?? 

1157 

1158 

1159def root_mean_squared_error(*args, **kwargs): 

1160 return np.sqrt(mean_squared_error(*args, **kwargs)) 

1161 

1162 

1163def prune_on_min_impurity(tree): 

1164 """ 

1165 Edits the given tree so as to prune subbranches that do not respect the min 

1166 impurity criterion. 

1167 

1168 The paper suggests doing this with 5%, but min_impurity_split is not

1169 available in 0.17 (and criterion is not std but mse so we have to square) 

1170 

1171 Parameters 

1172 ---------- 

1173 tree 

1174 

1175 Returns 

1176 ------- 

1177 

1178 """ 

1179 left_children = tree.children_left 

1180 right_children = tree.children_right 

1181 impurities = tree.impurity 

1182 

1183 # The paper suggests doing this with 5%, but min_impurity_split is not

1184 # available in 0.17 (and criterion is not std but mse so we have to square) 

1185 # TODO adapt the formula to criterion used and/or generalize with min_impurity_split_as_initial_ratio 

1186 root_impurity = impurities[0] 

1187 impurity_threshold = root_impurity * (0.05 ** 2) 
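# Illustration (made-up numbers): a root MSE impurity of 4.0 (std 2.0) gives a
# threshold of 4.0 * 0.0025 = 0.01, i.e. nodes whose std falls below
# 0.1 = 5% of the root std become leaves.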

1188 

1189 def stop_on_min_impurity(node_id): 

1190 # note: in the paper that is 0.05 but criterion is on std. Here 

1191 # impurity is mse so a squared equivalent of std. 

1192 left_id = left_children[node_id] 

1193 right_id = right_children[node_id] 

1194 if left_id != _tree.TREE_LEAF: # a split node 

1195 if impurities[node_id] < impurity_threshold: 

1196 # stop here, this will be a leaf 

1197 prune_children(node_id, tree) 

1198 else: 

1199 stop_on_min_impurity(left_id) 

1200 stop_on_min_impurity(right_id) 

1201 

1202 stop_on_min_impurity(0) 

1203 

1204 

1205def build_models_and_get_pruning_info( 

1206 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model, node_models, global_std_dev, use_pruning, node_id=0 

1207): 

1208 """ 

1209 

1210 Parameters 

1211 ---------- 

1212 tree : Tree 

1213 A tree that will be pruned on the way 

1214 X_train_all 

1215 y_train_all 

1216 nodes_to_samples 

1217 leaf_model 

1218 node_models 

1219 global_std_dev 

1220 use_pruning 

1221 node_id 

1222 

1223 Returns 

1224 ------- 

1225 a dictionary where the key is the feature index and the value is 

1226 the number of samples where this feature is used 

1227 """ 

1228 

1229 # Select the Error metric to compute model errors (used to compare node 

1230 # with subtree for pruning) 

1231 # TODO in Weka they use RMSE, but in papers they use MAE. could be a param 

1232 # TODO Shall we go further and store the residuals, or a bunch of metrics? 

1233 err_metric = root_mean_squared_error # mean_absolute_error 

1234 

1235 # Get the samples associated with this node 

1236 samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples) 

1237 n_samples_at_this_node = samples_at_this_node.shape[0] 

1238 y_true_this_node = y_train_all[samples_at_this_node] 

1239 

1240 # Is this a leaf node or a split node ? 

1241 left_node = tree.children_left[node_id] # this way we do not have to query it again in the else. 

1242 if left_node == _tree.TREE_LEAF: 

1243 # --> Current node is a LEAF. See is_leaf(node_id, tree) if you have doubts <-- 

1244 

1245 # -- create a linear model for this node 

1246 # leaves do not have the right to use any features since they have no 

1247 # subtree: keep the constant prediction 

1248 y_pred_this_node = ConstantLeafModel.predict_cstt(tree, node_id, y_true_this_node.shape[0]) 

1249 err_this_node = err_metric(y_true_this_node, y_pred_this_node) 

1250 

1251 # TODO when use_pruning = False, should we allow all features to be 

1252 # used instead of having to stick to the M5 rule of "only use a 

1253 # feature if the subtree includes a split with this feature" ? 

1254 # OR alternate proposal: should we transform the boolean use_pruning 

1255 # into a use_pruning_max_nb integer to say for example "only 2 level 

1256 # of pruning" ? 

1257 

1258 # -- store the model information 

1259 node_models[node_id] = ConstantLeafModel(err_this_node) 

1260 

1261 # -- return an empty dict for features used 

1262 return dict() # np.array([], dtype=int) 

1263 

1264 else: 

1265 # --> Current node is a SPLIT <-- 

1266 right_node = tree.children_right[node_id] 

1267 

1268 # (1) prune left and right subtree and get some information 

1269 features_l = build_models_and_get_pruning_info( 

1270 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model, 

1271 node_models, global_std_dev, use_pruning, node_id=left_node 

1272 ) 

1273 features_r = build_models_and_get_pruning_info( 

1274 tree, X_train_all, y_train_all, nodes_to_samples, leaf_model, 

1275 node_models, global_std_dev, use_pruning, node_id=right_node 

1276 ) 

1277 

1278 # (2) select only the samples that reach this node 

1279 X_this_node = X_train_all[samples_at_this_node, :] 

1280 

1281 # (3) Create a model for this node 

1282 # -- fit a linear regression model taking into account only the 

1283 # features used in the subtrees + the split one 

1284 # TODO should we normalize=True (variance scaling) or use a whole 

1285 # pipeline here ? Not sure.. 

1286 # estimators = [('scale', StandardScaler()), ('clf', model_type())]; 

1287 # pipe = Pipeline(estimators) 

1288 # skmodel_this_node = leaf_model_type(**leaf_model_params) 

1289 skmodel_this_node = clone(leaf_model) 

1290 

1291 # -- old - we used to only store the array of features 

1292 # selected_features = np.union1d(features_l, features_r) 

1293 # selected_features = np.union1d(selected_features, tree.feature[node_id]) 

1294 # -- new - we also gather the nb samples where this feature is used 

1295 selected_features_dct = features_l 

1296 for feature_id, n_samples_feat_used in features_r.items(): 

1297 if feature_id in selected_features_dct.keys(): 

1298 selected_features_dct[feature_id] += n_samples_feat_used 

1299 else: 

1300 selected_features_dct[feature_id] = n_samples_feat_used 

1301 selected_features_dct[tree.feature[node_id]] = n_samples_at_this_node 

1302 # -- use only the selected features, in the natural integer order 

1303 selected_features = sorted(selected_features_dct.keys()) 

1304 

1305 X_train_this_node = X_this_node[:, selected_features] 

1306 skmodel_this_node.fit(X_train_this_node, y_true_this_node) 

1307 # -- predict and compute error 

1308 y_pred_this_node = skmodel_this_node.predict(X_train_this_node) 

1309 err_this_node = err_metric(y_true_this_node, y_pred_this_node) 

1310 # -- create the object 

1311 # TODO the paper suggest to perform recursive feature elimination in 

1312 # this model until adjusted_err_model is minimal, is it same in Weka ? 

1313 model_this_node = LinRegLeafModel(selected_features, skmodel_this_node, err_this_node) 

1314 

1315 # (4) compute adj error criterion = ERR * (n+v)/(n-v) for both models 

1316 adjusted_err_model = compute_adjusted_error(err_this_node, n_samples_at_this_node, model_this_node.n_params) 

1317 

1318 # (5) predict and compute adj error for the combination of child models 

1319 # -- Note: this is recursive so the leaves may contain linear models 

1320 # already 

1321 y_pred_children = predict_from_leaves_no_smoothing(tree, node_models, X_this_node) 

1322 err_children = err_metric(y_true_this_node, y_pred_children) 

1323 

1324 # TODO the Weka implem (below) differs from the paper that suggests a 

1325 # weighted sum of adjusted errors. This may be an equivalent formulation, to be checked.

1326 # the total number of parameters if we do not prune, is the sum of 

1327 # params in each branch PLUS 1 for the split 

1328 n_params_splitmodel = node_models[left_node].n_params + node_models[right_node].n_params + 1 

1329 adjusted_err_children = compute_adjusted_error(err_children, n_samples_at_this_node, n_params_splitmodel) 

1330 

1331 # (6) compare and either prune at this node or keep the subtrees 

1332 std_dev_this_node = np.nanstd(y_true_this_node) 

1333 # note: the first criterion is now already checked before that 

1334 # function call, in `prune_on_min_impurity` 

1335 if use_pruning and ( 

1336 # TODO these parameters should be in the constructor 

1337 # see also 2 comments about min_impurity_split_as_initial_ratio 

1338 std_dev_this_node < (global_std_dev * 0.05) 

1339 or (adjusted_err_model <= adjusted_err_children) 

1340 or (adjusted_err_model < (global_std_dev * 0.00001)) # so this means very good R² already 

1341 ): 
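# Illustration (made-up numbers): if adjusted_err_model = 0.9 and
# adjusted_err_children = 1.1, the single model at this node is at least as
# good as its subtree once penalized for complexity, so we enter this branch
# and the subtree below is pruned.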

1342 # Choose model for this node rather than subtree model 

1343 # -- prune from this node on 

1344 removed_nodes = prune_children(node_id, tree) 

1345 node_models[removed_nodes] = _tree.TREE_UNDEFINED 

1346 

1347 # store the model information 

1348 node_models[node_id] = model_this_node 

1349 

1350 # update and return the features used 

1351 selected_features_dct = {k: n_samples_at_this_node for k in selected_features_dct.keys()} 

1352 return selected_features_dct 

1353 

1354 else: 

1355 # The subtrees are good or we do not want pruning: keep them. 

1356 # This node will remain a split, and will only contain the digest 

1357 # about the subtree 

1358 node_models[node_id] = SplitNodeModel( 

1359 n_params_splitmodel, err_children, selected_features, skmodel_this_node 

1360 ) 

1361 

1362 # return the features used 

1363 return selected_features_dct 

1364 

1365 

1366def compute_adjusted_error(err, n_samples, n_parameters, multiplier=PRUNING_MULTIPLIER): 

1367 """ 

1368 Return the penalized error obtained from `err`, the prediction error at a 

1369 given tree node, by multiplying it by ``(n + multiplier*p) / (n - p)``. `n` is the

1370 number of samples at this node and `p` is the number of parameters of the model.

1371 

1372 According to the original M5 paper, this is to penalize complex models 

1373 used for few samples. 

1374 

1375 Note that if ``n_samples <= n_parameters`` the denominator is zero or negative. 

1376 In this case an arbitrarily high penalization is used: the result is ``10 * err``.

1377 

1378 Parameters 

1379 ---------- 

1380 err : float 

1381 The model error for a given tree node. Typically the MAE. 

1382 

1383 n_samples : int 

1384 The number of samples at this node. 

1385 

1386 n_parameters : int 

1387 The number of parameters of the model at this node. 

1388 

1389 Returns 

1390 ------- 

1391 penalized_mae : float 

1392 The penalized error ``err * (n + multiplier*p) / (n - p)``, or ``10 * err`` if ``n <= p``.

1393 """ 

1394 # 

1395 if n_samples <= n_parameters:  [1395 ↛ 1398: line 1395 didn't jump to line 1398, because the condition on line 1395 was never true]

1396 # denominator is zero or negative: use a large factor to heavily penalize

1397 # this overly complex model.

1398 factor = 10.0 # Caution says Yong in his code 

1399 else: 

1400 factor = (n_samples + multiplier * n_parameters) / (n_samples - n_parameters) 

1401 

1402 return err * factor 

1403 

1404 
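# Illustrative sketch (not part of the original module listing), showing the
# arithmetic of compute_adjusted_error. The multiplier is passed explicitly
# here (2.0 is a made-up value; the real default is PRUNING_MULTIPLIER, defined
# earlier in this module).
_adj = compute_adjusted_error(err=1.0, n_samples=20, n_parameters=3, multiplier=2.0)
assert abs(_adj - (20 + 2.0 * 3) / (20 - 3)) < 1e-12  # factor (n + multiplier*p) / (n - p)

# When n_samples <= n_parameters the denominator would be <= 0, so the error is
# simply multiplied by the arbitrary factor 10, whatever the multiplier.
assert compute_adjusted_error(err=1.0, n_samples=3, n_parameters=5) == 10.0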

1405def predict_from_leaves_no_smoothing(tree, node_models, X): 

1406 """ 

1407 Returns the predictions for all samples in X, using the appropriate leaf

1408 model for each sample.

1409 

1410 This function 

1411 - uses tree.apply(X) to find the leaf that each sample in X falls into

1412 - then, for each of the leaves that are actually reached, uses

1413 node_models[leaf_id] to predict for the samples of X reaching that leaf

1414 

1415 This function assumes that node_models contains non-empty LinRegLeafModel 

1416 entries for all leaf nodes that will be reached by the samples X. 

1417 

1418 Parameters 

1419 ---------- 

1420 tree : Tree 

1421 The tree object. 

1422 node_models : array-like 

1423 The array containing node models 

1424 X : 2D array-like 

1425 The sample data. 

1426 

1427 Returns 

1428 ------- 

1429 y_predicted : 1D array-like 

1430 The prediction vector. 

1431 """ 

1432 # **** This does the job, but we have one execution of model.predict() per 

1433 # sample: probably not efficient 

1434 # sample_ids_to_leaf_node_ids = tree.apply(X) 

1435 # model_and_x = np.concatenate((node_models[leaf_node_ids].reshape(-1, 1), X), axis=1) 

1436 # def pred(m_and_x): 

1437 # return m_and_x[0].model.predict(m_and_x[1:].reshape(1,-1))[0] 

1438 # y_predicted = np.array(list(map(pred, model_and_x))) 

1439 

1440 # **** This should be faster because the number of calls to predict() is 

1441 # equal to the number of leaves touched 

1442 sample_ids_to_leaf_node_ids = tree.apply(X) 

1443 y_predicted = -np.ones(sample_ids_to_leaf_node_ids.shape, dtype=DOUBLE) 

1444 

1445 # -- find the unique list of leaves touched 

1446 leaf_node_ids, inverse_idx = np.unique(sample_ids_to_leaf_node_ids, return_inverse=True) 

1447 

1448 # -- for each leaf perform the prediction for samples reaching that leaf 

1449 for leaf_node_id in leaf_node_ids: 

1450 # get the indices of the samples reaching that leaf 

1451 sample_indices = np.nonzero(sample_ids_to_leaf_node_ids == leaf_node_id)[0] 

1452 

1453 # predict 

1454 node_model = node_models[leaf_node_id] 

1455 if isinstance(node_model, LinRegLeafModel): 

1456 y_predicted[sample_indices] = np.ravel(node_model.predict(X[sample_indices, :])) 

1457 else: 

1458 # isinstance(node_model, ConstantLeafModel) 

1459 y_predicted[sample_indices] = tree.value[leaf_node_id] 

1460 

1461 # **** Recursive strategy: not used anymore 

1462 # left_node = tree.children_left[node_id] # this way we do not have to query it again in the else. 

1463 # if left_node == _tree.TREE_LEAF: 

1464 # # --> Current node is a LEAF. See is_leaf(node_id, tree) <-- 

1465 # y_predicted = node_models[node_id].model.predict() 

1466 # else: 

1467 # # --> Current node is a SPLIT <-- 

1468 # right_node = tree.children_right[node_id] 

1469 # 

1470 # 

1471 # samples_at_this_node = get_samples_at_node(node_id, nodes_to_samples) 

1472 # y_true_this_node = y_train_all[samples_at_this_node] 

1473 # # As surprising as it may seem, in numpy [samples_at_this_node, selected_features] does something else. 

1474 # X_train_this_node = X_train_all[samples_at_this_node, :][:, selected_features] 

1475 # 

1476 # X_left 

1477 

1478 return y_predicted 

1479 

1480 
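# Illustrative sketch (not part of the original module listing) of the
# "one predict() call per touched leaf" dispatch pattern used above, with
# hypothetical leaf ids and a constant value standing in for each leaf model.
_leaf_of_sample = np.array([3, 5, 3, 3, 5])   # what tree.apply(X) would return
_leaf_constant = {3: 10.0, 5: -1.0}           # stand-in for node_models
_y_demo = -np.ones(_leaf_of_sample.shape, dtype=DOUBLE)
for _leaf in np.unique(_leaf_of_sample):
    _idx = np.nonzero(_leaf_of_sample == _leaf)[0]  # samples reaching this leaf
    _y_demo[_idx] = _leaf_constant[_leaf]
assert list(_y_demo) == [10.0, -1.0, 10.0, 10.0, -1.0]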

1481def predict_from_leaves(m5p, X, smoothing=True, smoothing_constant=15): 

1482 """ 

1483 Predicts using the M5P tree, without using the compiled sklearn 

1484 `tree.apply` subroutine. 

1485 

1486 The main purpose of this function is to apply smoothing to an M5P model tree

1487 where smoothing has not been pre-installed on the models. For example, to

1488 enable a model to be used both with and without smoothing for comparison

1489 purposes, or for models whose leaves are not linear models and for which,

1490 therefore, no pre-installation method exists.

1491 

1492 Note: this method is slower than `predict_from_leaves_no_smoothing` when 

1493 `smoothing=False`. 

1494 

1495 Parameters 

1496 ---------- 

1497 m5p : M5Prime 

1498 The model to use for prediction 

1499 X : array-like 

1500 The input data 

1501 smoothing : bool 

1502 Whether to apply smoothing 

1503 smoothing_constant : int 

1504 The smoothing constant `k` used as the prediction weight of the parent node

1505 model (k=15 in the original articles).

1506 

1507 

1508 Returns 

1509 ------- 

1510 y_predicted : 2D array of shape (n_samples, 1). The prediction vector.

1511 """ 

1512 # validate and converts dtype just in case this was directly called 

1513 # e.g. in unit tests 

1514 X = m5p._validate_X_predict(X, check_input=True) 

1515 

1516 tree = m5p.tree_ 

1517 node_models = m5p.node_models 

1518 nb_samples = X.shape[0] 

1519 y_predicted = -np.ones((nb_samples, 1), dtype=DOUBLE) 

1520 

1521 # sample_ids_to_leaf_node_ids = tree.apply(X) 

1522 def smooth_predictions(ancestor_nodes, X_at_node, y_pred_at_node): 

1523 # note: y_pred_at_node can be a constant

1524 current_node_model_id = ancestor_nodes[-1] 

1525 for _i, parent_model_id in enumerate(reversed(ancestor_nodes[:-1])): 

1526 # warning: this is the nb of TRAINING samples at this node 

1527 node_nb_train_samples = tree.n_node_samples[current_node_model_id] 

1528 parent_model = node_models[parent_model_id] 

1529 parent_predictions = parent_model.predict(X_at_node) 

1530 y_pred_at_node = (node_nb_train_samples * y_pred_at_node + smoothing_constant * parent_predictions) / ( 

1531 node_nb_train_samples + smoothing_constant 

1532 ) 

1533 current_node_model_id = parent_model_id 

1534 

1535 return y_pred_at_node 

1536 

1537 def apply_prediction(node_id, ids=None, ancestor_nodes=None): 

1538 first_call = False 

1539 if ids is None: 

1540 ids = slice(None) 

1541 first_call = True 

1542 if smoothing: 

1543 if ancestor_nodes is None: 

1544 ancestor_nodes = [node_id] 

1545 else: 

1546 ancestor_nodes.append(node_id) 

1547 

1548 left_id = tree.children_left[node_id] 

1549 if left_id == _tree.TREE_LEAF: 

1550 # ... and tree.children_right[node_id] == _tree.TREE_LEAF 

1551 # LEAF node: predict 

1552 # predict 

1553 node_model = node_models[node_id] 

1554 # assert (ids == (sample_ids_to_leaf_node_ids == node_id)).all() 

1555 if isinstance(node_model, LinRegLeafModel): 

1556 X_at_node = X[ids, :] 

1557 predictions = node_model.predict(X_at_node) 

1558 else: 

1559 # isinstance(node_model, ConstantLeafModel) 

1560 predictions = tree.value[node_id] 

1561 if smoothing: 

1562 X_at_node = X[ids, :] 

1563 

1564 # finally apply smoothing 

1565 if smoothing: 

1566 y_predicted[ids] = smooth_predictions(ancestor_nodes, X_at_node, predictions) 

1567 else: 

1568 y_predicted[ids] = predictions 

1569 else: 

1570 right_id = tree.children_right[node_id] 

1571 # non-leaf node: split samples and recurse 

1572 left_group = np.zeros(nb_samples, dtype=bool) 

1573 left_group[ids] = X[ids, tree.feature[node_id]] <= tree.threshold[node_id] 

1574 right_group = (~left_group) if first_call else (ids & (~left_group)) 

1575 

1576 # important: copy ancestor_nodes BEFORE calling anything, otherwise 

1577 # it will be modified 

1578 apply_prediction( 

1579 left_id, ids=left_group, ancestor_nodes=(ancestor_nodes.copy() if ancestor_nodes is not None else None) 

1580 ) 

1581 apply_prediction(right_id, ids=right_group, ancestor_nodes=ancestor_nodes) 

1582 

1583 # recurse to fill all predictions 

1584 apply_prediction(0) 

1585 

1586 return y_predicted 

1587 

1588 
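# Illustrative sketch (not part of the original module listing) of one step of
# the smoothing update applied at each ancestor above: with k=15, a leaf
# trained on n=10 samples predicting 2.0 and a parent model predicting 5.0,
# the smoothed prediction is (10 * 2.0 + 15 * 5.0) / (10 + 15) = 3.8.
_k, _n, _y_leaf, _y_parent = 15, 10, 2.0, 5.0
_y_smoothed = (_n * _y_leaf + _k * _y_parent) / (_n + _k)
assert abs(_y_smoothed - 3.8) < 1e-12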

1589def prune_children(node_id, tree): 

1590 """ 

1591 Prunes the children of node_id in the given `tree`. 

1592 

1593 Inspired by https://github.com/shenwanxiang/sklearn-post-prune-tree/blob/master/tree_prune.py#L122 

1594 

1595 IMPORTANT this relies on the fact that `children_left` and `children_right` 

1596 are modifiable (and are the only things we need to modify to fix the

1597 tree). It seems to be the case as of now. 

1598 

1599 Parameters 

1600 ---------- 

1601 node_id : int 

1602 tree : Tree 

1603 

1604 Returns 

1605 ------- 

1606 removed_nodes : List[int] 

1607 A list of removed nodes 

1608 """ 

1609 

1610 def _prune_below(_node_id): 

1611 left_child = tree.children_left[_node_id] 

1612 right_child = tree.children_right[_node_id] 

1613 if left_child == _tree.TREE_LEAF: 

1614 # _node_id is a leaf: left_ & right_child say "leaf". Nothing to do

1615 return [] 

1616 else: 

1617 # Make sure everything is pruned below: children should be leaves 

1618 removed_l = _prune_below(left_child) 

1619 removed_r = _prune_below(right_child) 

1620 

1621 # -- First declare that the children are not merely leaves anymore: they do not exist at all

1622 for child in [left_child, right_child]: 

1623 

1624 if tree.children_left[child] != _tree.TREE_LEAF:  [1624 ↛ 1625: line 1624 didn't jump to line 1625, because the condition on line 1624 was never true]

1625 raise ValueError("Unexpected children_left: %s, please report it as issue." % child) 

1626 

1627 if tree.children_right[child] != _tree.TREE_LEAF:  [1627 ↛ 1628: line 1627 didn't jump to line 1628, because the condition on line 1627 was never true]

1628 raise ValueError("Unexpected children_right: %s, please report it as issue." % child) 

1629 

1630 tree.children_left[child] = _tree.TREE_UNDEFINED 

1631 tree.children_right[child] = _tree.TREE_UNDEFINED 

1632 

1633 # -- Then declare that current node is a leaf 

1634 tree.children_left[_node_id] = _tree.TREE_LEAF 

1635 tree.children_right[_node_id] = _tree.TREE_LEAF 

1636 

1637 # Return the list of removed nodes 

1638 return removed_l + removed_r + [left_child, right_child] 

1639 

1640 # Note: we do not change the node count here, as we'll clean all later. 

1641 # true_node_count = tree.node_count - sum(tree.children_left == _tree.TREE_UNDEFINED) 

1642 # tree.node_count -= 2*len(nodes_to_remove) 

1643 

1644 return _prune_below(node_id) 

1645 

1646 
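# Illustrative sketch (not part of the original module listing): prune_children
# only touches the `children_left` / `children_right` arrays, so a minimal
# hypothetical stand-in object is enough to show its effect. Node 0 is a split
# whose children 1 and 2 are leaves.
class _FakeTree:
    def __init__(self):
        self.children_left = np.array([1, _tree.TREE_LEAF, _tree.TREE_LEAF])
        self.children_right = np.array([2, _tree.TREE_LEAF, _tree.TREE_LEAF])

_fake = _FakeTree()
assert sorted(prune_children(0, _fake)) == [1, 2]
# node 0 is now a leaf, and its former children are marked as undefined
assert _fake.children_left[0] == _tree.TREE_LEAF
assert _fake.children_left[1] == _tree.TREE_UNDEFINED
assert _fake.children_right[2] == _tree.TREE_UNDEFINED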

1647def is_leaf(node_id, tree): 

1648 """ 

1649 Returns true if node with id `node_id` is a leaf in tree `tree`. 

1650 It is not actually used in this file because we always need the left child

1651 node id in our code. 

1652 

1653 But it is kept here as an easy way to remember how it works. 

1654 

1655 Parameters 

1656 ---------- 

1657 node_id : int 

1658 tree : Tree 

1659 

1660 Returns 

1661 ------- 

1662 _is_leaf : bool 

1663 A boolean flag, True if this node is a leaf. 

1664 """ 

1665 if node_id == _tree.TREE_LEAF or node_id == _tree.TREE_UNDEFINED: 

1666 raise ValueError("Invalid node_id %s" % node_id) 

1667 

1668 return tree.children_left[node_id] == _tree.TREE_LEAF 

1669 

1670 

1671def get_samples_at_node(node_id, nodes_to_samples): 

1672 """ 

1673 Return an array containing the ids of the samples for node `node_id`. 

1674 

1675 This method requires the user to 

1676 - first compute the decision path for the sample matrix X 

1678 - then convert it to CSC format

1678 

1679 >>> samples_to_nodes = estimator.decision_path(X) # returns a Scipy compressed sparse row matrix (CSR) 

1680 >>> nodes_to_samples = samples_to_nodes.tocsc() # we need the column equivalent (CSC) 

1681 >>> samples = get_samples_at_node(node_id, nodes_to_samples) 

1682 

1683 Parameters 

1684 ---------- 

1685 node_id : int 

1686 The node for which the list of samples is queried. 

1687 nodes_to_samples : csc_matrix 

1688 A boolean matrix in Compressed Sparse Column (CSC) format where rows are the

1689 samples and columns are the tree nodes. The matrix contains a 1 where a sample

1690 passes through a node on its decision path.

1691 

1692 Returns 

1693 ------- 

1694 samples : 1D array. The indices of the samples that reach node `node_id`.

1695 """ 

1696 return nodes_to_samples.indices[nodes_to_samples.indptr[node_id] : nodes_to_samples.indptr[node_id + 1]] 

1697 

1698 
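# Illustrative sketch (not part of the original module listing): with a
# hypothetical decision-path indicator for 3 samples and 3 nodes converted to
# CSC, the samples reaching node 1 are read directly from the column slice.
from scipy.sparse import csc_matrix

_paths = csc_matrix(np.array([[1, 1, 0],
                              [1, 0, 1],
                              [1, 1, 0]]))  # rows = samples, columns = nodes
assert list(get_samples_at_node(1, _paths)) == [0, 2]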

1699class M5Prime(M5Base, RegressorMixin): 

1700 """An M5' (M five prime) model tree regressor. 

1701 

1702 The original M5 algorithm was invented by R. Quinlan [1]_. Y. Wang [2]_ made 

1703 improvements and named the resulting algorithm M5 Prime. 

1704 This implementation was inspired by Weka (https://github.com/bnjmn/weka) 

1705 M5Prime class, from Mark Hall. 

1706 

1707 See also 

1708 -------- 

1709 M5Base 

1710 

1711 References 

1712 ---------- 

1713 .. [1] Ross J. Quinlan, "Learning with Continuous Classes", 5th Australian 

1714 Joint Conference on Artificial Intelligence, pp343-348, 1992. 

1715 .. [2] Y. Wang and I. H. Witten, "Induction of model trees for predicting 

1716 continuous classes", Poster papers of the 9th European Conference 

1717 on Machine Learning, 1997. 

1718 """ 

1719 

1720 
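# Minimal usage sketch (not part of the original module listing), assuming only
# the standard scikit-learn estimator API inherited from BaseDecisionTree and
# RegressorMixin; the synthetic data and the function name are made up for
# illustration.
def _m5prime_usage_example():
    rng = np.random.RandomState(0)
    X_demo = rng.rand(200, 3)
    y_demo = X_demo @ np.array([1.0, -2.0, 0.5]) + 0.05 * rng.randn(200)
    reg = M5Prime().fit(X_demo, y_demo)
    return reg.predict(X_demo), reg.score(X_demo, y_demo)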

1721def _empty_contig_ar(shape, dtype): 

1722 """Return an empty contiguous array with given shape and dtype.""" 

1723 return np.ascontiguousarray(np.empty(shape, dtype=dtype))