 from warnings import warn
 
 from ..tree import ExtraTreeRegressor
-from ..utils import check_random_state, check_array
+from ..utils import (
+    check_random_state,
+    check_array,
+    gen_batches,
+    get_chunk_n_rows,
+)
 from ..utils.fixes import _joblib_parallel_args
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _num_samples
 from ..base import OutlierMixin
 
 from .bagging import BaseBagging
@@ -388,21 +393,69 @@ def score_samples(self, X):
                              "match the input. Model n_features is {0} and "
                              "input n_features is {1}."
                              "".format(self.n_features_, X.shape[1]))
-        n_samples = X.shape[0]
 
-        n_samples_leaf = np.zeros(n_samples, order="f")
-        depths = np.zeros(n_samples, order="f")
+        # Take the opposite of the scores as bigger is better (here less
+        # abnormal)
+        return -self._compute_chunked_score_samples(X)
+
+    @property
+    def threshold_(self):
+        if self.behaviour != 'old':
+            raise AttributeError("threshold_ attribute does not exist when "
+                                 "behaviour != 'old'")
+        warn("threshold_ attribute is deprecated in 0.20 and will"
+             " be removed in 0.22.", DeprecationWarning)
+        return self._threshold_
+
+    def _compute_chunked_score_samples(self, X):
+
+        n_samples = _num_samples(X)
 
         if self._max_features == X.shape[1]:
             subsample_features = False
         else:
             subsample_features = True
 
+        # We get as many rows as possible within our working_memory budget
+        # (defined by sklearn.get_config()['working_memory']) to store
+        # self._max_features in each row during computation.
+        #
+        # Note:
+        #  - this will get at least 1 row, even if 1 row of score will
+        #    exceed working_memory.
+        #  - this only accounts for temporary memory usage while loading
+        #    the data needed to compute the scores -- the returned scores
+        #    themselves are 1D.
+
+        chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
+                                        max_n_rows=n_samples)
+        slices = gen_batches(n_samples, chunk_n_rows)
+
+        scores = np.zeros(n_samples, order="f")
+
+        for sl in slices:
+            # compute score on the slices of test samples:
+            scores[sl] = self._compute_score_samples(X[sl], subsample_features)
+
+        return scores
+
+    def _compute_score_samples(self, X, subsample_features):
+        """Compute the score of each sample in X going through the extra trees.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix
+
+        subsample_features : bool
+            whether features should be subsampled
+        """
+        n_samples = X.shape[0]
+
+        depths = np.zeros(n_samples, order="f")
+
         for tree, features in zip(self.estimators_, self.estimators_features_):
-            if subsample_features:
-                X_subset = X[:, features]
-            else:
-                X_subset = X
+            X_subset = X[:, features] if subsample_features else X
+
             leaves_index = tree.apply(X_subset)
             node_indicator = tree.decision_path(X_subset)
             n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
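The chunking pattern above is self-contained enough to try outside the class. Here is a minimal sketch, assuming a toy stand-in for `_compute_score_samples`; the `16 * n_features` row-byte estimate is taken from the diff, and `gen_batches`/`get_chunk_n_rows` are the real utilities imported at the top of the file:

```python
import numpy as np
from sklearn.utils import gen_batches, get_chunk_n_rows

X = np.random.RandomState(0).rand(100000, 8)

# Ask for as many rows per chunk as fit in the working_memory budget
# (sklearn.get_config()['working_memory'], in MiB); always at least 1 row.
chunk_n_rows = get_chunk_n_rows(row_bytes=16 * X.shape[1],
                                max_n_rows=X.shape[0])

scores = np.zeros(X.shape[0], order="f")
for sl in gen_batches(X.shape[0], chunk_n_rows):
    # each `sl` is a slice object covering one chunk of rows;
    # the real code walks every tree on X[sl] here
    scores[sl] = X[sl].sum(axis=1)  # toy stand-in for _compute_score_samples
```

Because `get_chunk_n_rows` reads the global configuration, callers can tune the budget with `sklearn.config_context(working_memory=...)` around `score_samples` without any API change.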
@@ -418,19 +471,7 @@ def score_samples(self, X):
                 / (len(self.estimators_)
                    * _average_path_length([self.max_samples_]))
                 )
-
-        # Take the opposite of the scores as bigger is better (here less
-        # abnormal)
-        return -scores
-
-    @property
-    def threshold_(self):
-        if self.behaviour != 'old':
-            raise AttributeError("threshold_ attribute does not exist when "
-                                 "behaviour != 'old'")
-        warn("threshold_ attribute is deprecated in 0.20 and will"
-             " be removed in 0.22.", DeprecationWarning)
-        return self._threshold_
+        return scores
 
 
 def _average_path_length(n_samples_leaf):
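For readers following the math: `_average_path_length` (truncated above) computes c(n), the average path length of an unsuccessful search in a binary search tree from Liu et al.'s isolation forest paper, which normalizes the depths accumulated in the loop. A sketch of the core formula, omitting the small-n special cases the library handles (illustrative name, valid for n > 2):

```python
import numpy as np

def average_path_length(n):
    # c(n) = 2 * H(n - 1) - 2 * (n - 1) / n, where H(i) is the i-th
    # harmonic number, approximated by ln(i) + the Euler-Mascheroni constant
    n = np.asarray(n, dtype=np.float64)
    return 2.0 * (np.log(n - 1.0) + np.euler_gamma) - 2.0 * (n - 1.0) / n
```

The hunk above then maps the averaged depths through `2 ** -(depths / (n_estimators * c(max_samples_)))`, so raw scores near 1 flag anomalies and scores well below 0.5 flag normal points, before `score_samples` flips the sign so that bigger means less abnormal.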
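A possible end-to-end check of the change (sizes and the `behaviour='new'` flag are illustrative; the working_memory budget is the real knob this diff respects):

```python
import numpy as np
from sklearn import config_context
from sklearn.ensemble import IsolationForest

X = np.random.RandomState(42).rand(50000, 10)
clf = IsolationForest(behaviour='new', random_state=42).fit(X)

# score_samples now walks X in working_memory-sized chunks instead of
# allocating per-sample temporaries for the whole test set at once
with config_context(working_memory=64):  # budget in MiB
    scores = clf.score_samples(X)  # higher = less abnormal
```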