diff --git a/sklearn/base.py b/sklearn/base.py index d13619617bde4..c4eb14aeca8c5 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -8,6 +8,7 @@ from collections import defaultdict import platform import inspect +import re import numpy as np @@ -233,10 +234,13 @@ def set_params(self, **params): return self - def __repr__(self): + def __repr__(self, N_CHAR_MAX=700): + # N_CHAR_MAX is the (approximate) maximum number of non-blank + # characters to render. We pass it as an optional parameter to ease + # the tests. + from .utils._pprint import _EstimatorPrettyPrinter - N_CHAR_MAX = 700 # number of non-whitespace or newline chars N_MAX_ELEMENTS_TO_SHOW = 30 # number of elements to show in sequences # use ellipsis for sequences with a lot of elements @@ -246,10 +250,37 @@ def __repr__(self): repr_ = pp.pformat(self) - # Use bruteforce ellipsis if string is very long - if len(''.join(repr_.split())) > N_CHAR_MAX: # check non-blank chars - lim = N_CHAR_MAX // 2 - repr_ = repr_[:lim] + '...' + repr_[-lim:] + # Use bruteforce ellipsis when there are a lot of non-blank characters + n_nonblank = len(''.join(repr_.split())) + if n_nonblank > N_CHAR_MAX: + lim = N_CHAR_MAX // 2 # apprx number of chars to keep on both ends + regex = r'^(\s*\S){%d}' % lim + # The regex '^(\s*\S){%d}' % n + # matches from the start of the string until the nth non-blank + # character: + # - ^ matches the start of string + # - (pattern){n} matches n repetitions of pattern + # - \s*\S matches a non-blank char following zero or more blanks + left_lim = re.match(regex, repr_).end() + right_lim = re.match(regex, repr_[::-1]).end() + + if '\n' in repr_[left_lim:-right_lim]: + # The left side and right side aren't on the same line. + # To avoid weird cuts, e.g.: + # categoric...ore', + # we need to start the right side with an appropriate newline + # character so that it renders properly as: + # categoric... 
+ # handle_unknown='ignore', + # so we add [^\n]*\n which matches until the next \n + regex += r'[^\n]*\n' + right_lim = re.match(regex, repr_[::-1]).end() + + ellipsis = '...' + if left_lim + len(ellipsis) < len(repr_) - right_lim: + # Only add ellipsis if it results in a shorter repr + repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:] + return repr_ def __getstate__(self): diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index f63e3373aa809..0fa536675ea38 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -459,16 +459,78 @@ def test_n_max_elements_to_show(): assert pp.pformat(gs) == expected -def test_length_constraint(): - # When repr is still too long, use bruteforce ellipsis - # repr is a very long line so we don't check for equality here, just that - # ellipsis has been done. It's not the ellipsis from before because the - # number of elements in the dict is only 1. - vocabulary = {0: 'hello' * 1000} - vectorizer = CountVectorizer(vocabulary=vocabulary) - repr_ = vectorizer.__repr__() - assert '...' in repr_ +def test_bruteforce_ellipsis(): + # Check that the bruteforce ellipsis (used when the number of non-blank + # characters exceeds N_CHAR_MAX) renders correctly. + + lr = LogisticRegression() + + # test when the left and right side of the ellipsis aren't on the same + # line. + expected = """ +LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + in... + multi_class='warn', n_jobs=None, penalty='l2', + random_state=None, solver='warn', tol=0.0001, verbose=0, + warm_start=False)""" + + expected = expected[1:] # remove first \n + assert expected == lr.__repr__(N_CHAR_MAX=150) + + # test with very small N_CHAR_MAX + # Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid + # weird reprs we still keep the whole line of the right part (after the + # ellipsis). + expected = """ +Lo... 
+ warm_start=False)""" + + expected = expected[1:] # remove first \n + assert expected == lr.__repr__(N_CHAR_MAX=4) + + # test with N_CHAR_MAX == number of non-blank characters: In this case we + # don't want ellipsis + full_repr = lr.__repr__(N_CHAR_MAX=float('inf')) + n_nonblank = len(''.join(full_repr.split())) + assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr + assert '...' not in full_repr + + # test with N_CHAR_MAX == number of non-blank characters - 10: the left and + # right side of the ellipsis are on different lines. In this case we + # want to expand the whole line of the right side + expected = """ +LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + intercept_scaling=1, l1_ratio=None, max_i... + multi_class='warn', n_jobs=None, penalty='l2', + random_state=None, solver='warn', tol=0.0001, verbose=0, + warm_start=False)""" + expected = expected[1:] # remove first \n + assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10) + + # test with N_CHAR_MAX == number of non-blank characters - 4: the left and + # right side of the ellipsis are on the same line. In this case we don't + # want to expand the whole line of the right side, just add the ellipsis + # between the 2 sides. + expected = """ +LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + intercept_scaling=1, l1_ratio=None, max_iter..., + multi_class='warn', n_jobs=None, penalty='l2', + random_state=None, solver='warn', tol=0.0001, verbose=0, + warm_start=False)""" + expected = expected[1:] # remove first \n + assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4) + # test with N_CHAR_MAX == number of non-blank characters - 2: the left and + # right side of the ellipsis are on the same line, but adding the ellipsis + # would actually make the repr longer. So we don't add the ellipsis. 
+ expected = """ +LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + intercept_scaling=1, l1_ratio=None, max_iter=100, + multi_class='warn', n_jobs=None, penalty='l2', + random_state=None, solver='warn', tol=0.0001, verbose=0, + warm_start=False)""" + expected = expected[1:] # remove first \n + assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2) def test_builtin_prettyprinter(): # non regression test than ensures we can still use the builtin