8000 [MRG + 1] Fix pprint ellipsis by NicolasHug · Pull Request #13436 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG + 1] Fix pprint ellipsis #13436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Apr 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 37 additions & 6 deletions sklearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from collections import defaultdict
import platform
import inspect
import re

import numpy as np

Expand Down Expand Up @@ -233,10 +234,13 @@ def set_params(self, **params):

return self

def __repr__(self):
def __repr__(self, N_CHAR_MAX=700):
# N_CHAR_MAX is the (approximate) maximum number of non-blank
# characters to render. We pass it as an optional parameter to ease
# the tests.

from .utils._pprint import _EstimatorPrettyPrinter

N_CHAR_MAX = 700 # number of non-whitespace or newline chars
N_MAX_ELEMENTS_TO_SHOW = 30 # number of elements to show in sequences

# use ellipsis for sequences with a lot of elements
Expand All @@ -246,10 +250,37 @@ def __repr__(self):

repr_ = pp.pformat(self)

# Use bruteforce ellipsis if string is very long
if len(''.join(repr_.split())) > N_CHAR_MAX: # check non-blank chars
lim = N_CHAR_MAX // 2
repr_ = repr_[:lim] + '...' + repr_[-lim:]
# Use bruteforce ellipsis when there are a lot of non-blank characters
n_nonblank = len(''.join(repr_.split()))
if n_nonblank > N_CHAR_MAX:
lim = N_CHAR_MAX // 2 # apprx number of chars to keep on both ends
regex = r'^(\s*\S){%d}' % lim
# The regex '^(\s*\S){%d}' % n
# matches from the start of the string until the nth non-blank
# character:
# - ^ matches the start of string
# - (pattern){n} matches n repetitions of pattern
# - \s*\S matches a non-blank char following zero or more blanks
left_lim = re.match(regex, repr_).end()
right_lim = re.match(regex, repr_[::-1]).end()

if '\n' in repr_[left_lim:-right_lim]:
# The left side and right side aren't on the same line.
# To avoid weird cuts, e.g.:
# categoric...ore',
# we need to start the right side with an appropriate newline
# character so that it renders properly as:
# categoric...
# handle_unknown='ignore',
# so we add [^\n]*\n which matches until the next \n
regex += r'[^\n]*\n'
right_lim = re.match(regex, repr_[::-1]).end()

ellipsis = '...'
if left_lim + len(ellipsis) < len(repr_) - right_lim:
# Only add ellipsis if it results in a shorter repr
repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:]

return repr_

def __getstate__(self):
Expand Down
80 changes: 71 additions & 9 deletions sklearn/utils/tests/test_pprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,16 +459,78 @@ def test_n_max_elements_to_show():
assert pp.pformat(gs) == expected


def test_length_constraint():
# When repr is still too long, use bruteforce ellipsis
# repr is a very long line so we don't check for equality here, just that
# ellipsis has been done. It's not the ellipsis from before because the
# number of elements in the dict is only 1.
vocabulary = {0: 'hello' * 1000}
vectorizer = CountVectorizer(vocabulary=vocabulary)
repr_ = vectorizer.__repr__()
assert '...' in repr_
def test_bruteforce_ellipsis():
# Check that the bruteforce ellipsis (used when the number of non-blank
# characters exceeds N_CHAR_MAX) renders correctly.

lr = LogisticRegression()

# test when the left and right side of the ellipsis aren't on the same
# line.
expected = """
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had, btw, expected that with a configurable N_CHAR_MAX you'd make this test case much simpler...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you mean testing with an estimator that has a shorted repr?

I agree it'd be better (I'll do it if that's what you meant)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Looking at the long, pathological cases give us a better sense of the user experience, but makes the tests harder to work with.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated, thanks it looks much better

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
in...
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""

expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=150)

# test with very small N_CHAR_MAX
# Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid
# weird reprs we still keep the whole line of the right part (after the
# ellipsis).
expected = """
Lo...
warm_start=False)"""

expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=4)

# test with N_CHAR_MAX == number of non-blank characters: In this case we
# don't want ellipsis
full_repr = lr.__repr__(N_CHAR_MAX=float('inf'))
n_nonblank = len(''.join(full_repr.split()))
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
assert '...' not in full_repr

# test with N_CHAR_MAX == number of non-blank characters - 10: the left and
# right side of the ellispsis are on different lines. In this case we
# want to expend the whole line of the right side
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_i...
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)

# test with N_CHAR_MAX == number of non-blank characters - 10: the left and
# right side of the ellispsis are on the same line. In this case we don't
# want to expend the whole line of the right side, just add the ellispsis
# between the 2 sides.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter...,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)

# test with N_CHAR_MAX == number of non-blank characters - 2: the left and
# right side of the ellispsis are on the same line, but adding the ellipsis
# would actually make the repr longer. So we don't add the ellipsis.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)

def test_builtin_prettyprinter():
# non regression test than ensures we can still use the builtin
Expand Down
0