8000 Fix the last two GNU tests for fmt by sylvestre · Pull Request #8264 · uutils/coreutils · GitHub
[go: up one dir, main page]

Skip to content

Fix the last two GNU tests for fmt #8264

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/GnuTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ jobs:
sudo locale-gen --keep-existing sv_SE
sudo locale-gen --keep-existing sv_SE.UTF-8
sudo locale-gen --keep-existing en_US
sudo locale-gen --keep-existing en_US.UTF-8
sudo locale-gen --keep-existing ru_RU.KOI8-R

sudo update-locale
Expand Down
10 changes: 7 additions & 3 deletions src/uu/fmt/src/linebreak.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,11 @@ fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>(
let mut next_active_breaks = vec![];

let stretch = args.opts.width - args.opts.goal;
let minlength = args.opts.goal.max(stretch + 1) - stretch;
let minlength = if args.opts.goal <= 10 {
1
} else {
args.opts.goal.max(stretch + 1) - stretch
};
let mut new_linebreaks = vec![];
let mut is_sentence_start = false;
let mut least_demerits = 0;
Expand Down Expand Up @@ -384,11 +388,11 @@ fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a Wo
const BAD_INFTY: i64 = 10_000_000;
const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
// badness = BAD_MULT * abs(r) ^ 3
const BAD_MULT: f32 = 100.0;
const BAD_MULT: f32 = 200.0;
// DR_MULT is multiplier for delta-R between lines
const DR_MULT: f32 = 600.0;
// DL_MULT is penalty multiplier for short words at end of line
const DL_MULT: f32 = 300.0;
const DL_MULT: f32 = 10.0;

fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
// how much stretch are we using?
Expand Down
22 changes: 15 additions & 7 deletions src/uu/fmt/src/parasplit.rs
8000
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
}
}

// GNU fmt has a more restrictive definition of whitespace than Unicode.
// It only considers ASCII whitespace characters (space, tab, newline, etc.)
// and excludes many Unicode whitespace characters like non-breaking spaces.
fn is_fmt_whitespace(c: char) -> bool {
// Only ASCII whitespace characters are considered whitespace in GNU fmt
matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C')
}

// lines with PSKIP, lacking PREFIX, or which are entirely blank are
// NoFormatLines; otherwise, they are FormatLines
#[derive(Debug)]
Expand Down Expand Up @@ -109,7 +117,7 @@
for (i, char) in line.char_indices() {
if line[i..].starts_with(pfx) {
return (true, i);
} else if !char.is_whitespace() {
} else if !is_fmt_whitespace(char) {

Check warning on line 120 in src/uu/fmt/src/parasplit.rs

View check run for this annotation

Codecov / codecov/patch

src/uu/fmt/src/parasplit.rs#L120

Added line #L120 was not covered by tests
break;
}
}
Expand All @@ -128,7 +136,7 @@
prefix_len = indent_len;
}

if (os >= prefix_end) && !c.is_whitespace() {
if (os >= prefix_end) && !is_fmt_whitespace(c) {
// found first non-whitespace after prefix, this is indent_end
indent_end = os;
break;
Expand All @@ -154,7 +162,7 @@
// emit a blank line
// Err(true) indicates that this was a linebreak,
// which is important to know when detecting mail headers
if n.chars().all(char::is_whitespace) {
if n.chars().all(is_fmt_whitespace) {
return Some(Line::NoFormatLine(String::new(), true));
}

Expand All @@ -174,7 +182,7 @@
if pmatch
&& n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
.chars()
.all(char::is_whitespace)
.all(is_fmt_whitespace)
{
return Some(Line::NoFormatLine(n, false));
}
Expand Down Expand Up @@ -498,7 +506,7 @@
let mut aftertab = 0;
let mut word_start = None;
for (os, c) in string.char_indices() {
if !c.is_whitespace() {
if !is_fmt_whitespace(c) {
word_start = Some(os);
break;
} else if c == '\t' {
Expand All @@ -519,7 +527,7 @@
impl WordSplit<'_> {
fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
// wordsplits *must* start at a non-whitespace character
let trim_string = string.trim_start();
let trim_string = string.trim_start_matches(is_fmt_whitespace);
WordSplit {
opts,
string: trim_string,
Expand Down Expand Up @@ -571,7 +579,7 @@
// points to whitespace character OR end of string
let mut word_nchars = 0;
self.position = match self.string[word_start..].find(|x: char| {
if x.is_whitespace() {
if is_fmt_whitespace(x) {
true
} else {
word_nchars += char_width(x);
Expand Down
73 changes: 73 additions & 0 deletions tests/by-util/test_fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore plass samp

use uutests::new_ucmd;
use uutests::util::TestScenario;
use uutests::util_name;
Expand Down Expand Up @@ -303,3 +306,73 @@ fn prefix_equal_skip_prefix_equal_two() {
.stdout_is_fixture("prefixed-one-word-per-line_p=_P=2.txt");
}
}

#[test]
fn test_fmt_unicode_whitespace_handling() {
// Character classification fix: Test that Unicode whitespace characters like non-breaking space
// are NOT treated as whitespace by fmt, maintaining GNU fmt compatibility.
// GNU fmt only recognizes ASCII whitespace (space, tab, newline, etc.) and excludes
// Unicode whitespace characters to ensure consistent formatting behavior.
// This prevents regression of the character classification fix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand what "character classification fix" refers to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i improved the wording

let non_breaking_space = "\u{00A0}"; // U+00A0 NO-BREAK SPACE
let figure_space = "\u{2007}"; // U+2007 FIGURE SPACE
let narrow_no_break_space = "\u{202F}"; // U+202F NARROW NO-BREAK SPACE

// When fmt splits on width=1, these characters should NOT cause line breaks
// because they should not be considered whitespace
for (name, char) in [
("non-breaking space", non_breaking_space),
("figure space", figure_space),
("narrow no-break space", narrow_no_break_space),
] {
let input = format!("={char}=");
let result = new_ucmd!()
.args(&["-s", "-w1"])
.pipe_in(input.as_bytes())
.succeeds();

// Should be 1 line since the Unicode char is not treated as whitespace
assert_eq!(
result.stdout_str().lines().count(),
1,
"Failed for {name}: Unicode character should not be treated as whitespace"
);
}
}

#[test]
fn test_fmt_knuth_plass_line_breaking() {
// Line breaking algorithm improvements: Test the enhanced Knuth-Plass optimal line breaking
// algorithm that better handles sentence boundaries, word positioning constraints,
// and produces more natural line breaks for complex text formatting.
// This prevents regression of the line breaking algorithm improvements
let input = "@command{fmt} prefers breaking lines at the end of a sentence, and tries to\n\
avoid line breaks after the first word of a sentence or before the last word\n\
of a sentence. A @dfn{sentence break} is defined as either the end of a\n\
paragraph or a word ending in any of @samp{.?!}, followed by two spaces or end\n\
of line, ignoring any intervening parentheses or quotes. Like @TeX{},\n\
@command{fmt} reads entire ''paragraphs'' before choosing line breaks; the\n\
algorithm is a variant of that given by\n\
Donald E. Knuth and Michael F. Plass\n\
in ''Breaking Paragraphs Into Lines'',\n\
@cite{Software---Practice & Experience}\n\
@b{11}, 11 (November 1981), 1119--1184.";

let expected = "@command{fmt} prefers breaking lines at the end of a sentence,\n\
and tries to avoid line breaks after the first word of a sentence\n\
or before the last word of a sentence. A @dfn{sentence break}\n\
is defined as either the end of a paragraph or a word ending\n\
in any of @samp{.?!}, followed by two spaces or end of line,\n\
ignoring any intervening parentheses or quotes. Like @TeX{},\n\
@command{fmt} reads entire ''paragraphs'' before choosing line\n\
breaks; the algorithm is a variant of that given by Donald\n\
E. Knuth and Michael F. Plass in ''Breaking Paragraphs Into\n\
Lines'', @cite{Software---Practice & Experience} @b{11}, 11\n\
(November 1981), 1119--1184.\n";

new_ucmd!()
.args(&["-g", "60", "-w", "72"])
.pipe_in(input)
.succeeds()
.stdout_is(expected);
}
Loading
0