Skip to content

Commit 4976a82

Browse files
authored
Merge pull request #71 from prataprc/issue_7
use unicode-width instead of len() or grapheme cluster #7.
2 parents 08af2e5 + 27df9b2 commit 4976a82

File tree

2 files changed

+119
-132
lines changed

2 files changed

+119
-132
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,8 @@ getopts-like option parsing.
1313
"""
1414
categories = ["command-line-interface"]
1515

16+
[dependencies]
17+
unicode-width = "0.1.5"
18+
1619
[dev-dependencies]
1720
log = "0.4"

src/lib.rs

Lines changed: 116 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -104,22 +104,23 @@
104104
reason = "use the crates.io `getopts` library instead"))]
105105

106106
#[cfg(test)] #[macro_use] extern crate log;
107+
extern crate unicode_width;
108+
107109

108110
use self::Name::*;
109111
use self::HasArg::*;
110112
use self::Occur::*;
111113
use self::Fail::*;
112114
use self::Optval::*;
113-
use self::SplitWithinState::*;
114-
use self::Whitespace::*;
115-
use self::LengthLimit::*;
116115

117116
use std::error::Error;
118117
use std::ffi::OsStr;
119118
use std::fmt;
120119
use std::iter::{repeat, IntoIterator};
121120
use std::result;
122121

122+
use unicode_width::UnicodeWidthStr;
123+
123124
/// A description of the options that a program can handle.
124125
pub struct Options {
125126
grps: Vec<OptGroup>,
@@ -479,7 +480,7 @@ impl Options {
479480
let mut row = " ".to_string();
480481

481482
// short option
482-
match short_name.len() {
483+
match short_name.width() {
483484
0 => {
484485
if any_short {
485486
row.push_str(" ");
@@ -488,26 +489,23 @@ impl Options {
488489
1 => {
489490
row.push('-');
490491
row.push_str(&short_name);
491-
if long_name.len() > 0 {
492+
if long_name.width() > 0 {
492493
row.push_str(", ");
493494
} else {
494495
// Only a single space here, so that any
495496
// argument is printed in the correct spot.
496497
row.push(' ');
497498
}
498499
}
500+
// FIXME: refer issue #7.
499501
_ => panic!("the short name should only be 1 ascii char long"),
500502
}
501503

502504
// long option
503-
match long_name.len() {
505+
match long_name.width() {
504506
0 => {}
505507
_ => {
506-
if self.long_only {
507-
row.push('-');
508-
} else {
509-
row.push_str("--");
510-
}
508+
row.push_str(if self.long_only { "-" } else { "--" });
511509
row.push_str(&long_name);
512510
row.push(' ');
513511
}
@@ -524,9 +522,7 @@ impl Options {
524522
}
525523
}
526524

527-
// FIXME: #5516 should be graphemes not codepoints
528-
// here we just need to indent the start of the description
529-
let rowlen = row.chars().count();
525+
let rowlen = row.width();
530526
if rowlen < 24 {
531527
for _ in 0 .. 24 - rowlen {
532528
row.push(' ');
@@ -535,25 +531,7 @@ impl Options {
535531
row.push_str(&desc_sep)
536532
}
537533

538-
// Normalize desc to contain words separated by one space character
539-
let mut desc_normalized_whitespace = String::new();
540-
for word in desc.split(|c: char| c.is_whitespace())
541-
.filter(|s| !s.is_empty()) {
542-
desc_normalized_whitespace.push_str(word);
543-
desc_normalized_whitespace.push(' ');
544-
}
545-
546-
// FIXME: #5516 should be graphemes not codepoints
547-
let mut desc_rows = Vec::new();
548-
each_split_within(&desc_normalized_whitespace,
549-
54,
550-
|substr| {
551-
desc_rows.push(substr.to_string());
552-
true
553-
});
554-
555-
// FIXME: #5516 should be graphemes not codepoints
556-
// wrapped description
534+
let desc_rows = each_split_within(&desc, 54);
557535
row.push_str(&desc_rows.join(&desc_sep));
558536

559537
row
@@ -923,118 +901,59 @@ fn format_option(opt: &OptGroup) -> String {
923901
line
924902
}
925903

926-
#[derive(Clone, Copy)]
927-
enum SplitWithinState {
928-
A, // leading whitespace, initial state
929-
B, // words
930-
C, // internal and trailing whitespace
931-
}
932-
933-
#[derive(Clone, Copy)]
934-
enum Whitespace {
935-
Ws, // current char is whitespace
936-
Cr // current char is not whitespace
937-
}
938-
939-
#[derive(Clone, Copy)]
940-
enum LengthLimit {
941-
UnderLim, // current char makes current substring still fit in limit
942-
OverLim // current char makes current substring no longer fit in limit
943-
}
944-
945-
946904
/// Splits a string into substrings with possibly internal whitespace,
947905
/// each of them at most `lim` bytes long, if possible. The substrings
948906
/// have leading and trailing whitespace removed, and are only cut at
949907
/// whitespace boundaries.
950-
///
951-
/// Note: Function was moved here from `std::str` because this module is the only place that
952-
/// uses it, and because it was too specific for a general string function.
953-
fn each_split_within<'a, F>(ss: &'a str, lim: usize, mut it: F)
954-
-> bool where F: FnMut(&'a str) -> bool {
955-
// Just for fun, let's write this as a state machine:
956-
957-
let mut slice_start = 0;
958-
let mut last_start = 0;
959-
let mut last_end = 0;
960-
let mut state = A;
961-
let mut fake_i = ss.len();
962-
let mut lim = lim;
963-
964-
let mut cont = true;
965-
966-
// if the limit is larger than the string, lower it to save cycles
967-
if lim >= fake_i {
968-
lim = fake_i;
969-
}
970-
971-
let mut machine = |cont: &mut bool, state: &mut SplitWithinState, (i, c): (usize, char)| {
972-
let whitespace = if c.is_whitespace() { Ws } else { Cr };
973-
let limit = if (i - slice_start + 1) <= lim { UnderLim } else { OverLim };
974-
975-
*state = match (*state, whitespace, limit) {
976-
(A, Ws, _) => { A }
977-
(A, Cr, _) => { slice_start = i; last_start = i; B }
978-
979-
(B, Cr, UnderLim) => { B }
980-
(B, Cr, OverLim) if (i - last_start + 1) > lim => {
981-
// A single word has gone over the limit. In this
982-
// case we just accept that the word will be too long.
983-
B
984-
}
985-
(B, Cr, OverLim) => {
986-
*cont = it(&ss[slice_start..last_end]);
987-
slice_start = last_start;
988-
B
989-
}
990-
(B, Ws, UnderLim) => {
991-
last_end = i;
992-
C
993-
}
994-
(B, Ws, OverLim) => {
995-
last_end = i;
996-
*cont = it(&ss[slice_start..last_end]);
997-
A
998-
}
999-
1000-
(C, Cr, UnderLim) => {
1001-
last_start = i;
1002-
B
908+
fn each_split_within(desc: &str, lim: usize) -> Vec<String> {
909+
let mut rows = Vec::new();
910+
for line in desc.trim().lines() {
911+
let line_chars = line.chars().chain(Some(' '));
912+
let words = line_chars.fold( (Vec::new(), 0, 0), |(mut words, a, z), c | {
913+
let idx = z + c.len_utf8(); // Get the current byte offset
914+
915+
// If the char is whitespace, advance the word start and maybe push a word
916+
if c.is_whitespace() {
917+
if a != z {
918+
words.push(&line[a..z]);
919+
}
920+
(words, idx, idx)
1003921
}
1004-
(C, Cr, OverLim) => {
1005-
*cont = it(&ss[slice_start..last_end]);
1006-
slice_start = i;
1007-
last_start = i;
1008-
last_end = i;
1009-
B
922+
// If the char is not whitespace, continue, retaining the current
923+
else {
924+
(words, a, idx)
1010925
}
1011-
(C, Ws, OverLim) => {
1012-
*cont = it(&ss[slice_start..last_end]);
1013-
A
926+
}).0;
927+
928+
let mut row = String::new();
929+
for word in words.iter() {
930+
let sep = if row.len() > 0 { Some(" ") } else { None };
931+
let width = row.width()
932+
+ word.width()
933+
+ sep.map(UnicodeWidthStr::width).unwrap_or(0);
934+
935+
if width <= lim {
936+
if let Some(sep) = sep { row.push_str(sep) }
937+
row.push_str(word);
938+
continue
1014939
}
1015-
(C, Ws, UnderLim) => {
1016-
C
940+
if row.len() > 0 {
941+
rows.push(row.clone());
942+
row.clear();
1017943
}
1018-
};
1019-
1020-
*cont
1021-
};
1022-
1023-
ss.char_indices().all(|x| machine(&mut cont, &mut state, x));
1024-
1025-
// Let the automaton 'run out' by supplying trailing whitespace
1026-
while cont && match state { B | C => true, A => false } {
1027-
machine(&mut cont, &mut state, (fake_i, ' '));
1028-
fake_i += 1;
944+
row.push_str(word);
945+
}
946+
if row.len() > 0 {
947+
rows.push(row);
948+
}
1029949
}
1030-
return cont;
950+
rows
1031951
}
1032952

1033953
#[test]
1034954
fn test_split_within() {
1035955
fn t(s: &str, i: usize, u: &[String]) {
1036-
let mut v = Vec::new();
1037-
each_split_within(s, i, |s| { v.push(s.to_string()); true });
956+
let v = each_split_within(&(s.to_string()), i);
1038957
assert!(v.iter().zip(u.iter()).all(|(a,b)| a == b));
1039958
}
1040959
t("", 0, &[]);
@@ -1046,7 +965,9 @@ fn test_split_within() {
1046965
"Little lamb".to_string()
1047966
]);
1048967
t("\nMary had a little lamb\nLittle lamb\n", ::std::usize::MAX,
1049-
&["Mary had a little lamb\nLittle lamb".to_string()]);
968+
&["Mary had a little lamb".to_string(),
969+
"Little lamb".to_string()
970+
]);
1050971
}
1051972

1052973
#[cfg(test)]
@@ -1827,6 +1748,69 @@ Options:
18271748
assert!(usage == expected)
18281749
}
18291750

1751+
#[test]
1752+
fn test_usage_description_newline_handling() {
1753+
let mut opts = Options::new();
1754+
opts.optflag("k", "k\u{2013}w\u{2013}",
1755+
"The word kiwi is normally spelled with two i's");
1756+
opts.optflag("a", "apple",
1757+
"This description forces a new line.\n Here is a premature\n\
1758+
newline");
1759+
1760+
let expected =
1761+
"Usage: fruits
1762+
1763+
Options:
1764+
-k, --k–w– The word kiwi is normally spelled with two i's
1765+
-a, --apple This description forces a new line.
1766+
Here is a premature
1767+
newline
1768+
";
1769+
1770+
let usage = opts.usage("Usage: fruits");
1771+
1772+
debug!("expected: <<{}>>", expected);
1773+
debug!("generated: <<{}>>", usage);
1774+
assert!(usage == expected)
1775+
}
1776+
1777+
#[test]
1778+
fn test_usage_multiwidth() {
1779+
let mut opts = Options::new();
1780+
opts.optflag("a", "apple", "apple description");
1781+
opts.optflag("b", "banana\u{00AB}", "banana description");
1782+
opts.optflag("c", "brûlée", "brûlée quite long description");
1783+
opts.optflag("k", "kiwi\u{20AC}", "kiwi description");
1784+
opts.optflag("o", "orange\u{2039}", "orange description");
1785+
opts.optflag("r", "raspberry-but-making-this-option-way-too-long",
1786+
"raspberry description is also quite long indeed longer than \
1787+
every other piece of text we might encounter here and thus will \
1788+
be automatically broken up"
1789+
);
1790+
1791+
let expected =
1792+
"Usage: fruits
1793+
1794+
Options:
1795+
-a, --apple apple description
1796+
-b, --banana« banana description
1797+
-c, --brûlée brûlée quite long description
1798+
-k, --kiwi€ kiwi description
1799+
-o, --orange‹ orange description
1800+
-r, --raspberry-but-making-this-option-way-too-long
1801+
raspberry description is also quite long indeed longer
1802+
than every other piece of text we might encounter here
1803+
and thus will be automatically broken up
1804+
";
1805+
1806+
let usage = opts.usage("Usage: fruits");
1807+
1808+
debug!("expected: <<{}>>", expected);
1809+
debug!("generated: <<{}>>", usage);
1810+
assert!(usage == expected)
1811+
}
1812+
1813+
18301814
#[test]
18311815
fn test_usage_short_only() {
18321816
let mut opts = Options::new();

0 commit comments

Comments
 (0)