Skip to content

Commit 2fbe8f2

Browse files
committed
Introduce multibyte string scanner functions
* String_makePrintable()
* EncodePrintableString()
* String_lineBreakWidth()
* String_mbswidth()

Signed-off-by: Kang-Che Sung <[email protected]>
1 parent d681877 commit 2fbe8f2

File tree

2 files changed

+329
-0
lines changed

2 files changed

+329
-0
lines changed

XUtils.c

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ in the source distribution for its full text.
1010
#include "XUtils.h"
1111

1212
#include <assert.h>
13+
#include <ctype.h> // IWYU pragma: keep
1314
#include <errno.h>
1415
#include <fcntl.h>
16+
#include <limits.h> // IWYU pragma: keep
1517
#include <math.h>
1618
#include <stdarg.h>
1719
#include <stdint.h>
@@ -235,6 +237,293 @@ size_t strnlen(const char* str, size_t maxLen) {
235237
}
236238
#endif
237239

240+
#ifdef HAVE_LIBNCURSESW
241+
static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) {
242+
assert(!ps->buf || ps->pos < ps->size);
243+
244+
char tempBuf[MB_LEN_MAX];
245+
char* dest = ps->buf ? (char*)ps->buf + ps->pos : tempBuf;
246+
247+
// It is unnecessarily expensive to fix the output string if the caller
248+
// gives an incorrect buffer size. This function would not support any
249+
// truncation of the output string.
250+
size_t len = wcrtomb(dest, wc, &ps->mbState);
251+
assert(len > 0);
252+
if (len == (size_t)-1) {
253+
assert(len != (size_t)-1);
254+
fail();
255+
}
256+
if (ps->buf && len > ps->size - ps->pos) {
257+
assert(!ps->buf || len <= ps->size - ps->pos);
258+
fail();
259+
}
260+
261+
ps->pos += len;
262+
}
263+
#else
264+
static void String_encodeWChar(WCharEncoderState* ps, int c) {
265+
assert(!ps->buf || ps->pos < ps->size);
266+
267+
char* buf = ps->buf;
268+
if (buf) {
269+
buf[ps->pos] = (char)c;
270+
}
271+
272+
ps->pos += 1;
273+
}
274+
#endif
275+
276+
/*
 * Scans at most "maxLen" bytes of "src" and feeds a printable rendition of
 * it, one character at a time, to the "encodeWChar" callback.
 *
 * - Characters that are not printable (iswprint()/isprint()), and, in the
 *   wide-character build, invalid or incomplete multibyte sequences, are
 *   replaced by a replacement character (U+FFFD when CRT_utf8 is set,
 *   '?' otherwise).
 * - A run of consecutive replaced characters yields a single replacement
 *   character.
 * - Scanning stops at the first null character or after "maxLen" bytes;
 *   in both cases a terminating 0 is passed to the callback as the last
 *   character.
 *
 * "ps" is handed to the callback unchanged; this function does not touch
 * its fields directly.
 */
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) {
   assert(src || maxLen == 0);

#ifdef HAVE_LIBNCURSESW
   const wchar_t replacementChar = CRT_utf8 ? L'\xFFFD' : L'?';
   wchar_t ch;

   mbstate_t decState;
   memset(&decState, 0, sizeof(decState));
#else
   const char replacementChar = '?';
   char ch;
#endif

   size_t cursor = 0;
   bool lastWasReplacement = false;

   for (;;) {
      size_t len = 0;
      bool replace = false;
      ch = 0;

      if (cursor < maxLen) {
         // Fetch the next character from the byte sequence
#ifdef HAVE_LIBNCURSESW
         // Decode with a copy of the state so that a failed conversion
         // leaves "decState" untouched.
         mbstate_t trialState;
         memcpy(&trialState, &decState, sizeof(trialState));
         len = mbrtowc(&ch, &src[cursor], maxLen - cursor, &trialState);

         assert(len != 0 || ch == 0);
         if (len == (size_t)-2) {
            // Incomplete sequence: consume everything that is left.
            errno = EILSEQ;
            replace = true;
            len = maxLen - cursor;
         } else if (len == (size_t)-1) {
            // Invalid sequence: skip a single byte and try again.
            replace = true;
            len = 1;
         } else {
            memcpy(&decState, &trialState, sizeof(decState));
         }
#else
         len = 1;
         ch = src[cursor];
#endif
      }

      cursor += len;

      // Unprintable characters are replaced too
      if (!replace && ch != 0) {
#ifdef HAVE_LIBNCURSESW
         replace = !iswprint(ch);
#else
         replace = !isprint((unsigned char)ch);
#endif
      }

      // Collapse a run of replaced characters into one replacement.
      if (replace && lastWasReplacement) {
         continue;
      }
      lastWasReplacement = replace;

      if (replace) {
         ch = replacementChar;
      }

      encodeWChar(ps, ch);

      // The terminating 0 has been emitted; nothing more to do.
      if (ch == 0) {
         break;
      }
   }
}
349+
350+
char* String_makePrintable(const char* str, size_t maxLen) {
351+
WCharEncoderState encState;
352+
353+
memset(&encState, 0, sizeof(encState));
354+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
355+
size_t size = encState.pos;
356+
assert(size > 0);
357+
358+
memset(&encState, 0, sizeof(encState));
359+
char* buf = xMalloc(size);
360+
encState.size = size;
361+
encState.buf = buf;
362+
EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
363+
assert(encState.pos == size);
364+
365+
return buf;
366+
}
367+
368+
/*
 * Decodes the next character of the string tracked by "ps".
 *
 * On success the character is stored in ps->ch, ps->str / ps->maxLen are
 * advanced past it and true is returned. When the decoded character is the
 * null character, ps->str is set to NULL and ps->maxLen to 0, so that any
 * further call returns false.
 *
 * Returns false without decoding anything when:
 * - there is no input left (ps->str is NULL or ps->maxLen is 0);
 * - ps->ch holds the end-of-file sentinel (WEOF in the wide-character
 *   build, EOF otherwise), i.e. a previous call hit an invalid sequence;
 * - (wide-character build) the input is an invalid sequence, in which case
 *   ps->ch is set to WEOF, or an incomplete sequence, in which case the
 *   remaining input is consumed.
 */
bool String_decodeNextWChar(MBStringDecoderState* ps) {
   if (!ps->str || ps->maxLen == 0) {
      return false;
   }

   // If the previous call of this function encountered an invalid sequence,
   // do not continue (because the "mbState" object for mbrtowc() is
   // undefined). The caller is supposed to reset the state.
#ifdef HAVE_LIBNCURSESW
   bool isStateDefined = ps->ch != WEOF;
#else
   bool isStateDefined = ps->ch != EOF;
#endif
   if (!isStateDefined) {
      return false;
   }

#ifdef HAVE_LIBNCURSESW
   wchar_t wc;
   size_t len = mbrtowc(&wc, ps->str, ps->maxLen, &ps->mbState);
   switch (len) {
      case (size_t)-1:
         // Invalid sequence
         ps->ch = WEOF;
         return false;

      case (size_t)-2:
         // Incomplete sequence
         ps->str += ps->maxLen;
         ps->maxLen = 0;
         return false;

      case 0:
         assert(wc == 0);

         ps->str = NULL;
         ps->maxLen = 0;
         ps->ch = wc;
         return true;

      default:
         ps->str += len;
         ps->maxLen -= len;
         ps->ch = wc;
   }
   return true;
#else
   // Read the byte as unsigned char. Where plain "char" is signed, a byte
   // such as 0xFF would otherwise be sign-extended to -1, which equals EOF
   // and would be mistaken for the "undefined state" sentinel above; other
   // high bytes would become negative values that are not valid arguments
   // for the <ctype.h> functions.
   ps->ch = (unsigned char)*ps->str;
   if (ps->ch == 0) {
      ps->str = NULL;
      ps->maxLen = 0;
   } else {
      ps->str++;
      ps->maxLen--;
   }
   return true;
#endif
}
426+
427+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator) {
428+
assert(*str || maxLen == 0);
429+
430+
if (maxWidth < 0)
431+
maxWidth = INT_MAX;
432+
433+
MBStringDecoderState state;
434+
memset(&state, 0, sizeof(state));
435+
state.str = *str;
436+
state.maxLen = maxLen;
437+
438+
int totalWidth = 0;
439+
int breakWidth = 0;
440+
441+
const char* breakPos = NULL;
442+
bool inSpaces = true;
443+
444+
while (String_decodeNextWChar(&state)) {
445+
if (state.ch == 0)
446+
break;
447+
448+
if (state.ch == ' ' && separator == ' ' && !inSpaces) {
449+
breakWidth = totalWidth;
450+
breakPos = *str;
451+
inSpaces = true;
452+
}
453+
454+
#ifdef HAVE_LIBNCURSESW
455+
int w = wcwidth((wchar_t)state.ch);
456+
if (w < 0) {
457+
// This function should not be used with string containing unprintable
458+
// characters. Tolerate them on release build, however.
459+
assert(w >= 0);
460+
break;
461+
}
462+
#else
463+
assert(isprint(state.ch));
464+
int w = 1;
465+
#endif
466+
467+
if (w > maxWidth - totalWidth) {
468+
// This character cannot fit the line with the given maxWidth.
469+
if (breakPos) {
470+
// Rewind the scanning state to the last found separator.
471+
totalWidth = breakWidth;
472+
*str = breakPos;
473+
}
474+
break;
475+
}
476+
477+
#ifdef HAVE_LIBNCURSESW
478+
// If the character takes zero columns, include the character in the
479+
// substring if the working encoding is UTF-8, and ignore it otherwise.
480+
// In Unicode, combining characters are always placed after the base
481+
// character, but some legacy 8-bit encodings instead place combining
482+
// characters before the base character.
483+
if (w <= 0 && !CRT_utf8) {
484+
continue;
485+
}
486+
#endif
487+
488+
totalWidth += w;
489+
490+
// (*str - start) will represent the length of the substring bounded
491+
// by the width limit.
492+
*str = state.str;
493+
494+
if (state.ch != ' ')
495+
inSpaces = false;
496+
497+
#ifdef HAVE_LIBNCURSESW
498+
wint_t sepCast = (wint_t)separator;
499+
#else
500+
int sepCast = (int)separator;
501+
#endif
502+
if (state.ch == sepCast && separator != ' ') {
503+
breakWidth = totalWidth;
504+
breakPos = *str;
505+
}
506+
}
507+
508+
return totalWidth;
509+
}
510+
511+
/*
 * Measures the terminal width of the string at "*str", examining at most
 * "maxLen" bytes and stopping once "maxWidth" columns are filled (a
 * negative "maxWidth" means no limit). "*str" is advanced past the
 * measured substring and its width in columns is returned.
 *
 * The wide-character build delegates to String_lineBreakWidth() with a
 * '\0' separator. The single-byte build counts one column per byte.
 */
int String_mbswidth(const char** str, size_t maxLen, int maxWidth) {
#ifdef HAVE_LIBNCURSESW
   return String_lineBreakWidth(str, maxLen, maxWidth, '\0');
#else
   assert(*str || maxLen == 0);

   const int widthLimit = (maxWidth < 0) ? INT_MAX : maxWidth;

   // One byte is one column, so the width limit also bounds the byte count.
   const size_t scanLimit = MINIMUM((size_t)widthLimit, maxLen);
   const size_t len = strnlen(*str, scanLimit);

   *str += len;
   return (int)len;
#endif
}
526+
238527
int xAsprintf(char** strp, const char* fmt, ...) {
239528
va_list vl;
240529
va_start(vl, fmt);

XUtils.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,32 @@ in the source distribution for its full text.
2323

2424
#include "Compat.h"
2525
#include "Macros.h"
26+
#include "ProvideCurses.h"
27+
28+
29+
/* Output state shared between EncodePrintableString() and its callback. */
struct WCharEncoderState_ {
   /* Number of bytes produced so far (also the next write offset in "buf"). */
   size_t pos;
   /* Capacity of "buf" in bytes; only meaningful when "buf" is not NULL. */
   size_t size;
   /* Destination buffer, or NULL when only the output length is wanted. */
   void* buf;
   /* Conversion state for the wide-character to multibyte encoder. */
   mbstate_t mbState;
};

typedef struct WCharEncoderState_ WCharEncoderState;
35+
36+
/* Input state for String_decodeNextWChar(). */
struct MBStringDecoderState_ {
   /* Next byte to decode; set to NULL once the null character was decoded. */
   const char* str;
   /* Number of bytes that may still be read from "str". */
   size_t maxLen;
#ifdef HAVE_LIBNCURSESW
   /* Last decoded character; WEOF marks an invalid sequence. */
   wint_t ch;
   /* Conversion state for the multibyte to wide-character decoder. */
   mbstate_t mbState;
#else
   /* Last decoded character; EOF marks an undefined state. */
   int ch;
#endif
};

typedef struct MBStringDecoderState_ MBStringDecoderState;
2646

47+
#ifdef HAVE_LIBNCURSESW
48+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc);
49+
#else
50+
typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c);
51+
#endif
2752

2853
ATTR_NORETURN
2954
void fail(void);
@@ -106,6 +131,21 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t
106131
size_t strnlen(const char* str, size_t maxLen);
107132
#endif
108133

134+
ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3)
135+
void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar);
136+
137+
ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2)
138+
char* String_makePrintable(const char* str, size_t maxLen);
139+
140+
ATTR_NONNULL
141+
bool String_decodeNextWChar(MBStringDecoderState* ps);
142+
143+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
144+
int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator);
145+
146+
ATTR_NONNULL ATTR_ACCESS2_RW(1)
147+
int String_mbswidth(const char** str, size_t maxLen, int maxWidth);
148+
109149
ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2)
110150
int xAsprintf(char** strp, const char* fmt, ...);
111151

0 commit comments

Comments
 (0)