Skip to content

Commit 06c4272

Browse files
committed
support multi-char code points in encodeCharacters
also provide a natural way to disable encoding for characters enabled by default (mainly for markdown, because html-to-text doesn't come with any)
1 parent d960c11 commit 06c4272

File tree

6 files changed

+59
-6
lines changed

6 files changed

+59
-6
lines changed

packages/base/src/index.js

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { parseDocument } from 'htmlparser2';
44
import { DecisionTree } from 'selderee';
55

66
import { BlockTextBuilder } from './block-text-builder';
7-
import { limitedDepthRecursive } from './util';
7+
import { limitedDepthRecursive, unicodeEscape } from './util';
88

99

1010
/**
@@ -162,20 +162,24 @@ function recursiveWalk (walk, dom, builder) {
162162
}
163163

164164
/**
165-
* @param { Object<string,string> } dict
165+
* @param { Object<string,string | false> } dict
166166
* A dictionary where keys are characters to replace
167167
* and values are replacement strings.
168168
*
169+
* Only the first code point of each dict key is used.
170+
* Compound emojis with ZWJ are not supported (not until Node 16).
171+
*
169172
* @returns { ((str: string) => string) | undefined }
170173
*/
171174
function makeReplacerFromDict (dict) {
172175
if (!dict || Object.keys(dict).length === 0) {
173176
return undefined;
174177
}
175-
const entries = [...Object.entries(dict)];
178+
/** @type { [string, string][] } */
179+
const entries = Object.entries(dict).filter(([, v]) => v !== false);
176180
const regex = new RegExp(
177181
entries
178-
.map(([c]) => `(\\u${(c.charCodeAt(0).toString(16).padStart(4, '0'))})`)
182+
.map(([c]) => `(${unicodeEscape([...c][0])})`)
179183
.join('|'),
180184
'g'
181185
);

packages/base/src/typedefs.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
* @property { boolean } [decodeEntities]
1010
* Specify whether HTML entities should be decoded in the text output.
1111
*
12-
* @property { Object<string,string> | ((str: string) => string) | undefined } [encodeCharacters]
12+
* @property { Object<string,string|false> | ((str: string) => string) | undefined } [encodeCharacters]
1313
* A dictionary mapping from input text characters to escape sequences
14+
* (you can set values to false to disable escaping characters that are enabled by default)
1415
* or a function that does the replacement.
1516
*
1617
* @property { Object< string, FormatCallback > } [formatters = {}]

packages/base/src/util.js

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@ function trimCharacterEnd (str, char) {
5656
: str;
5757
}
5858

59+
/**
60+
* Return a new string with all characters replaced with unicode escape sequences.
61+
* This extreme kind of escaping can be used to safely compose regular expressions.
62+
*
63+
* @param { string } str A string to escape.
64+
* @returns { string } A string of unicode escape sequences.
65+
*/
66+
function unicodeEscape (str) {
67+
return str.replace(/[\s\S]/g, c => '\\u' + c.charCodeAt().toString(16).padStart(4, '0'));
68+
}
69+
5970
/**
6071
* Deduplicate an array by a given key callback.
6172
* Item properties are merged recursively and with the preference for last defined values.
@@ -149,5 +160,6 @@ export {
149160
numberToLetterSequence,
150161
numberToRoman,
151162
trimCharacter,
152-
trimCharacterEnd
163+
trimCharacterEnd,
164+
unicodeEscape
153165
};

packages/html-to-md/test/html-to-md.js

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,17 @@ test(
2727
snapshotMacro,
2828
'<img src="test.png" alt="**alt text**" title="*title*">'
2929
);
30+
31+
test(
32+
'should allow to disable encoding of some characters encoded by default',
33+
snapshotMacro,
34+
'<p>!#[]()*+-.\\_`{}</p>',
35+
{ encodeCharacters: { '(': '(', ')': false } }
36+
);
37+
38+
test(
39+
'should allow to encode additional symbols (single code point)',
40+
snapshotMacro,
41+
'<p>!#[]()*+-.\\_`{}</p><p>👁️ - eye</p><p>👁️‍🗨️ - eye in a speech bubble</p><p>😀 - smiley</p>',
42+
{ encodeCharacters: { '👁️': ':eye:', '😀': ':smiley:' } }
43+
);

packages/html-to-md/test/snapshots/html-to-md.js.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,25 @@ Generated by [AVA](https://avajs.dev).
2727
> ```
2828
2929
'![&ast;&ast;alt text&ast;&ast;](test.png "&ast;title&ast;")'
30+
31+
## should allow to disable encoding of some characters encoded by default
32+
33+
> ```html
34+
> <p>!#[]()*+-.\_`{}</p>
35+
> ```
36+
37+
'&excl;&num;&lbrack;&rbrack;()&ast;&plus;&#45;&period;&bsol;&lowbar;&grave;&lbrace;&rbrace;'
38+
39+
## should allow to encode additional symbols (single code point)
40+
41+
> ```html
42+
> <p>!#[]()*+-.\_`{}</p><p>👁️ - eye</p><p>👁️‍🗨️ - eye in a speech bubble</p><p>😀 - smiley</p>
43+
> ```
44+
45+
`&excl;&num;&lbrack;&rbrack;&lpar;&rpar;&ast;&plus;&#45;&period;&bsol;&lowbar;&grave;&lbrace;&rbrace;
46+
47+
:eye:️ &#45; eye␊
48+
49+
:eye:️‍🗨️ &#45; eye in a speech bubble␊
50+
51+
:smiley: &#45; smiley`
Binary file not shown.

0 commit comments

Comments
 (0)