Skip to content

Commit 00cd65b

Browse files
authored
Merge pull request #676 from djc/idna2008
Implement support for reporting errors on invalid IDNA2008 characters. Includes changes from "Optimize IDNA tables" (#677).
2 parents e385083 + 04b7f49 commit 00cd65b

File tree

6 files changed

+2372
-3508
lines changed

6 files changed

+2372
-3508
lines changed

idna/src/make_uts46_mapping_table.py

Lines changed: 14 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,12 @@ def rust_slice(s):
7878
unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
7979
elif mapping == "Deviation":
8080
unicode_str = u''
81+
82+
if len(fields) > 3:
83+
assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
84+
assert mapping == 'Valid', mapping
85+
mapping = 'DisallowedIdna2008'
86+
8187
ranges.append((first, last, mapping, unicode_str))
8288

8389
def mergeable_key(r):
@@ -86,7 +92,7 @@ def mergeable_key(r):
8692
# These types have associated data, so we should not merge them.
8793
if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
8894
return r
89-
assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
95+
assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
9096
return mapping
9197

9298
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
@@ -116,11 +122,7 @@ def mergeable_key(r):
116122
# Assert we're seeing the surrogate case here.
117123
assert last_char == 0xd7ff
118124
assert next_char == 0xe000
119-
first = group[0][0]
120-
last = group[-1][1]
121-
mapping = group[0][2]
122-
unicode_str = group[0][3]
123-
optimized_ranges.append((first, last, mapping, unicode_str))
125+
optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
124126

125127
def is_single_char_range(r):
126128
(first, last, _, _) = r
@@ -148,30 +150,22 @@ def merge_single_char_ranges(ranges):
148150

149151
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
150152

151-
152-
print("static TABLE: &[Range] = &[")
153-
154-
for ranges in optimized_ranges:
155-
first = ranges[0][0]
156-
last = ranges[-1][1]
157-
print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
158-
escape_char(char(last))))
159-
160-
print("];\n")
161-
162-
print("static INDEX_TABLE: &[u16] = &[")
163-
164153
SINGLE_MARKER = 1 << 15
165154

155+
print("static TABLE: &[(char, u16)] = &[")
156+
166157
offset = 0
167158
for ranges in optimized_ranges:
168159
assert offset < SINGLE_MARKER
169160

170161
block_len = len(ranges)
171162
single = SINGLE_MARKER if block_len == 1 else 0
172-
print(" %s," % (offset | single))
163+
index = offset | single
173164
offset += block_len
174165

166+
start = escape_char(char(ranges[0][0]))
167+
print(" ('%s', %s)," % (start, index))
168+
175169
print("];\n")
176170

177171
print("static MAPPING_TABLE: &[Mapping] = &[")

idna/src/uts46.rs

Lines changed: 35 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111
1212
use self::Mapping::*;
1313
use crate::punycode;
14-
use std::cmp::Ordering::{Equal, Greater, Less};
1514
use std::{error::Error as StdError, fmt};
1615
use unicode_bidi::{bidi_class, BidiClass};
1716
use unicode_normalization::char::is_combining_mark;
@@ -48,38 +47,26 @@ enum Mapping {
4847
Disallowed,
4948
DisallowedStd3Valid,
5049
DisallowedStd3Mapped(StringTableSlice),
51-
}
52-
53-
struct Range {
54-
from: char,
55-
to: char,
50+
DisallowedIdna2008,
5651
}
5752

5853
fn find_char(codepoint: char) -> &'static Mapping {
59-
let r = TABLE.binary_search_by(|ref range| {
60-
if codepoint > range.to {
61-
Less
62-
} else if codepoint < range.from {
63-
Greater
64-
} else {
65-
Equal
66-
}
67-
});
68-
r.ok()
69-
.map(|i| {
70-
const SINGLE_MARKER: u16 = 1 << 15;
54+
let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
55+
Ok(idx) => idx,
56+
Err(idx) => idx - 1,
57+
};
7158

72-
let x = INDEX_TABLE[i];
73-
let single = (x & SINGLE_MARKER) != 0;
74-
let offset = !SINGLE_MARKER & x;
59+
const SINGLE_MARKER: u16 = 1 << 15;
7560

76-
if single {
77-
&MAPPING_TABLE[offset as usize]
78-
} else {
79-
&MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
80-
}
81-
})
82-
.unwrap()
61+
let (base, x) = TABLE[idx];
62+
let single = (x & SINGLE_MARKER) != 0;
63+
let offset = !SINGLE_MARKER & x;
64+
65+
if single {
66+
&MAPPING_TABLE[offset as usize]
67+
} else {
68+
&MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
69+
}
8370
}
8471

8572
struct Mapper<'a> {
@@ -140,6 +127,12 @@ impl<'a> Iterator for Mapper<'a> {
140127
self.slice = Some(decode_slice(slice).chars());
141128
continue;
142129
}
130+
Mapping::DisallowedIdna2008 => {
131+
if self.config.use_idna_2008_rules {
132+
self.errors.disallowed_in_idna_2008 = true;
133+
}
134+
codepoint
135+
}
143136
});
144137
}
145138
}
@@ -310,7 +303,7 @@ fn check_validity(label: &str, config: Config, errors: &mut Errors) {
310303

311304
// V6: Check against Mapping Table
312305
if label.chars().any(|c| match *find_char(c) {
313-
Mapping::Valid => false,
306+
Mapping::Valid | Mapping::DisallowedIdna2008 => false,
314307
Mapping::Deviation(_) => config.transitional_processing,
315308
Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
316309
_ => true,
@@ -510,6 +503,7 @@ pub struct Config {
510503
transitional_processing: bool,
511504
verify_dns_length: bool,
512505
check_hyphens: bool,
506+
use_idna_2008_rules: bool,
513507
}
514508

515509
/// The defaults are that of https://url.spec.whatwg.org/#idna
@@ -524,6 +518,7 @@ impl Default for Config {
524518

525519
// Only use for to_ascii, not to_unicode
526520
verify_dns_length: false,
521+
use_idna_2008_rules: false,
527522
}
528523
}
529524
}
@@ -553,6 +548,12 @@ impl Config {
553548
self
554549
}
555550

551+
#[inline]
552+
pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
553+
self.use_idna_2008_rules = value;
554+
self
555+
}
556+
556557
/// http://www.unicode.org/reports/tr46/#ToASCII
557558
pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
558559
let mut result = String::new();
@@ -599,6 +600,7 @@ pub struct Errors {
599600
disallowed_character: bool,
600601
too_long_for_dns: bool,
601602
too_short_for_dns: bool,
603+
disallowed_in_idna_2008: bool,
602604
}
603605

604606
impl Errors {
@@ -615,6 +617,7 @@ impl Errors {
615617
disallowed_character,
616618
too_long_for_dns,
617619
too_short_for_dns,
620+
disallowed_in_idna_2008,
618621
} = *self;
619622
punycode
620623
|| check_hyphens
@@ -627,6 +630,7 @@ impl Errors {
627630
|| disallowed_character
628631
|| too_long_for_dns
629632
|| too_short_for_dns
633+
|| disallowed_in_idna_2008
630634
}
631635
}
632636

@@ -644,6 +648,7 @@ impl fmt::Debug for Errors {
644648
disallowed_character,
645649
too_long_for_dns,
646650
too_short_for_dns,
651+
disallowed_in_idna_2008,
647652
} = *self;
648653

649654
let fields = [
@@ -661,6 +666,7 @@ impl fmt::Debug for Errors {
661666
("disallowed_character", disallowed_character),
662667
("too_long_for_dns", too_long_for_dns),
663668
("too_short_for_dns", too_short_for_dns),
669+
("disallowed_in_idna_2008", disallowed_in_idna_2008),
664670
];
665671

666672
let mut empty = true;

0 commit comments

Comments
 (0)