
Commit f16ad0c

WIP: Use Finite State Transducers (FST) as the backing store for language models
FSTs have two nice properties: they compress the ngram data by exploiting common prefixes and suffixes, and they can be embedded into the binary in a form that is directly suitable for look-up. This avoids the separate decompression step and indirectly reuses the memory mappings the operating system already provides for all binaries. This is still WIP since I do not know how to regenerate the language models, and it also seems like the unique ngram models are built elsewhere.

TODO:
* Integrate unique ngram models
* Regenerate all language models
* Drop non-unified models
1 parent d69f611 commit f16ad0c
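For context, a minimal, self-contained sketch of the idea, using the fst crate added in Cargo.toml below. The key layout mirrors the new UnifiedNgramKey/UnifiedNgramModel types in src/model.rs, but the key helper, the example ngrams, and the probability values here are illustrative only, not taken from this commit.

use fst::{Map, MapBuilder};

// Kind byte appended to each ngram key; mirrors UnifiedNgramModel::PROBABILITY.
const PROBABILITY: u8 = 0;

// Hypothetical helper: "<ngram UTF-8 bytes><kind byte>".
fn key(ngram: &str, kind: u8) -> Vec<u8> {
    let mut key = ngram.as_bytes().to_vec();
    key.push(kind);
    key
}

fn main() -> Result<(), fst::Error> {
    // Made-up log-probabilities; the real values come from the training data.
    let mut pairs = vec![
        (key("der", PROBABILITY), (-3.2f64).to_bits()),
        (key("sch", PROBABILITY), (-4.7f64).to_bits()),
    ];
    // fst::MapBuilder requires keys in lexicographic order.
    pairs.sort_unstable_by(|(lhs, _), (rhs, _)| lhs.cmp(rhs));

    let mut builder = MapBuilder::memory();
    builder.extend_iter(pairs)?;
    let bytes = builder.into_inner()?; // this byte buffer is what ends up in ngrams.fst

    // Look-ups run directly on the serialized bytes: no decompression step, and
    // when embedded via include_bytes! the data rides on the binary's own memory mapping.
    let map = Map::new(bytes.as_slice())?;
    let probability = map.get(key("sch", PROBABILITY)).map(f64::from_bits);
    assert_eq!(probability, Some(-4.7));

    Ok(())
}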

8 files changed, +188 -324 lines changed

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ brotli = "7.0.0"
 compact_str = "0.9.0"
 dashmap = "6.1.0"
 fraction = "0.15.3"
+fst = "0.4.7"
 include_dir = "0.7.4"
 itertools = "0.14.0"
 maplit = "1.0.2"

language-models/de/models/ngrams.fst

Whitespace-only changes.

language-models/de/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -18,4 +18,6 @@ use include_dir::{include_dir, Dir};
 
 pub const GERMAN_MODELS_DIRECTORY: Dir = include_dir!("$CARGO_MANIFEST_DIR/models");
 
+pub const GERMAN_UNIFIED_MODEL: &[u8] = include_bytes!("../models/ngrams.fst");
+
 pub const GERMAN_TESTDATA_DIRECTORY: Dir = include_dir!("$CARGO_MANIFEST_DIR/testdata");

src/detector.rs

Lines changed: 45 additions & 318 deletions
Large diffs are not rendered by default.

src/model.rs

Lines changed: 67 additions & 1 deletion
@@ -127,7 +127,7 @@ fn deserialize_ngram_probabilities<'de, D: Deserializer<'de>>(
 
 pub(crate) struct TrainingDataLanguageModel {
     pub(crate) absolute_frequencies: HashMap<Ngram, u32>,
-    ngram_probability_model: NgramProbabilityModel,
+    pub(crate) ngram_probability_model: NgramProbabilityModel,
 }
 
 impl TrainingDataLanguageModel {
@@ -260,6 +260,72 @@ fn get_utf8_slice(string: &str, start: usize, end: usize) -> &str {
         .unwrap()
 }
 
+pub(crate) struct UnifiedNgramModel<'a> {
+    map: fst::Map<&'a [u8]>,
+}
+
+impl<'a> UnifiedNgramModel<'a> {
+    pub(crate) const PROBABILITY: u8 = 0;
+    pub(crate) const UNIQUE: u8 = 1;
+    pub(crate) const MOST_COMMON: u8 = 2;
+
+    pub(crate) fn load(language: Language) -> Result<Self, fst::Error> {
+        let data = match language {
+            #[cfg(feature = "german")]
+            Language::German => lingua_german_language_model::GERMAN_UNIFIED_MODEL,
+
+            _ => unimplemented!(),
+        };
+
+        fst::Map::new(data).map(|map| Self { map })
+    }
+
+    pub(crate) fn get_probability(&self, ngram: &str) -> Option<f64> {
+        self.get(ngram, Self::PROBABILITY).map(f64::from_bits)
+    }
+
+    pub(crate) fn is_unique(&self, ngram: &str) -> bool {
+        self.get(ngram, Self::UNIQUE).is_some()
+    }
+
+    pub(crate) fn is_most_common(&self, ngram: &str) -> bool {
+        self.get(ngram, Self::MOST_COMMON).is_some()
+    }
+
+    fn get(&self, ngram: &str, kind: u8) -> Option<u64> {
+        let key = UnifiedNgramKey::new(ngram, kind);
+
+        self.map.get(key)
+    }
+}
+
+#[derive(Clone, Copy)]
+pub(crate) struct UnifiedNgramKey {
+    len: usize,
+    key: [u8; Self::MAX_LEN],
+}
+
+impl UnifiedNgramKey {
+    // Maximum UTF-8-encoded length of fivegrams plus one kind byte.
+    pub(crate) const MAX_LEN: usize = 5 * 4 + 1;
+
+    pub(crate) fn new(ngram: &str, kind: u8) -> Self {
+        let len = ngram.len();
+
+        let mut key = [0; Self::MAX_LEN];
+        key[..len].copy_from_slice(ngram.as_bytes());
+        key[len] = kind;
+
+        Self { len, key }
+    }
+}
+
+impl AsRef<[u8]> for UnifiedNgramKey {
+    fn as_ref(&self) -> &[u8] {
+        &self.key[..=self.len]
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use itertools::Itertools;
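For illustration, a hypothetical call site for the new type (the actual detector integration lives in src/detector.rs, whose diff is not rendered above). It assumes the crate-internal Language enum and the "german" feature from the hunk; none of this code is part of the commit itself.

use crate::model::UnifiedNgramModel;
use crate::Language;

fn example() -> Result<(), fst::Error> {
    // Builds the FST map over the bytes embedded via include_bytes!.
    let model = UnifiedNgramModel::load(Language::German)?;

    // Each question is answered by a single FST look-up,
    // distinguished only by the trailing kind byte of the key.
    let log_probability = model.get_probability("sch"); // Option<f64>, natural log
    let unique = model.is_unique("sch");
    let most_common = model.is_most_common("sch");

    println!("{log_probability:?} {unique} {most_common}");
    Ok(())
}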

src/writer.rs

Lines changed: 61 additions & 1 deletion
@@ -21,11 +21,12 @@ use std::io::{BufRead, BufReader, LineWriter, Write};
 use std::path::Path;
 
 use brotli::CompressorWriter;
+use fraction::ToPrimitive;
 use itertools::Itertools;
 use regex::Regex;
 
 use crate::constant::{MULTIPLE_WHITESPACE, NUMBERS, PUNCTUATION};
-use crate::model::TrainingDataLanguageModel;
+use crate::model::{TrainingDataLanguageModel, UnifiedNgramKey, UnifiedNgramModel};
 use crate::ngram::Ngram;
 use crate::Language;
 
@@ -127,6 +127,18 @@ impl LanguageModelFilesWriter {
             "fivegrams.json",
         )?;
 
+        Self::write_unified_language_model(
+            [
+                &unigram_model,
+                &bigram_model,
+                &trigram_model,
+                &quadrigram_model,
+                &fivegram_model,
+            ],
+            output_directory_path,
+            "ngrams.fst",
+        )?;
+
         Ok(())
     }
 
@@ -167,6 +180,53 @@ impl LanguageModelFilesWriter {
         compressed_file.write_all(model.to_json().as_bytes())?;
         Ok(())
     }
+
+    fn write_unified_language_model(
+        models: [&TrainingDataLanguageModel; 5],
+        output_directory_path: &Path,
+        file_name: &str,
+    ) -> io::Result<()> {
+        let mut pairs = models
+            .iter()
+            .flat_map(|model| {
+                let probabilities =
+                    model
+                        .ngram_probability_model
+                        .ngrams
+                        .iter()
+                        .map(|(ngram, probability)| {
+                            let key = UnifiedNgramKey::new(ngram, UnifiedNgramModel::PROBABILITY);
+                            let value = probability.to_f64().unwrap().ln().to_bits();
+
+                            (key, value)
+                        });
+
+                let most_common = model
+                    .absolute_frequencies
+                    .iter()
+                    .k_largest_by_key(25, |(_, frequency)| *frequency)
+                    .map(|(ngram, frequency)| {
+                        let key =
+                            UnifiedNgramKey::new(&ngram.value, UnifiedNgramModel::MOST_COMMON);
+                        let value = *frequency as u64;
+
+                        (key, value)
+                    });
+
+                probabilities.chain(most_common)
+            })
+            .collect_vec();
+
+        pairs.sort_unstable_by(|(lhs, _), (rhs, _)| lhs.as_ref().cmp(rhs.as_ref()));
+
+        let mut builder = fst::MapBuilder::memory();
+        builder.extend_iter(pairs).unwrap();
+        let buffer = builder.into_inner().unwrap();
+
+        let file_path = output_directory_path.join(file_name);
+        let mut file = File::create(file_path)?;
+        file.write_all(&buffer)
+    }
 }
 
 impl TestDataFilesWriter {
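One detail worth noting in the hunk above: fst maps can only store u64 values, so the writer packs the natural-log probability into the value with f64::to_bits, and UnifiedNgramModel::get_probability recovers it with f64::from_bits. A standalone round-trip check (the probability value is made up):

fn main() {
    let probability: f64 = 0.0123;

    // Writer side: convert to f64, take the natural log, store the IEEE-754 bit pattern.
    let stored: u64 = probability.ln().to_bits();

    // Reader side: reinterpret the bits as f64; the round trip is lossless.
    let recovered = f64::from_bits(stored);
    assert_eq!(recovered, probability.ln());
}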

tests/python/test_writer.py

Lines changed: 5 additions & 4 deletions
@@ -62,12 +62,13 @@ def test_language_model_files_writer(language_model_files_text):
 
     files = read_directory_content(output_directory_path)
 
-    assert len(files) == 5
-    assert files[4] == "unigrams.json.br"
+    assert len(files) == 6
+    assert files[5] == "unigrams.json.br"
     assert files[0] == "bigrams.json.br"
-    assert files[3] == "trigrams.json.br"
-    assert files[2] == "quadrigrams.json.br"
+    assert files[4] == "trigrams.json.br"
+    assert files[3] == "quadrigrams.json.br"
     assert files[1] == "fivegrams.json.br"
+    assert files[2] == "ngrams.fst"
 
 
 def test_test_data_files_writer(test_data_files_text):
