Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 45 additions & 4 deletions source/textUtils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka
# Copyright (C) 2018-2026 NV Access Limited, Babbage B.V., Łukasz Golonka

"""
Classes and utilities to deal with offsets in variable width encodings, particularly utf_16.
Expand Down Expand Up @@ -445,7 +445,10 @@ def __init__(self, text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZA
origOffset = normOffset = 0
normalized = ""
for origPart in splitAtCharacterBoundaries(text):
normPart = unicodedata.normalize(normalizationForm, origPart)
normPart = unicodedata.normalize(
normalizationForm,
origPart.translate(_supplementaryNormalizationTable),
)
normalized += normPart
isReorder = all(c in normPart for c in origPart)
if origPart == normPart:
Expand Down Expand Up @@ -516,13 +519,51 @@ def encodedToStrOffsets(
return (resultStart, resultEnd)


def _buildSupplementaryNormalizationTable() -> dict[int, str]:
"""Build a translation table for decorative Unicode characters not handled by standard NFKC normalization.

This includes characters such as negative squared and negative circled letters,
which are decorative variants of Latin letters
that ``unicodedata.normalize("NFKC", ...)`` does not decompose.
"""
table: dict[int, str] = {}
# Negative Circled Latin Capital Letters: U+1F150 - U+1F169 -> A-Z
for offset in range(26):
table[0x1F150 + offset] = chr(ord("A") + offset)
# Negative Squared Latin Capital Letters: U+1F170 - U+1F189 -> A-Z
# Skip codepoints that have emoji semantics:
# U+1F170 (🅰 A button/blood type), U+1F171 (🅱 B button/blood type),
# U+1F17E (🅾 O button/blood type), U+1F17F (🅿 P button)
_squaredEmojiCodepoints = {0x1F170, 0x1F171, 0x1F17E, 0x1F17F}
for offset in range(26):
codepoint = 0x1F170 + offset
if codepoint not in _squaredEmojiCodepoints:
table[codepoint] = chr(ord("A") + offset)
return table


# Built once at import time; maps code points to replacement letters for use with ``str.translate``.
_supplementaryNormalizationTable: dict[int, str] = _buildSupplementaryNormalizationTable()


def isUnicodeNormalized(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM) -> bool:
	"""Check whether the given text is already Unicode normalized.

	This checks both standard Unicode normalization and supplementary normalization
	for decorative letter characters not handled by the standard algorithm.

	:param text: The text to check.
	:param normalizationForm: The normalization form passed to ``unicodedata.is_normalized``.
	:return: ``False`` if the text contains any character present in the supplementary
		normalization table, otherwise the result of ``unicodedata.is_normalized``.
	"""
	# A character in the supplementary table would be replaced by unicodeNormalize,
	# so its presence alone means the text is not in normalized form.
	if any(ord(c) in _supplementaryNormalizationTable for c in text):
		return False
	return unicodedata.is_normalized(normalizationForm, text)


def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM) -> str:
	"""Normalize the given text using the specified Unicode normalization form.

	In addition to standard Unicode normalization (e.g. NFKC), this applies a supplementary
	translation for decorative Unicode letter characters (such as negative squared and negative circled
	letters) that are not decomposed by the standard algorithm.

	:param text: The text to normalize.
	:param normalizationForm: The normalization form passed to ``unicodedata.normalize``.
	:return: The text after the supplementary translation followed by standard normalization.
	"""
	# Translate first so the replacement letters also pass through standard normalization.
	text = text.translate(_supplementaryNormalizationTable)
	return unicodedata.normalize(normalizationForm, text)


Expand Down
1 change: 1 addition & 0 deletions user_docs/en/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Consult the speech dictionaries section in the User Guide for more details. (#19
* In Microsoft Word with UIA enabled, page changes are now correctly announced when navigating table rows that span multiple pages. (#19386, @akj)
* Fixed excessive resource usage and highlight flickering when using Visual Highlight. (#17434, @hwf1324)
* The `NVDA+k` command now correctly reports the destination of links containing formatted text, such as bold or italics. (#19428, @Cary-rowen)
* Decorative Unicode letters such as negative squared and negative circled characters are now normalized to their base Latin letters when Unicode normalization is enabled. (#19608, @bramd)
* Configuration profile triggers now activate when the Add-on Store is open. (#19583, @bramd)

### Changes for Developers
Expand Down
Loading