Skip to content
  • Sponsor TheAlgorithms/Python

  • Notifications You must be signed in to change notification settings
  • Fork 47k
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 5875884

Browse files
committedJun 2, 2025·
feat(data_structures): Add Suffix Array algorithm
1 parent 4b077c0 commit 5875884

File tree

2 files changed

+171
-0
lines changed

2 files changed

+171
-0
lines changed
 

‎data_structures/suffix_array/__init__.py

Whitespace-only changes.
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""
Implementation of the Suffix Array construction algorithm in Python.

This algorithm takes a text string as input and produces its Suffix Array.
A Suffix Array is the list of starting indices of all suffixes of a given
string, ordered so that the suffixes they refer to are in lexicographic order.
It is a data structure used in, among other fields, bioinformatics and data
compression.
"""
8+
9+
10+
def build_suffix_array(text: str) -> list[int]:
    """
    Build the suffix array of ``text``.

    The suffix array lists the starting index of every suffix of ``text``,
    ordered so that the suffixes themselves are in lexicographic order (as
    defined by Python's ``str`` comparison, i.e. by Unicode code point).

    This is the naive construction: every suffix is materialised as a slice
    and the starting indices are sorted by those slices. It is simple and
    correct but takes O(n^2 log n) time and O(n^2) memory in the worst case,
    so it is intended for teaching and for short inputs.

    Args:
        text: The input string. Appending a unique, lexicographically
            smallest terminator such as ``"$"`` is a common convention for
            suffix-array algorithms, but it is not required here.

    Returns:
        The starting indices of the suffixes of ``text`` in sorted order.
        An empty string yields an empty list.

    Raises:
        TypeError: If ``text`` is not a string.

    Examples:
        For ``"banana"`` the sorted suffixes are ``"a"`` (5), ``"ana"`` (3),
        ``"anana"`` (1), ``"banana"`` (0), ``"na"`` (4) and ``"nana"`` (2).

        >>> build_suffix_array("banana")
        [5, 3, 1, 0, 4, 2]

        With a terminator, ``"$"`` sorts before every letter.

        >>> build_suffix_array("banana$")
        [6, 5, 3, 1, 0, 4, 2]

        >>> build_suffix_array("abracadabra")
        [10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]

        >>> build_suffix_array("")
        []

        A proper prefix sorts before the longer string: "a" < "aa" < "aaa".

        >>> build_suffix_array("aaa")
        [2, 1, 0]

        >>> build_suffix_array(123)
        Traceback (most recent call last):
            ...
        TypeError: Input must be a string.
    """
    if not isinstance(text, str):
        raise TypeError("Input must be a string.")

    # Sort the starting positions by the suffix that begins at each one.
    # Two suffixes of the same string always differ in length, so they are
    # never equal and no tie-breaking rule is needed. An empty string gives
    # an empty range and therefore an empty suffix array.
    return sorted(range(len(text)), key=lambda start: text[start:])
107+
108+
109+
def print_suffixes_and_array(text: str, sa: list[int]):
    """
    Print each suffix of ``text`` in the order given by ``sa``, then ``sa``.

    Args:
        text: The string the suffix array was built from.
        sa: Starting indices of the suffixes, in the order to print them.
            When it is empty, a short notice is printed instead.
    """
    if sa:
        print(" Sorted Suffixes (index: suffix):")
        for start in sa:
            suffix = text[start:]
            print(f" {start}: {suffix}")
        print(f" Suffix Array: {sa}")
    else:
        print(" (Empty string has no suffixes)")
118+
119+
120+
def main() -> None:
    """
    Demonstrate Suffix Array construction.

    Builds and prints the suffix array for a fixed set of sample strings,
    then prompts for one more string on standard input and does the same
    for it. End-of-file (Ctrl+D) and Ctrl+C at the prompt are reported
    instead of raising.
    """
    print("### Suffix Array Construction Demonstration ###\n")

    test_cases = [
        "banana",
        "banana$",  # With a unique terminator
        "abracadabra",
        "mississippi",
        "GATTACA",
        "aaaaa",
        "abcde",
        "",  # Empty string
    ]

    for text_to_process in test_cases:
        print(f'Original string: "{text_to_process}"')
        try:
            suffix_arr = build_suffix_array(text_to_process)
        except TypeError as e:
            print(f" Error: {e}\n")
        else:
            print_suffixes_and_array(text_to_process, suffix_arr)
            print("")  # Blank line between cases for readability

    # Example with user input
    print("--- Test with user input ---")
    try:
        # input() always returns a str; end-of-file is signalled by EOFError,
        # never by a None return value, so no None check is needed.
        user_input = input(
            "Enter a string to build its Suffix Array (e.g., 'banana'): "
        )
        sa_output = build_suffix_array(user_input)
        print_suffixes_and_array(user_input, sa_output)
    except TypeError as e:
        print(f" Error: {e}")
    except EOFError:  # Handles Ctrl+D
        print("\n Input cancelled.")
    except KeyboardInterrupt:
        print("\n Process interrupted by user.")
168+
169+
170+
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)
Please sign in to comment.