Skip to content

Commit 45164f0

Browse files
authored
Create freq_analysis.py
1 parent 6155611 commit 45164f0

File tree

1 file changed

+304
-0
lines changed

1 file changed

+304
-0
lines changed

freq_analysis.py

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
#!/usr/bin/env python3
2+
3+
'''
4+
A small script that provides several utility functions to analyze a piece of
5+
text and determine what the most (and least) frequently repeated letters are.
6+
Frequency analysis is used among other things to break encryption ciphers that
7+
rely on letter substitution such as the Vigenere cipher.
8+
9+
'ETAOIN' are the six most common letters used in the English language.
10+
11+
WORK IN PROGRESS!
12+
TODO: Create possible key combinations
13+
'''
14+
15+
# The 26 letters in descending order of frequency in English text (the module
# docstring notes that 'ETAOIN' are the six most common). Used to rank and
# score decryption attempts.
ETAOIN='ETAOINSHRDLCUMWFGYPBVKJXQZ'
# The alphabet; each letter is tried as a single-letter subkey.
LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Largest key length considered; default upper bound used by find_factors().
MAX_KEY_LENGTH = 16
18+
19+
from vigenere_cipher import decrypt
20+
from is_lang import is_english
21+
from concurrent.futures import ProcessPoolExecutor
22+
import itertools
23+
import time
24+
import pprint
25+
import re
26+
27+
# Matches every character that is NOT an ASCII letter; used with
# .sub('', text) to strip spaces, digits and punctuation from a text.
only_letters = re.compile(r'[^a-zA-Z]')
28+
29+
def _best_subkeys(subtext):
    '''
    Return the letters that, used as a single-letter key to decrypt `subtext`,
    produce the highest English frequency score.

    Every letter in LETTERS is tried. The decrypted text is scored with
    count_letters -> sort_by_frequency -> get_english_score. All letters that
    tie for the best score are returned, in alphabetical order.
    '''
    highest_subkey_score = 0
    highest_subkeys = []

    for subkey in LETTERS:
        decryption_attempt = decrypt(subtext, subkey)

        # Analyze letter frequency and score each decrypted result
        letter_count = count_letters(decryption_attempt)
        sorted_by_frequency = sort_by_frequency(letter_count)
        score = get_english_score(sorted_by_frequency)

        if score > highest_subkey_score:
            # Highest score so far: the current subkey takes precedence
            highest_subkeys = [subkey]
            highest_subkey_score = score
        elif score == highest_subkey_score:
            # Ties with the best score so far: keep it as another candidate
            highest_subkeys.append(subkey)

    return highest_subkeys


def main():
    '''
    Run the frequency analysis on a sample ciphertext.

    Prints the distances between repeated patterns, the most likely key
    lengths derived from them and, for every likely key length, the letters
    sliced per key position followed by the candidate subkeys for each
    position (one list of letters per position).
    '''
    # Other samples kept for experimentation (presumably two ciphertexts, the
    # matching cleartext and the key -- verify before relying on them):
    # ciphertext = 'Ai seh pvxo xffbxzl elt fzghip cs yhglp xgjnfw. Ganjbrp ftwez bfjs gr lmyk awbx ulgg nqw pzwt hytu ezwylmc mirollf sd GUHEPQMHV. phv amiyg txwipgk ybpj wawh hpie rvisiqgvrg mq rdh hnjx qwajbrr ichzkfpw. Wavmilh dt hnjxrwaj, ai seh dvktitsehw eyh qsvg ulc tergx xec cw mimq pnqw, fcmcuzgh yq ozdsmyk esiypvkoafxw es ivzl eew. huh yvfwifrmjsl cs qhx dsjbubok rvr ztc jsj ijxe xm, oag wmdxpwe yssk tnql ass loem imk pbrmio ecr ttox ydcuxgteis nabx fs vv ysc xws stoh uchow hpjtok fpwr drripp. Qn tropvghr yhglpxgk hg ejz glfi lrs ap kppc abgxp.'
    # uppercased = 'AISEHPVXOXFFBXZLELTFZGHIPCSYHGLPXGJNFWGANJBRPFTWEZBFJSGRLMYKAWBXULGGNQWPZWTHYTUEZWYLMCMIROLLFSDGUHEPQMHVPHVAMIYGTXWIPGKYBPJWAWHHPIERVISIQGVRGMQRDHHNJXQWAJBRRICHZKFPWWAVMILHDTHNJXRWAJAISEHDVKTITSEHWEYHQSVGULCTERGXXECCWMIMQPNQWFCMCUZGHYQOZDSMYKESIYPVKOAFXWESIVZLEEWHUHYVFWIFRMJSLCSQHXDSJBUBOKRVRZTCJSJIJXEXMOAGWMDXPWEYSSKTNQLASSLOEMIMKPBRMIOECRTTOXYDCUXGTEISNABXFSVVYSCXWSSTOHUCHOWHPJTOKFPWRDRRIPPQNTROPVGHRYHGLPXGKHGEJZGLFILRSAPKPPCABGXP'
    # ciphertext = 'Sm nhl frpv zokshrp zox vvyoky hj izkgs bwffmy. Pfettvk ixmar ihsx xb dqtn emxp bnpl eao tuzx xulb gibpveg hlvehdm um LLRWTLPLL. lzc cvnpq lbrltwg qirs brgz lkli hrazkzlmby ql uhx dfqz zbrttvm lgxvcmrf. Brfemgk hj dfqzabrt, sm nhl trcakcxvro itk uiry bnl yvbyb shg ss epoz ueao, jxpgkvyo az tqnkqtn iieqwxttrppa zv mlvd lgf. mlr qzazmvnequu hj azb yvnrqtvm aai jlg evn yfpl zv, trq oqyatma qzut yead env peae pot usbemj hgh plvz hitepkohmi jsiz ox mf qwx aai olvj dhyyo lkmxeg xwya iibatk. Tr jngwxpmi izkgsbwg zn gse xvxm guw ql cwrl fsqpt.'
    # 'He has been through the ringer of vocal issues. Imagine being able to sing like this and lose that ability because of SHELLFISH. you would atleast fall into deep depression if not quit singing entirely. Instead of quitting, he has persevered and been the front man of this band, bringing us amazing performances to this day. the frustration of not sounding the way you used to, and distain from fans who want him booted and cant appreciate what he is for the band would defeat most people. My favorite vocalist of all time and my role model.'
    # teleportbeyond

    ciphertext = 'PPQCA XQVEKG YBNKMAZU YBNGBAL JON I TSZM JYIM. VRAG VOHT VRAU C TKSG. DDWUO XITLAZU VAVV RAZ C VKB QP IWPOU'

    # Work on letters only, in upper case, so the positions line up with the key
    uppercased = only_letters.sub('', ciphertext).upper()

    # List of numbers, distances between repeated patterns
    pattern_distances = kasiski_key_lengths(uppercased)
    print(pattern_distances)

    # Without repeated patterns there is nothing to derive a key length from
    if not pattern_distances:
        print('No repeated patterns found, cannot estimate the key length.')
        return

    potential_key_lengths = get_common_factors(pattern_distances)
    print(potential_key_lengths)

    for key_length in potential_key_lengths:
        # List of strings taken at regular intervals, one per key position
        interval_strings = get_letter_at_interval(uppercased, key_length)
        print(interval_strings)

        # One list of candidate letters per key position; keeping them grouped
        # by position is what the key-combination step below needs.
        potential_subkeys = [
            _best_subkeys(subtext) for subtext in interval_strings
        ]
        print(potential_subkeys)

    # TODO: with all candidate letters found, build and try every possible
    # key. Notes from the earlier (buggy, never enabled) attempt:
    #   - one letter has to be picked per key position, i.e.
    #     itertools.product(*potential_subkeys); product(potential_subkeys,
    #     repeat=key) combines whole candidate lists instead of letters.
    #   - product() returns a one-shot iterator: counting the keys with
    #     len(list(...)) consumes it and the following loop sees nothing.
    #   - each combination is a tuple of letters; presumably decrypt() wants
    #     a string key, so join it first (verify against vigenere_cipher).
    #   - accept a key when is_english(cleartext) holds, show the first 100
    #     characters and ask the user whether to continue.
    #   - sys.exit() needs `import sys`, which is not among this file's imports.
135+
136+
137+
def count_letters(text):
    '''
    Tally how many times each of the 26 English letters occurs in `text`.

    The text is upper-cased first, so the tally is case-insensitive; any
    character that is not A-Z after upper-casing is ignored.

    Returns a dictionary keyed 'A' through 'Z' (in that order) whose values
    are the number of occurrences; letters that never appear map to 0.
    '''
    tally = dict.fromkeys('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 0)

    for char in text.upper():
        if char not in tally:
            continue
        tally[char] += 1

    return tally
153+
154+
155+
def sort_by_frequency(letter_freq):
    '''
    Turn a letter -> count dictionary (as built by count_letters) into a
    string of letters ordered from most to least frequent.

    Letters that share the same count are ordered among themselves by
    reverse ETAOIN position, so that ties do not accidentally line up with
    the English frequency order (avoids false positives when scoring).
    '''
    # Bucket the letters by the number of times they appear
    buckets = {}
    for letter, count in letter_freq.items():
        buckets.setdefault(count, []).append(letter)

    # Highest counts first; inside each bucket, reverse ETAOIN order
    ordered_counts = sorted(buckets, reverse=True)
    return ''.join(
        ''.join(sorted(buckets[count], key=ETAOIN.find, reverse=True))
        for count in ordered_counts
    )
178+
179+
180+
def get_english_score(frequency_list):
    '''
    Score how closely a frequency-ordered string of letters resembles English.

    One point is given for each of the first six ETAOIN letters found among
    the first six entries of `frequency_list`, and one point for each of the
    last six ETAOIN letters found among its last six entries.

    Returns an integer between 0 and 12.
    '''
    most_common = frequency_list[:6]
    least_common = frequency_list[-6:]

    # Matches among the most frequent letters...
    top_hits = sum(1 for letter in ETAOIN[:6] if letter in most_common)
    # ...and among the least frequent ones, both build the confidence score
    bottom_hits = sum(1 for letter in ETAOIN[-6:] if letter in least_common)

    return top_hits + bottom_hits
195+
196+
197+
def kasiski_key_lengths(ciphertext):
    '''
    Kasiski examination: look for 3-letter patterns that repeat in the
    ciphertext and measure how far apart the repetitions are. The factors of
    those distances hint at the length of the encryption key.

    Non-letters are removed and the text is upper-cased before searching; the
    cleaned text is also printed.

    Takes a ciphertext and returns a list of numbers with the distance
    between every pair of occurrences of each repeated pattern.
    '''
    # Keep letters only, in upper case
    letters = only_letters.sub('', ciphertext).upper()
    print(letters)

    # Map each 3-letter slice to the (end) positions where it occurs. The last
    # two slices are shorter than 3 letters, each with a unique length, so
    # they can never repeat and are dropped below.
    occurrences = {}
    for start in range(len(letters)):
        occurrences.setdefault(letters[start:start + 3], []).append(start + 3)

    # Positions are stored in ascending order, so pairing every position with
    # all the earlier ones yields every distance exactly once. Patterns seen
    # only once have no earlier position and contribute nothing.
    distances = []
    for positions in occurrences.values():
        for idx, later in enumerate(positions):
            distances.extend(later - earlier for earlier in positions[:idx])

    return distances
235+
236+
237+
def get_common_factors(distances):
    '''
    Pick the most likely key lengths from the distances between repeated
    patterns found in a ciphertext.

    Each distance is broken down into its factors (see find_factors) and the
    factors are ranked by how often they are repeated across all distances.

    distances -- iterable of integers.

    Returns a list with the factor(s) repeated most often; it holds several
    numbers when they tie for first place. Returns an empty list when there
    is nothing to rank (no distances, or none of them has a factor in range)
    instead of failing on max() of an empty sequence.
    '''
    # Local import: keeps this function independent of the file's import list
    from collections import Counter

    # Count in how many distances each factor appears. find_factors returns
    # unique factors per distance, so every distance adds at most 1 per factor.
    factor_count = Counter()
    for distance in distances:
        factor_count.update(find_factors(distance))

    if not factor_count:
        return []

    # Keep those that are repeated most often (may be one or multiple numbers)
    most_common = max(factor_count.values())
    return [
        factor for factor, count in factor_count.items()
        if count == most_common
    ]
258+
259+
260+
def find_factors(number, max=MAX_KEY_LENGTH):
    '''
    Calculate the factors of `number` that are usable as key lengths.

    number -- integer to factorize (a distance between repeated patterns).
    max    -- largest factor to consider, inclusive. The name shadows the
              builtin but is kept because it is part of the signature.

    Returns a set with every divisor of `number` from 2 up to `max`.

    Only true divisors are returned. Half of a divisor is not added, since it
    does not necessarily divide `number` (15 is divisible by 5, but not by
    5 // 2 == 2). Divisors whose co-factor is within range are found by the
    scan itself, so nothing is lost.
    '''
    return {
        candidate
        for candidate in range(2, max + 1)
        if number % candidate == 0
    }
280+
281+
282+
def get_letter_at_interval(text, interval):
    '''
    Split `text` into `interval` strings, where string number i holds the
    characters found at positions i, i + interval, i + 2 * interval, ...

    For a key of length `interval`, each returned string contains the
    letters that line up with the same key position.

    Returns a list of `interval` strings (an empty list when `interval` is
    not positive).
    '''
    # An extended slice picks every `interval`-th character from each offset
    return [''.join(text[offset::interval]) for offset in range(interval)]
301+
302+
303+
# Run the analysis only when executed as a script, not when imported
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)