Skip to content

Commit 60816d4

Browse files
committed
Adding fft.s
1 parent 7460f2a commit 60816d4

File tree

1 file changed

+398
-0
lines changed
  • contents/cooley_tukey/code/asm-x64

1 file changed

+398
-0
lines changed
Lines changed: 398 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,398 @@
1+
.intel_syntax noprefix
2+
3+
.section .rodata
4+
two: .double 2.0
5+
one: .double 1.0
6+
two_pi: .double -6.28318530718
7+
rand_max: .long 4290772992
8+
.long 1105199103
9+
fmt: .string "%g\n"
10+
11+
.section .text
12+
.global main
13+
.extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2
14+
15+
# rdi - array ptr
16+
# rsi - array size
17+
dft:
18+
push rbx
19+
push r12
20+
push r13
21+
push r14
22+
push r15
23+
mov r12, rdi # Save parameters
24+
mov r13, rsi
25+
sub rsp, r13 # Make a double complex array
26+
xor r14, r14 # Set index to 0
27+
dft_loop_i:
28+
cmp r14, r13 # Check if index is equal to array size
29+
je dft_end_i
30+
lea rax, [rsp + r14] # Set tmp array to zero at r14
31+
mov QWORD PTR [rax], 0
32+
mov QWORD PTR [rax + 8], 0
33+
xor r15, r15 # Set second index to 0
34+
dft_loop_j:
35+
cmp r15, r13 # Check if the index is equal to array size
36+
je dft_end_j
37+
movsd xmm1, two_pi # Calculate xmm1 = -2pi * i * j / N
38+
mov rax, r14
39+
imul rax, r15
40+
shr rax, 4
41+
cvtsi2sdq xmm2, rax
42+
mulsd xmm1, xmm2
43+
cvtsi2sdq xmm2, r13
44+
divsd xmm1, xmm2
45+
pxor xmm0, xmm0 # Set xmm0 to 0
46+
call cexp
47+
lea rax, [r12 + r15] # Calculate X[i] * cexp(-2pi * i * j / N)
48+
movsd xmm2, QWORD PTR [rax]
49+
movsd xmm3, QWORD PTR [rax + 8]
50+
call __muldc3
51+
lea rax, [rsp + r14]
52+
movsd xmm6, QWORD PTR [rax] # Sum to tmp array
53+
movsd xmm7, QWORD PTR [rax + 8]
54+
addsd xmm6, xmm0
55+
addsd xmm7, xmm1
56+
movsd QWORD PTR [rax], xmm6 # Save to tmp array
57+
movsd QWORD PTR [rax + 8], xmm7
58+
add r15, 16
59+
jmp dft_loop_j
60+
dft_end_j:
61+
add r14, 16
62+
jmp dft_loop_i
63+
dft_end_i:
64+
mov rdi, r12 # Move tmp array to array ptr
65+
mov rsi, rsp
66+
mov rdx, r13
67+
call memcpy
68+
add rsp, r13
69+
pop r15
70+
pop r14
71+
pop r13
72+
pop r12
73+
pop rbx
74+
ret
75+
76+
# rdi - array ptr
77+
# rsi - array size
78+
cooley_tukey:
79+
cmp rsi, 16 # Check if size if greater then 1
80+
jle cooley_tukey_return
81+
push rbx
82+
push r12
83+
push r13
84+
push r14
85+
push r15
86+
mov r12, rdi # Save parameters
87+
mov r13, rsi
88+
mov r14, rsi # Save N / 2
89+
shr r14, 1
90+
sub rsp, r14 # Make a tmp array
91+
xor r15, r15
92+
mov rbx, r12
93+
cooley_tukey_spliting:
94+
cmp r15, r14
95+
je cooley_tukey_split
96+
lea rax, [r12 + 2 * r15] # Moving all odd entries to the front of the array
97+
movaps xmm0, XMMWORD PTR [rax + 16]
98+
movaps xmm1, XMMWORD PTR [rax]
99+
movaps XMMWORD PTR [rsp + r15], xmm0
100+
movaps XMMWORD PTR [rbx], xmm1
101+
add rbx, 16
102+
add r15, 16
103+
jmp cooley_tukey_spliting
104+
cooley_tukey_split:
105+
mov rax, rsp
106+
lea rdi, [r12 + r13]
107+
cooley_tukey_mov_data:
108+
cmp rbx, rdi
109+
je cooley_tukey_moved
110+
movaps xmm0, XMMWORD PTR [rax]
111+
movaps XMMWORD PTR [rbx], xmm0
112+
add rbx, 16
113+
add rax, 16
114+
jmp cooley_tukey_mov_data
115+
cooley_tukey_moved:
116+
add rsp, r14
117+
mov rdi, r12 # Makking a recursive call
118+
mov rsi, r14
119+
call cooley_tukey
120+
lea rdi, [r12 + r14] # Makking a recursive call
121+
mov rsi, r14
122+
call cooley_tukey
123+
lea rbx, [r12 + r14]
124+
mov r14, rbx
125+
mov r15, r12
126+
cooley_tukey_loop:
127+
cmp r15, rbx
128+
je cooley_tukey_end
129+
pxor xmm0, xmm0 # Calculate cexp(-2.0 * I * M_PI * i / N)
130+
movsd xmm1, two_pi
131+
mov rax, r14
132+
sub rax, rbx
133+
cvtsi2sdq xmm2, rax
134+
cvtsi2sdq xmm3, r13
135+
divsd xmm2, xmm3
136+
mulsd xmm1, xmm2
137+
call cexp
138+
movq xmm2, QWORD PTR [r14] # Calculating X[i] - cexp() * X[i + N / 2]
139+
movq xmm3, QWORD PTR [r14 + 8]
140+
call __muldc3
141+
movq xmm2, QWORD PTR [r15]
142+
movq xmm3, QWORD PTR [r15 + 8]
143+
subsd xmm2, xmm0
144+
subsd xmm3, xmm1
145+
movq QWORD PTR [r14], xmm2 # Save value in X[i + N / 2]
146+
movq QWORD PTR [r14 + 8], xmm3
147+
movq xmm0, QWORD PTR [r15] # Calculating X[i] -= X[i + N / 2] - X[i]
148+
movq xmm1, QWORD PTR [r15 + 8]
149+
subsd xmm2, xmm0
150+
subsd xmm3, xmm1
151+
subsd xmm0, xmm2
152+
subsd xmm1, xmm3
153+
movq QWORD PTR [r15], xmm0
154+
movq QWORD PTR [r15 + 8], xmm1
155+
add r14, 16
156+
add r15, 16
157+
jmp cooley_tukey_loop
158+
cooley_tukey_end:
159+
pop r15
160+
pop r14
161+
pop r13
162+
pop r12
163+
pop rbx
164+
cooley_tukey_return:
165+
ret
166+
167+
# rdi - array ptr
168+
# rsi - array size
169+
bit_reverse:
170+
push rbx
171+
push r12
172+
push r13
173+
push r14
174+
push r15
175+
mov r12, rdi # Save parameters
176+
mov r13, rsi
177+
shr r13, 4
178+
xor r14, r14 # Loop through all entries
179+
bit_reverse_entries:
180+
cmp r14, r13
181+
je bit_reverse_return
182+
cvtsi2sdq xmm0, r13 # Calculating the number of bit in N
183+
call log2
184+
cvttsd2si rcx, xmm0
185+
mov rdi, 1 # Calculating (1 << log2(N)) - 1
186+
sal edi, cl
187+
sub edi, 1
188+
sub ecx, 1
189+
mov rax, r14
190+
mov r15, r14
191+
bit_reverse_loop:
192+
sar r15 # Check if r15 is 0
193+
je bit_reverse_reversed
194+
sal rax, 1 # Calculating (rax << 1) | (r15 & 1)
195+
mov rsi, r15
196+
and rsi, 1
197+
or rax, rsi
198+
sub ecx, 1 # Decrement bit count
199+
jmp bit_reverse_loop
200+
bit_reverse_reversed:
201+
sal eax, cl # Calculate (rax << rcx) & (1 << bit count)
202+
and rax, rdi
203+
cmp rax, r14 # Check if rax is greater then r14
204+
jle bit_reverse_no_swap # If so then swap entries
205+
shl rax, 4 # Times index by 16 to get bytes to entry
206+
shl r14, 4
207+
movaps xmm0, XMMWORD PTR [r12 + rax]
208+
movaps xmm1, XMMWORD PTR [r12 + r14]
209+
movaps XMMWORD PTR [r12 + rax], xmm1
210+
movaps XMMWORD PTR [r12 + r14], xmm0
211+
shr r14, 4
212+
bit_reverse_no_swap:
213+
add r14, 1
214+
jmp bit_reverse_entries
215+
bit_reverse_return:
216+
pop r15
217+
pop r14
218+
pop r13
219+
pop r12
220+
pop rbx
221+
ret
222+
223+
# rdi - array ptr
224+
# rsi - array size
225+
iterative_cooley_tukey:
226+
push r12
227+
push r13
228+
push r14
229+
push r15
230+
push rbx
231+
push rbp
232+
sub rsp, 40
233+
mov r12, rdi
234+
mov r13, rsi
235+
call bit_reverse # Bit reversing array
236+
sar r13, 4 # Calculate log2(N)
237+
cvtsi2sdq xmm0, r13
238+
call log2
239+
cvttsd2si rax, xmm0
240+
mov QWORD PTR [rsp], rax # Save it to the stack
241+
mov r14, 1
242+
iter_ct_loop_i:
243+
cmp r14, rax # Check if r14 is greater then log2(N)
244+
jg iter_ct_end_i
245+
movsd xmm0, two # Calculate stride = 2^(r14)
246+
cvtsi2sdq xmm1, r14
247+
call pow
248+
cvttsd2si rbp, xmm0
249+
movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride)
250+
divsd xmm1, xmm0
251+
pxor xmm0, xmm0
252+
call cexp
253+
movq QWORD PTR [rsp + 8], xmm0 # Save it to stack
254+
movq QWORD PTR [rsp + 16], xmm1
255+
xor r15, r15
256+
iter_ct_loop_j:
257+
cmp r15, r13 # Check if r15 is less then array size
258+
je iter_ct_end_j
259+
movsd xmm4, one # Save 1 + 0i to stack
260+
pxor xmm5, xmm5
261+
movsd QWORD PTR [rsp + 24], xmm4
262+
movsd QWORD PTR [rsp + 32], xmm5
263+
xor rbx, rbx
264+
mov rax, rbp # Calculate stride / 2
265+
sar rax, 1
266+
iter_ct_loop_k:
267+
cmp rbx, rax # Check if rbx is less then stride / 2
268+
je iter_ct_end_k
269+
mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j]
270+
add r8, rbx
271+
sal r8, 4
272+
mov r9, rbp
273+
sal r9, 3
274+
add r9, r8
275+
lea r9, [r12 + r9]
276+
lea r8, [r12 + r8]
277+
movsd xmm0, QWORD PTR [r9] # Calculate X[k + j] - v * X[k + j + stride / 2]
278+
movsd xmm1, QWORD PTR [r9 + 8]
279+
movsd xmm2, QWORD PTR [rsp + 24]
280+
movsd xmm3, QWORD PTR [rsp + 32]
281+
call __muldc3
282+
movsd xmm2, QWORD PTR [r8]
283+
movsd xmm3, QWORD PTR [r8 + 8]
284+
subsd xmm2, xmm0
285+
subsd xmm3, xmm1
286+
movsd QWORD PTR [r9], xmm2 # Saving answer
287+
movsd QWORD PTR [r9 + 8], xmm3
288+
movsd xmm0, QWORD PTR [r8] # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j])
289+
movsd xmm1, QWORD PTR [r8 + 8]
290+
subsd xmm2, xmm0
291+
subsd xmm3, xmm1
292+
subsd xmm0, xmm2
293+
subsd xmm1, xmm3
294+
movsd QWORD PTR [r8], xmm0 # Saving answer
295+
movsd QWORD PTR [r8 + 8], xmm1
296+
movsd xmm0, QWORD PTR [rsp + 24] # Calculating v * w
297+
movsd xmm1, QWORD PTR [rsp + 32]
298+
movsd xmm2, QWORD PTR [rsp + 8]
299+
movsd xmm3, QWORD PTR [rsp + 16]
300+
call __muldc3
301+
movsd QWORD PTR [rsp + 24], xmm0 # Saving answer
302+
movsd QWORD PTR [rsp + 32], xmm1
303+
add rbx, 1
304+
mov rax, rbp
305+
sar rax, 1
306+
jmp iter_ct_loop_k
307+
iter_ct_end_k:
308+
add r15, rbp
309+
jmp iter_ct_loop_j
310+
iter_ct_end_j:
311+
add r14, 1
312+
mov rax, QWORD PTR [rsp]
313+
jmp iter_ct_loop_i
314+
iter_ct_end_i:
315+
add rsp, 40
316+
pop rbp
317+
pop rbx
318+
pop r15
319+
pop r14
320+
pop r13
321+
pop r12
322+
ret
323+
324+
# rdi - array a ptr
325+
# rsi - array b ptr
326+
# rdx - array size
327+
approx:
328+
push r12
329+
push r13
330+
push r14
331+
push r15
332+
mov r12, rdi
333+
mov r13, rsi
334+
mov r14, rdx
335+
lea r15, [rdi + rdx]
336+
sub rsp, 8
337+
approx_loop:
338+
cmp r12, r15
339+
je approx_return
340+
movsd xmm0, QWORD PTR[r13]
341+
movsd xmm1, QWORD PTR[r13 + 8]
342+
call cabs
343+
movsd QWORD PTR [rsp], xmm0
344+
movsd xmm0, QWORD PTR[r12]
345+
movsd xmm1, QWORD PTR[r12 + 8]
346+
call cabs
347+
movsd xmm1, QWORD PTR [rsp]
348+
subsd xmm0, xmm1
349+
mov rdi, OFFSET fmt
350+
mov rax, 1
351+
call printf
352+
add r12, 16
353+
add r13, 16
354+
jmp approx_loop
355+
approx_return:
356+
add rsp, 8
357+
pop r15
358+
pop r14
359+
pop r13
360+
pop r12
361+
ret
362+
363+
main:
364+
push r12
365+
sub rsp, 2048
366+
mov rdi, 0
367+
call time
368+
mov edi, eax
369+
call srand
370+
lea r12, [rsp + 1024]
371+
loop:
372+
cmp r12, rsp
373+
je end_loop
374+
sub r12, 16
375+
call rand
376+
cvtsi2sd xmm0, rax
377+
divsd xmm0, rand_max
378+
lea rax, [r12 + 1024]
379+
movsd QWORD PTR [r12], xmm0
380+
movsd QWORD PTR [rax], xmm0
381+
mov QWORD PTR [r12 + 8], 0
382+
mov QWORD PTR [rax + 8], 0
383+
jmp loop
384+
end_loop:
385+
mov rdi, rsp
386+
mov rsi, 1024
387+
call iterative_cooley_tukey
388+
lea rdi, [rsp + 1024]
389+
mov rsi, 1024
390+
call cooley_tukey
391+
mov rdi, rsp
392+
lea rsi, [rsp + 1024]
393+
mov rdx, 1024
394+
call approx
395+
add rsp, 2048
396+
pop r12
397+
ret
398+

0 commit comments

Comments
 (0)