-
Notifications
You must be signed in to change notification settings - Fork 67
Open
Labels
enhancementNew feature or requestNew feature or requestgood first issueGood for newcomersGood for newcomershelp wantedExtra attention is neededExtra attention is needed
Description
We can rewrite the whole "pipeline" as a single C 99 version:
void pipeline_kernel_c99(unsigned long start, unsigned long end, unsigned long *sum_out, unsigned long *count_out) {
unsigned long sum = 0;
unsigned long count = 0;
unsigned long max_power_of_three = 12157665459056928801ull;
for (unsigned long value = start; value <= end; ++value) {
// Check if the value is a power of two
if (value > 0 && (value & (value - 1)) == 0) continue;
// Check if the value is a power of three
if (value > 0 && max_power_of_three % value == 0) continue;
// Prime factorization
unsigned long n = value;
unsigned long factor = 2;
// Handle factor 2 separately
while (n % 2 == 0) {
sum += 2;
count += 1;
n /= 2;
}
// Handle odd factors
factor = 3;
while (factor * factor <= n) {
while (n % factor == 0) {
sum += factor;
count += 1;
n /= factor;
}
factor += 2;
}
// If n is still greater than 1, it is a prime factor
if (n > 1) {
sum += n;
count += 1;
}
}
*sum_out = sum;
*count_out = count;
}
GCC 14.2 produces the following Assembly:
pipeline_kernel_c99(unsigned long, unsigned long, unsigned long*, unsigned long*):
push rbp
mov rbp, rcx
push rbx
mov rbx, rdx
cmp rsi, rdi
jb .L15
mov r9, rdi
mov r10, rsi
xor edi, edi
xor r8d, r8d
movabs r11, -6289078614652622815
jmp .L14
.L7:
add r9, 1
cmp r10, r9
jb .L2
.L14:
xor ecx, ecx
test r9, r9
je .L9
lea rax, [r9-1]
test rax, r9
je .L7
mov rax, r11
xor edx, edx
div r9
test rdx, rdx
je .L7
mov rcx, r9
test r9b, 1
jne .L39
.L9:
shr rcx
add r8, 2
add rdi, 1
test cl, 1
je .L9
cmp rcx, 8
jbe .L10
.L5:
mov esi, 3
.L11:
mov rax, rcx
xor edx, edx
div rsi
test rdx, rdx
jne .L13
.L12:
mov rax, rcx
xor edx, edx
add r8, rsi
add rdi, 1
div rsi
xor edx, edx
mov rcx, rax
div rsi
test rdx, rdx
je .L12
.L13:
add rsi, 2
mov rax, rsi
imul rax, rsi
cmp rcx, rax
jnb .L11
.L10:
cmp rcx, 1
je .L7
.L6:
add r9, 1
add r8, rcx
add rdi, 1
cmp r10, r9
jnb .L14
.L2:
mov QWORD PTR [rbx], r8
mov QWORD PTR [rbp+0], rdi
pop rbx
pop rbp
ret
.L39:
cmp r9, 8
ja .L5
jmp .L6
.L15:
xor edi, edi
xor r8d, r8d
mov QWORD PTR [rbx], r8
mov QWORD PTR [rbp+0], rdi
pop rbx
pop rbp
ret
It seems like this can be further optimized. Who's up for the task?
Metadata
Metadata
Assignees
Labels
enhancementNew feature or requestNew feature or requestgood first issueGood for newcomersGood for newcomershelp wantedExtra attention is neededExtra attention is needed