#const-generics

无std array-bin-ops

高效的数组二元运算

6个版本

0.1.6 2022年11月25日
0.1.5 2022年1月22日

#744 in 数学

Download history 20/week @ 2024-03-13 25/week @ 2024-03-20 26/week @ 2024-03-27 20/week @ 2024-04-03 49/week @ 2024-04-10 78/week @ 2024-04-17 49/week @ 2024-04-24 56/week @ 2024-05-01 48/week @ 2024-05-08 17/week @ 2024-05-15 26/week @ 2024-05-22 45/week @ 2024-05-29 151/week @ 2024-06-05 116/week @ 2024-06-12 162/week @ 2024-06-19 190/week @ 2024-06-26

每月 630 次下载
用于 iter_num_tools

自定义许可协议

14KB
245

array_bin_ops

Rust 中数组逐元素二元运算的示例实现。

尽可能生成高效的代码,同时避免任何内存安全问题。当前的基准测试表明,它比目前任何仅使用 std 的安全代码都要快。

示例汇编

给定以下Rust代码

/// Adds two `[i64; 32]` arrays and returns the resulting `[i64; 32]`.
///
/// `lhs` is wrapped in `Array` so that the `+` operator defined for that
/// wrapper is applied with `rhs` as the right-hand operand.
pub fn add_i64x32(lhs: [i64; 32], rhs: [i64; 32]) -> [i64; 32] {
    let wrapped = Array(lhs);
    wrapped + rhs
}

它会生成以下汇编代码:在一个完全展开的循环中执行 16 次 i64x2 加法运算,从而避免分支。

# add_i64x32 -- adds two 256-byte arrays as sixteen 128-bit chunks (Intel syntax).
# Register roles as used by the code below:
#   rdi = destination buffer; its address is also returned in rax
#   rsi = first source array, rdx = second source array
# NOTE(review): this looks like System V AMD64 with a hidden return pointer --
# confirm against the actual target ABI.
# Straight-line code: 16 paddq instructions (two 64-bit lanes each), no branches.
add_i64x32:
 sub     rsp, 72                          # four 16-byte spill slots (64 bytes) + 8 bytes padding
 mov     rax, rdi                         # return value = destination pointer
 # Phase 1: load chunks at offsets 0..224 of the rsi array. The chunks at
 # offsets 192 and 224 go through xmm0 into stack slots [rsp] and [rsp + 48].
 movdqu  xmm1, xmmword ptr [rsi]
 movdqu  xmm3, xmmword ptr [rsi + 16]
 movdqu  xmm5, xmmword ptr [rsi + 32]
 movdqu  xmm7, xmmword ptr [rsi + 48]
 movdqu  xmm15, xmmword ptr [rsi + 64]
 movdqu  xmm8, xmmword ptr [rsi + 80]
 movdqu  xmm9, xmmword ptr [rsi + 96]
 movdqu  xmm10, xmmword ptr [rsi + 112]
 movdqu  xmm14, xmmword ptr [rsi + 128]
 movdqu  xmm13, xmmword ptr [rsi + 144]
 movdqu  xmm12, xmmword ptr [rsi + 160]
 movdqu  xmm11, xmmword ptr [rsi + 176]
 movups  xmm0, xmmword ptr [rsi + 192]
 movaps  xmmword ptr [rsp], xmm0
 movdqu  xmm2, xmmword ptr [rsi + 208]
 movups  xmm0, xmmword ptr [rsi + 224]
 movaps  xmmword ptr [rsp + 48], xmm0
 # Phase 2: load the matching chunk from the rdx array and add. The first two
 # sums are parked in stack slots [rsp + 32] and [rsp + 16] so xmm0 can be reused.
 movdqu  xmm0, xmmword ptr [rdx]
 paddq   xmm0, xmm1
 movdqa  xmmword ptr [rsp + 32], xmm0
 movdqu  xmm0, xmmword ptr [rdx + 16]
 paddq   xmm0, xmm3
 movdqa  xmmword ptr [rsp + 16], xmm0
 movdqu  xmm4, xmmword ptr [rdx + 32]
 paddq   xmm4, xmm5
 movdqu  xmm6, xmmword ptr [rdx + 48]
 paddq   xmm6, xmm7
 movdqu  xmm1, xmmword ptr [rdx + 64]
 paddq   xmm1, xmm15
 movdqu  xmm15, xmmword ptr [rdx + 80]
 paddq   xmm15, xmm8
 movdqu  xmm8, xmmword ptr [rdx + 96]
 paddq   xmm8, xmm9
 movdqu  xmm9, xmmword ptr [rdx + 112]
 paddq   xmm9, xmm10
 movdqu  xmm10, xmmword ptr [rdx + 128]
 paddq   xmm10, xmm14
 movdqu  xmm14, xmmword ptr [rdx + 144]
 paddq   xmm14, xmm13
 movdqu  xmm13, xmmword ptr [rdx + 160]
 paddq   xmm13, xmm12
 movdqu  xmm12, xmmword ptr [rdx + 176]
 paddq   xmm12, xmm11
 movdqu  xmm3, xmmword ptr [rdx + 192]
 paddq   xmm3, xmmword ptr [rsp]          # add the spilled rsi chunk from offset 192
 movdqu  xmm7, xmmword ptr [rdx + 208]
 paddq   xmm7, xmm2
 movdqu  xmm5, xmmword ptr [rdx + 224]
 paddq   xmm5, xmmword ptr [rsp + 48]     # add the spilled rsi chunk from offset 224
 # Last chunk (offset 240): both operands are loaded here, after the other adds.
 movdqu  xmm11, xmmword ptr [rsi + 240]
 movdqu  xmm0, xmmword ptr [rdx + 240]
 paddq   xmm0, xmm11
 # Phase 3: store all sixteen sums to the destination. The two parked sums
 # are reloaded from the stack through xmm2.
 movaps  xmm2, xmmword ptr [rsp + 32]
 movups  xmmword ptr [rdi], xmm2
 movaps  xmm2, xmmword ptr [rsp + 16]
 movups  xmmword ptr [rdi + 16], xmm2
 movdqu  xmmword ptr [rdi + 32], xmm4
 movdqu  xmmword ptr [rdi + 48], xmm6
 movdqu  xmmword ptr [rdi + 64], xmm1
 movdqu  xmmword ptr [rdi + 80], xmm15
 movdqu  xmmword ptr [rdi + 96], xmm8
 movdqu  xmmword ptr [rdi + 112], xmm9
 movdqu  xmmword ptr [rdi + 128], xmm10
 movdqu  xmmword ptr [rdi + 144], xmm14
 movdqu  xmmword ptr [rdi + 160], xmm13
 movdqu  xmmword ptr [rdi + 176], xmm12
 movdqu  xmmword ptr [rdi + 192], xmm3
 movdqu  xmmword ptr [rdi + 208], xmm7
 movdqu  xmmword ptr [rdi + 224], xmm5
 movdqu  xmmword ptr [rdi + 240], xmm0
 add     rsp, 72                          # release the spill area
 ret

无运行时依赖