6个版本
0.1.6 | 2022年11月25日 |
---|---|
0.1.5 | 2022年1月22日 |
#744 in 数学
每月 630 次下载
用于 iter_num_tools
14KB
245 行
array_bin_ops
Rust 中数组逐元素二元运算的示例实现。
在不引入任何内存安全问题的前提下生成尽可能高效的代码。当前的基准测试表明,它比目前仅使用 std 所能写出的任何安全(safe)代码都要快。
示例汇编
给定以下Rust代码
/// Adds two 32-element `i64` arrays by way of the crate's `Array` wrapper.
///
/// `lhs` is wrapped in `Array` so that the `+` operator can be applied to the
/// pair of arrays; the value produced by that `+` is returned as a plain
/// `[i64; 32]`.
pub fn add_i64x32(lhs: [i64; 32], rhs: [i64; 32]) -> [i64; 32] {
    let wrapped = Array(lhs);
    wrapped + rhs
}
编译后会生成如下汇编代码:循环被完全展开以避免分支,共执行 16 次 i64x2 加法。
#-----------------------------------------------------------------------
# add_i64x32 -- compiler output for
#   pub fn add_i64x32(lhs: [i64; 32], rhs: [i64; 32]) -> [i64; 32]
#
# Syntax: Intel operand order, `xmmword ptr` size keywords (LLVM/GAS
#         `.intel_syntax noprefix` flavour).
# In:     rsi = first source array, rdx = second source array
#         rdi = destination buffer for the 256-byte result
#         NOTE(review): this register assignment is consistent with the
#         System V AMD64 ABI (hidden return pointer in rdi, lhs in rsi,
#         rhs in rdx) -- confirm against the actual build target.
# Out:    rax = rdi (pointer to the result buffer)
# Clobb:  rax, xmm0-xmm15, flags
# Stack:  72 bytes reserved; four 16-byte slots at [rsp], [rsp + 16],
#         [rsp + 32] and [rsp + 48] are used. The slots are accessed with
#         aligned instructions, so rsp must be 16-byte aligned after the
#         `sub` (holds when rsp % 16 == 8 on entry, as under System V).
#
# The 32 i64 lanes are processed as 16 chunks of two lanes each
# (chunk k lives at byte offset 16*k). Straight-line code: 16 paddq, no
# branches.
#-----------------------------------------------------------------------
add_i64x32:
        sub     rsp, 72
        mov     rax, rdi                        # return value = result pointer

        # Load chunks 0-14 of the first source. Chunks 12 and 14 are parked
        # on the stack (presumably because all 16 xmm registers are in use).
        movdqu  xmm1, xmmword ptr [rsi]
        movdqu  xmm3, xmmword ptr [rsi + 16]
        movdqu  xmm5, xmmword ptr [rsi + 32]
        movdqu  xmm7, xmmword ptr [rsi + 48]
        movdqu  xmm15, xmmword ptr [rsi + 64]
        movdqu  xmm8, xmmword ptr [rsi + 80]
        movdqu  xmm9, xmmword ptr [rsi + 96]
        movdqu  xmm10, xmmword ptr [rsi + 112]
        movdqu  xmm14, xmmword ptr [rsi + 128]
        movdqu  xmm13, xmmword ptr [rsi + 144]
        movdqu  xmm12, xmmword ptr [rsi + 160]
        movdqu  xmm11, xmmword ptr [rsi + 176]
        movups  xmm0, xmmword ptr [rsi + 192]
        movaps  xmmword ptr [rsp], xmm0         # spill first-source chunk 12
        movdqu  xmm2, xmmword ptr [rsi + 208]
        movups  xmm0, xmmword ptr [rsi + 224]
        movaps  xmmword ptr [rsp + 48], xmm0    # spill first-source chunk 14

        # Chunks 0 and 1: sum in xmm0, then park the sums on the stack so
        # xmm0 can be reused.
        movdqu  xmm0, xmmword ptr [rdx]
        paddq   xmm0, xmm1
        movdqa  xmmword ptr [rsp + 32], xmm0    # sum of chunk 0
        movdqu  xmm0, xmmword ptr [rdx + 16]
        paddq   xmm0, xmm3
        movdqa  xmmword ptr [rsp + 16], xmm0    # sum of chunk 1

        # Chunks 2-14: load the second source and add the matching first-source
        # chunk; each sum stays in the register that received the load.
        movdqu  xmm4, xmmword ptr [rdx + 32]
        paddq   xmm4, xmm5
        movdqu  xmm6, xmmword ptr [rdx + 48]
        paddq   xmm6, xmm7
        movdqu  xmm1, xmmword ptr [rdx + 64]
        paddq   xmm1, xmm15
        movdqu  xmm15, xmmword ptr [rdx + 80]
        paddq   xmm15, xmm8
        movdqu  xmm8, xmmword ptr [rdx + 96]
        paddq   xmm8, xmm9
        movdqu  xmm9, xmmword ptr [rdx + 112]
        paddq   xmm9, xmm10
        movdqu  xmm10, xmmword ptr [rdx + 128]
        paddq   xmm10, xmm14
        movdqu  xmm14, xmmword ptr [rdx + 144]
        paddq   xmm14, xmm13
        movdqu  xmm13, xmmword ptr [rdx + 160]
        paddq   xmm13, xmm12
        movdqu  xmm12, xmmword ptr [rdx + 176]
        paddq   xmm12, xmm11
        movdqu  xmm3, xmmword ptr [rdx + 192]
        paddq   xmm3, xmmword ptr [rsp]         # add spilled chunk 12
        movdqu  xmm7, xmmword ptr [rdx + 208]
        paddq   xmm7, xmm2
        movdqu  xmm5, xmmword ptr [rdx + 224]
        paddq   xmm5, xmmword ptr [rsp + 48]    # add spilled chunk 14

        # Chunk 15: the first-source half is only loaded here, into xmm11,
        # whose previous value was consumed by the chunk-11 add above.
        movdqu  xmm11, xmmword ptr [rsi + 240]
        movdqu  xmm0, xmmword ptr [rdx + 240]
        paddq   xmm0, xmm11

        # Write all 16 sums to the result buffer. Chunks 0 and 1 come back
        # from their stack slots through xmm2.
        movaps  xmm2, xmmword ptr [rsp + 32]
        movups  xmmword ptr [rdi], xmm2
        movaps  xmm2, xmmword ptr [rsp + 16]
        movups  xmmword ptr [rdi + 16], xmm2
        movdqu  xmmword ptr [rdi + 32], xmm4
        movdqu  xmmword ptr [rdi + 48], xmm6
        movdqu  xmmword ptr [rdi + 64], xmm1
        movdqu  xmmword ptr [rdi + 80], xmm15
        movdqu  xmmword ptr [rdi + 96], xmm8
        movdqu  xmmword ptr [rdi + 112], xmm9
        movdqu  xmmword ptr [rdi + 128], xmm10
        movdqu  xmmword ptr [rdi + 144], xmm14
        movdqu  xmmword ptr [rdi + 160], xmm13
        movdqu  xmmword ptr [rdi + 176], xmm12
        movdqu  xmmword ptr [rdi + 192], xmm3
        movdqu  xmmword ptr [rdi + 208], xmm7
        movdqu  xmmword ptr [rdi + 224], xmm5
        movdqu  xmmword ptr [rdi + 240], xmm0

        add     rsp, 72
        ret