小编典典

计算 [1..N] 中前导 1 以下 K 个零位的整数？（没有 HW POPCNT 的连续范围的 popcount）

all

我有以下任务：计算 1 和 N 之间有多少个数字恰好有 K 个零非前导位。（例如 7 10 =111 2将有 0 个，4 将有 2 个）

N 和 K 满足条件 0 ≤ K, N ≤ 1000000000

这个版本使用 POPCNT 并且在我的机器上足够快：

%include "io.inc"

section .bss
    n resd 1
    k resd 1
    ans resd 1
section .text
global CMAIN
CMAIN:
    GET_DEC 4,n
    GET_DEC 4,k
    mov ecx,1
    mov edx,0
    ;ecx is counter from 1 to n

loop_:
    mov eax, ecx
    popcnt eax,eax;in eax now amount of bits set
    mov edx, 32
    sub edx, eax;in edx now 32-bits set=bits not set

    mov eax, ecx;count leading bits
    bsr eax, eax;
    xor eax, 0x1f;
    sub edx, eax
    mov eax, edx
    ; all this lines something like (gcc):
    ; eax=32-__builtin_clz(x)-_mm_popcnt_u32(x);

    cmp eax,[k];is there k non-leading bits in ecx?
    jnz notk
    ;if so, then increment ans

    mov edx,[ans]
    add edx,1
    mov [ans],edx
notk:
    ;increment counter, compare to n and loop
    inc ecx
    cmp ecx,dword[n]
    jna loop_

    ;print ans
    PRINT_DEC 4,ans
    xor  eax, eax
    ret

就速度而言（~0.8 秒）应该没问题，但它没有被接受，因为（我猜）测试服务器上使用的 CPU 太旧了，所以它表明发生了运行时错误。

我尝试使用带有 64K * 4 字节查找表的预计数技巧，但速度不够快：

%include "io.inc"
section .bss
    n resd 1
    k resd 1
    ans resd 1
    wordbits resd 65536; bits set in numbers from 0 to 65536
section .text
global CMAIN
CMAIN:
    mov ebp, esp; for correct debugging
    mov ecx,0
    ;mov eax, ecx
    ;fill in wordbits, ecx is wordbits array index
precount_:
    mov eax,ecx
    xor ebx,ebx
    ;c is ebx, v is eax
    ;for (c = 0; v; c++){
    ;    v &= v - 1; // clear the least significant bit set
    ;}
lloop_:
    mov edx,eax
    dec edx
    and eax,edx
    inc ebx
    test eax,eax
    jnz lloop_

    ;computed bits set
    mov dword[wordbits+4*ecx],ebx

    inc ecx
    cmp ecx,65536
    jna precount_

    ;0'th element should be 0
    mov dword[wordbits],0

    GET_DEC 4,edi;n
    GET_DEC 4,esi;k

    mov ecx,1
    xor edx,edx
    xor ebp,ebp

loop_:
    mov eax, ecx
    ;popcnt eax,eax
    mov edx,ecx
    and eax,0xFFFF 
    shr edx,16
    mov eax,dword[wordbits+4*eax]
    add eax,dword[wordbits+4*edx]
    ;previous lines are to implement absent instruction popcnt.
    ; they simply do eax=wordbits[x & 0xFFFF] + wordbits[x >> 16]
    mov edx, 32
    sub edx, eax
    ;and the same as before: 
    ;non-leading zero bits=32-bits set-__builtin_clz(x)
    mov eax, ecx
    bsr eax, eax
    xor eax, 0x1f
    sub edx, eax
    mov eax, edx

    ;compare to k again to see if this number has exactly k 
    ;non-leading zero bits

    cmp edx, esi
    jnz notk

    ;increment ebp (answer) if so
    mov edx, ebp
    add edx, 1
    mov ebp, edx
    ;and (or) go to then next iteration 
notk:
    inc ecx
    cmp ecx, edi
    jna loop_

    ;print answer what is in ebp
    PRINT_DEC 4, ebp
    xor  eax, eax
    ret

(>1 秒)

我应该加快第二个程序（如果是，那么如何？）或以某种方式用其他一些（哪个？）指令替换 POPCNT（我猜应该可以使用 SSE2 和更早版本）？

阅读 215

2022-02-28

共1个答案

小编典典

首先，太旧的服务器popcnt在其他方面会明显变慢，并且有不同的瓶颈。鉴于它有 pshufb 但没有 popcnt，它是 Core 2 第一代或第二代（Conroe 或 Penryn）。请参阅 Agner Fog 的 microarch PDF（在https://agner.org/optimize/上）。还降低了时钟速度，因此您可以在该 CPU 上做的最好的事情可能不足以让蛮力工作。

可能存在可以节省大量时间的算法改进，例如注意到每 4 个增量循环低 2 位通过 00、01、10、11 模式：每四个增量发生一次 2 个零，发生两次 1 个零，没有零发生一次。对于每个 >= 4 的数字，这 2 位都低于前导位，因此是计数的一部分。将其推广到 1 到 log2(N) 之间的每个 MSB 位置的组合公式可能是一种大大减少工作量的方法。处理 2^M 和 N 之间的数字不太明显。

这里的版本：

清理 popcnt 版本，在 i7-6700k @ 3.9GHz 上为 536ms，没有跨迭代的算法优化。对于 k=8，N=1000000000
朴素 LUT 版本（每次迭代 2 次加载，无迭代间优化）：运行良好时约为 595 毫秒，k=8 时更常见约为 610 毫秒，N=1000000000。Core2Duo (Conroe) @ 2.4GHz：1.69 秒。（编辑历史中有几个更糟糕的版本，第一个版本在 Core 2 上有部分寄存器停顿。）
（未完成，未编写清理代码）优化的 LUT 版本（展开，高半/MSB BSR 工作已提升，每次迭代仅留下 1 个查找 (cmp/jcc)），Skylake 上 210 毫秒，Core 2 @ 2.4GHz 上 0.58 秒。时间应该是现实的；我们已经完成了所有工作，只是错过了 MSB 处于低 16 位的最后 2^16 次迭代。在外循环中处理任何必要的极端情况以及清理，对速度的影响不应超过 1%。
pcmpeqb（甚至更多未完成）：用/对优化的 LUT 版本进行矢量化psubb（psadbw在外循环中，如How to count character occurrences using SIMD显示 -内循环减少为计算固定大小数组中与计算值匹配的字节元素外循环。 就像标量版本一样）。Skylake 为 18 毫秒，Core 2 为约 0.036 秒。这些时间现在可能包括相当多的启动开销。但正如预期/希望的那样，两者都快了大约 16 倍。
wordbits对表格进行一次直方图（可能在您生成它时）。无需搜索 64kiB 来查找匹配的字节，只需查找每次外循环迭代的答案！对于大 N，这应该让您的速度提高数千倍。（尽管当 N 不是 64K 的倍数时，您仍然需要处理低 1..64K 和部分范围。）

为了有效地测量更快的版本，您可以在整个过程中重复循环，这样整个过程仍然需要一些可测量的时间，比如半秒。（因为它是 asm，没有编译器会优化重复执行相同 N，k 的工作。）或者，rdtsc如果您知道 TSC 频率，您可以在程序内部进行计时。但是能够perf stat轻松地在整个过程中使用很好，所以我会继续这样做（取出 printf 并制作一个静态可执行文件以进一步减少启动开销）。

您似乎在询问如何对仍然单独检查每个数字的蛮力方法进行微优化。32 - clz - popcnt == k（尽管您的实现方式可以进行重大优化。）

还有其他一些通常更快的 popcnt 方法，例如如何计算 32 位整数中设置的位数？. 但是，当您在一个紧密的循环中进行大量弹出计数（足以使查找表在缓存中保持热）时，LUT 可能会很好。

如果您有快速 SSSE3 pshufb，则值得使用它在 XMM 寄存器中并行执行四个双字的 SIMD 弹出计数（自动矢量化循环），或者在带有 AVX2 的 YMM 寄存器中更好。（第一代 Core2 有pshufb，但在第二代 Core2 之前它不是单一的 uop。仍然可能值得。）

或者更好的是，使用 SIMD 来计算与我们正在寻找的匹配的 LUT 元素，对于给定的数字的高半部分。

蛮力检查连续数字范围为 LUT 策略开辟了重大优化：数字的高 n 位仅每 2^n 增量更改一次。 因此，您可以将这些位的计数提升到最内层循环之外。这也使得使用较小的表（适合 L1d 缓存）值得。

说到这里，你的 64k * 4 表是 256KiB，你的 L2 缓存的大小。这意味着每次循环通过它时可能都必须从 L3 进入。你的桌面 CPU 应该有足够的 L3 带宽（由于增量，访问模式是连续的），现代服务器有更大的 L2，但几乎没有理由不使用字节 LUT（popcnt(-1) 只有 32 ）。现代 Intel CPU（从 Haswell 开始）不会将 AL 与 EAX/RAX 的其余部分分开重命名，并且字节加载与dword 加载movzx一样便宜。mov

; General LUT lookup with two 16-bit halves
    movzx  edx, cx            ; low 16 bits
    mov    eax, ecx
    shr    eax, 16            ; high 16 bits
    movzx  edx, byte [wordbits + edx]
    add     dl,      [wordbits + eax]
      ; no partial-reg stall for reading EDX after this, on Intel Sandybridge and later
      ; on Core 2, set up so you can cmp al,dl later to avoid it

在一个太旧以至于不支持的 Intel CPU 上popcnt，这将导致部分寄存器停顿。改为进行下一个比较cmp al, dl。（在结果上使用leaoradd或，而不是 popcount LUT 加载，这样可以避免部分寄存器停顿。）sub``bsr

通常您希望使用较小的 LUT，例如每步 11 位，因此 3 步处理整个 32 位数字（2^11 = 2048 字节，32k L1d 的一小部分）。但是通过这种顺序访问模式，硬件预取可以处理它并完全隐藏延迟，尤其是当 L1d 预取将在 L2 中命中时。同样，这很好，因为此循环不涉及此查找表以外的任何内存。在每个 popcount 之间发生大量其他工作的正常情况下，查找表要糟糕得多，或者您在缓存中有任何其他有价值的数据，您不想驱逐。

针对 Skylake (i7-6700k) 进行了优化：即使每次迭代有 2 个 LUT 访问：0.600 秒，3.9GHz

与 popcnt 的 0.536 秒相比。 提升高半 LUT 查找（可能还有32常量）甚至可能让该版本更快。

注意：一个太旧以至于它没有的 CPUpopcnt与 Skylake 有很大的不同。为 Skylake 优化这个有点傻，除非你更进一步并最终击败popcntSkylake 上的版本，如果我们可以通过嵌套循环提升 BSR 工作，这是可能的，内部循环使用相同的 BSR 结果整个数字范围2^m .. 2^(m+1)-1（钳制到 64k 范围，因此您还可以提升高半 popcnt LUT 查找）。 == 从、和popcnt_low(i)计算的一些常数。k``popcnt_high(i)``clz(i)

对 Skylake 来说，3 件大事非常重要（其中一些与旧 CPU 相关，包括出于前端原因避免采用分支）：

避免在具有最新微码的英特尔 Skylake 派生 CPU 上让 cmp/jcc 触及 32 字节边界，因为英特尔通过禁用此类行的 uop 缓存来缓解 JCC 错误：32 字节对齐例程不适合微信缓存

这会查看反汇编并决定是否使指令更长（例如，lea eax, [dword -1 + edx]强制使用 4 字节位移而不是更小的 disp8。）以及是否align 32在循环顶部使用。

无增量比增量更常见，英特尔 CPU 只能以 1/clock 运行采用的分支。（但由于 Haswell 在另一个端口上有第二个执行单元，可以运行预测未采用的分支。）更改jne notk为je yesk跳回函数下方的块。dec ecx// else的尾部重复jnz .loop下降到jmp print_and_exit一个很小的数量，而只是在 . 之后跳回je yesk。

它很少使用（并且具有足够一致的模式）以至于它不会经常错误预测，因此setnz al/add ebx, eax可能会更糟。

优化32 - clz - popcnt == k检查，利用bsr给你的事实31-clz。所以31-clz - (popcnt-1) = 32-clz-popcnt。
由于我们正在比较，== k因此可以进一步重新排列为popcnt-1 + k == 31-clz。
当我们将 LUT 用于 popcount 时popcnt，我们可以负担得起使用 3 组件（慢）LEA 来代替必须在端口 1 上运行的指令lea edx, [edx + esi - 1]来执行popcnt-1+k. 由于它有 3 个组件（2 个寄存器和一个位移，2 个+寻址模式中的符号），它只能在端口 1 上运行（具有 3 个周期延迟），与 bsr（如果我们使用它，还有 popcnt）竞争。

通常利用lea保存的指令，即使在 popcnt 版本中也是如此。使用宏融合的 1-uopdec/jnz而不是inc+ ，将循环向下计数到 0 也是如此cmp/jne。（我还没有尝试过计算 L1d 硬件预取是否在这个方向上效果更好；popcnt 版本不在乎，但 LUT 版本可能会。）

移植到没有 io.inc 的情况下工作，只需使用带有 printf 的硬编码 N 和 k 进行输出。这不是“干净”的代码，例如%define wordbits edi我使用索引寻址模式而不是[reg + disp32]每次访问数组来测试更改分支对齐方式的讨厌黑客。这恰好起到了作用，让几乎所有的微指令都来自 DSB（微指令缓存）而不是 MITE，即避免了 JCC 勘误表减速。另一种方法是使指令更长，推动cmp/je和dec/jnz超过 32 字节的边界。（或更改循环开始的对齐方式。）Uop-cache 提取发生在最多 6 个 uops 的行中，如果您最终得到一行只有几个 uops 的行，则可能成为瓶颈。（Skylake 的循环缓冲区又名 LSD 也被微码禁用以修复早期的错误；英特尔在 Skylake 上的大错误比大多数设计都多。）

%use SMARTALIGN
alignmode p6, 64

section .bss
 wordbits: resb 65536
;    n resd 1
;    k resd 1
    ans resd 1
section .rodata
  n: dd 1000000000
  k: dd 8
  print_fmt: db `ans: %d\n`, 0

section .text

global main
main:            ; no popcnt version
    push  ebp
    push  edi    ; save some call-preserved registers
    push  esi
    push  ebx

    mov   edi, wordbits
%define wordbits edi             ; dirty hack, use indexed addressing modes instead of reg+disp32.
                                 ; Avoids Skylake JCC erratum problems, and is is slightly better on Core2 with good instruction scheduling
    ;fill in wordbits, ecx is wordbits array index
    mov   ecx, 1     ; leave wordbits[0] = 0
.init_loop:
    mov   eax,ecx
    xor   ebx,ebx
  .popc_loop:
      lea   edx, [eax-1]
      inc   ebx
      and   eax,edx         ; v &= v - 1; // blsr
      jnz  .popc_loop

    ;computed bits set
    mov [wordbits + ecx], bl

    inc ecx
    cmp ecx,65536
    jb .init_loop       ; bugfix: array out of bounds with jna: stores to wordbits[65536]


;    GET_DEC 4,n
;    GET_DEC 4,k
    mov   ecx, [n]      ; ecx counts from n down to 1
;    mov   esi, [k]
    xor   ebx, ebx      ; ebx = ans

    mov   esi, 1
    sub   esi, [k]      ; 1-k
align 32
.loop:
    ;popcnt eax, ecx
    movzx  eax, cx
    mov    ebp, ecx         ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
    movzx  edx, byte [wordbits + eax]
    shr    ebp, 16
;    xor eax, eax        ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    eax, ecx         ; eax = 31-lzcnt for non-zero ecx
;    sub    edx, esi         ; sub now avoids partial-reg stuff.  Could have just used EBX to allow BL.
    add eax, esi           ; Add to BSR result seems slightly better on Core2 than sub from popcnt
    add     dl,      [wordbits + ebp]   ; we don't read EDX, no partial-register stall even on P6-family

                        ;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
    cmp  al, dl         ; 31-clz+(1-k)  == popcount.  or  31-clz == popcnt - (1-k)
    je .yesk          ; not-taken is the more common fast path
 .done_inc:
    dec ecx
    jnz .loop         ; }while(--n >= 0U)

.print_and_exit:
    ;print ans
;    PRINT_DEC 4,ans
    push  ebx
    push  print_fmt
extern printf
    call  printf
    add   esp, 8

    pop  ebx
    pop  esi
    pop  edi
    pop  ebp
    xor  eax, eax
    ret

align 8
.yesk:
   inc  ebx
;   jmp  .done_inc           ; tail duplication is a *tiny* bit faster
   dec  ecx
   jnz  .loop
   jmp  .print_and_exit

这是第 3 版，已更新以避免对 Core 2 (Conroe) 的部分注册惩罚。在 1.78 秒内运行1.69秒与 3.18 秒。有时在 Skylake 上还是一样快，但更常见的是 610 毫秒而不是 594 毫秒。我的 Core 2 上没有性能计数器访问权限；perf 太老了，无法完全支持，而且我没有最后启动的内核的 perf。

（Godbolt 版本 1 的反汇编和性能结果：https ://godbolt.org/z/ox7e8G ）

在我的 Linux 桌面上，i7-6700k 频率为 3.9GHz。（EPP = balance_performance，不是全性能，所以它显然不想加速到 4.2GHz。）我不需要sudo使用 perf，因为我设置了/proc/sys/kernel/perf_event_paranoid= 0。我taskset -c 3只是为了避免单线程工作负载的 CPU 迁移。

# Results from version 1, not the Core2-friendly version.
# Version 3 sometimes runs this fast, but more often ~610ms
# Event counts are near identical for both, except cycles, but uops_issue and executed are mysteriously lower, like 9,090,858,203 executed.
$ nasm -felf32 foo.asm -l/dev/stdout &&
    gcc -m32 -no-pie -fno-pie -fno-plt foo.o 
$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,branches,branch-misses,instructions,uops_issued.any,uops_executed.thread -r 2 ./a.out 
ans: 12509316
ans: 12509316

 Performance counter stats for './a.out' (2 runs):

            597.78 msec task-clock                #    0.999 CPUs utilized            ( +-  0.12% )
                 0      context-switches          #    0.000 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                62      page-faults               #    0.103 K/sec                    ( +-  0.81% )
     2,328,038,096      cycles                    #    3.894 GHz                      ( +-  0.12% )
     2,000,637,322      branches                  # 3346.789 M/sec                    ( +-  0.00% )
         1,719,724      branch-misses             #    0.09% of all branches          ( +-  0.02% )
    11,015,217,584      instructions              #    4.73  insn per cycle           ( +-  0.00% )
     9,148,164,159      uops_issued.any           # 15303.609 M/sec                   ( +-  0.00% )
     9,102,818,982      uops_executed.thread      # 15227.753 M/sec                   ( +-  0.00% )

   (from a separate run):
      9,204,430,548      idq.dsb_uops              # 15513.249 M/sec                   ( +-  0.00% )
         1,008,922      idq.mite_uops             #    1.700 M/sec                    ( +- 20.51% )


          0.598156 +- 0.000760 seconds time elapsed  ( +-  0.13% )

这大约是 3.93 融合域（前端）微指令/时钟。所以我们非常接近 4/clock 的前端宽度。

使用popcnt：

您的原始文件（GET_DEC替换为加载常量）在我的桌面上运行了 1.3 秒，k=8 N=1000000000。此版本运行时间约为 0.54 秒。我的原始版本甚至不是分支对齐的最坏情况（另一个版本大约是 1.6 秒），尽管由于我确实必须更改某些内容，因此它可能与您的机器不同。

我使用了与上面几乎相同的优化来节省 uops 并帮助循环内的前端。（但我先这样做了，所以它缺少一些优化。）

align 32
.loop:
    mov    eax, ecx
    popcnt eax,eax
    lea    edx, [dword eax - 32 + 31]  ; popcnt - 32  =  -(bits not set)
                   ; dword displacement pads the cmp/jnz location to avoid the JCC erratum penalty on Intel

;    xor eax, eax         ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr eax, ecx         ; eax = 31-lzcnt
;    xor eax, 0x1f        ; eax = lzcnt (for non-zero x)
    ; want:  32-__builtin_clz(x)-_mm_popcnt_u32(x)  = (31-clz) + 1-popcnt = (31-clz) - (popcnt-1)
    sub eax, edx

    cmp eax, esi  ;is there k non-leading bits in ecx?
%if 0
    jnz .notk
    inc ebx       ;if so, then increment ans
.notk:
%else
    jz .yesk      ; not-taken is the more common fast path
 .done_inc:
%endif
    dec ecx
    jnz .loop   ; }while(--n >= 0U)

    ;print ans
;    PRINT_DEC 4,ans
    push  ebx
    push  print_fmt
extern printf
    call  printf
    add   esp, 8

    pop  ebx
    pop  esi
    xor  eax, eax
    ret

.yesk:
   inc  ebx
   jmp  .done_inc         ;; TODO: tail duplication

（未完成）具有不变 clz(x) 和高半 popcount 的内循环

这个版本在我的 2.4GHz Core 2 Duo E6600 (Conroe) 上运行仅需 0.58 秒，微架构与Xeon 3050 2.13GHz 相同。
在我的 Skylake 上只需 210 毫秒。

它完成了大部分工作，仅缺少对 N < 65536 的清理（或较大 N 的低 65536，其中 MSB 位于低半部分），并且可能缺少处理外循环中的其他几个极端情况。但是内部循环完全控制了运行时，它不必运行更多，所以这些时间应该是现实的。

它仍然蛮力检查每一个数字，但依赖于高半部分的每个数字的大部分工作都是循环不变的并且被提升了。假设非零高半部分，但只有 2^16 个数字的 MSB 在低 16 位。缩小到仅低 12 或 14 位意味着更少的清理，以及循环的 LUT 的较小部分可以保留L1d很热。

%use SMARTALIGN
alignmode p6, 64

section .bss
align 4096
 wordbits: resb 65536
;    n resd 1
;    k resd 1
;    ans resd 1
section .rodata
  ;n: dd 0x40000000        ; low half zero, maybe useful to test correctness for a version that doesn't handle that.
  n:  dd 1000000000 ; = 0x3b9aca00
  k: dd 8
  print_fmt: db `ans: %d\n`, 0

section .text
global main

align 16
main:
main_1lookup:
    push  ebp
    push  edi    ; save some call-preserved registers
    push  esi
    push  ebx

    mov   edi, wordbits
;%define wordbits edi             ; dirty hack, use indexed addressing modes instead of reg+disp32.
                                 ; actually slightly worse on Skylake: causes un-lamination of cmp bl, [reg+reg],
                                 ; although the front-end isn't much of a bottleneck anymore
                                 ; also seems pretty much neutral to use disp32+reg on Core 2, maybe reg-read stalls or just not a front-end bottleneck
    ;fill in wordbits, ecx is wordbits array index
    mov   ecx, 1     ; leave wordbits[0] = 0
.init_loop:
    mov   eax,ecx
    xor   ebx,ebx
  .popc_loop:
      lea   edx, [eax-1]
      inc   ebx
      and   eax,edx         ; v &= v - 1; // blsr
      jnz  .popc_loop

    ;computed bits set
    mov [wordbits + ecx], bl

    inc ecx
    cmp ecx,65536
    jb .init_loop


;    GET_DEC 4,n
;    GET_DEC 4,k
    mov   ecx, [n]      ; ecx counts from n down to 1
;    mov   esi, [k]
    xor   esi, esi      ; ans

    mov   ebp, 1
    sub   ebp, [k]      ; 1-k
align 32
.outer:
    mov    eax, ecx         ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
    shr    eax, 16
;    xor eax, eax        ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    ebx, ecx         ; eax = 31-lzcnt for non-zero ecx
         ;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
         ; 31-clz+(1-k)  == popcount.  or  31-clz == popcnt - (1-k)
         ; 31-cls+(1-k) - popcount(hi(x)) == popcount(lo(x))
    add    ebx, ebp
    sub     bl, byte [wordbits + eax]

    ;movzx  edx, cx
    lea    edx, [ecx - 4]   ; TODO: handle cx < 4 making this wrap
    movzx  edx, dx
    and    ecx, -65536      ; clear low 16 bits, which we're processing with the inner loop.
align 16
  .low16:
    cmp   bl, [wordbits + edx + 0]
    je    .yesk0
  .done_inc0:
    cmp   bl, [wordbits + edx + 1]
    je    .yesk1
  .done_inc1:
    cmp   bl, [wordbits + edx + 2]
    je    .yesk2
  .done_inc2:
    cmp   bl, [wordbits + edx + 3]
    je    .yesk3
  .done_inc3:

; TODO: vectorize with pcmpeqb / psubb / psadbw!!
; perhaps over fewer low bits to only use 16kiB of L1d cache

    sub  edx, 4
    jae  .low16        ; }while(lowhalf-=4 doesn't wrap)

   sub   ecx, 65536
   ja   .outer
; TODO: handle ECX < 65536 initially or after handling leading bits.  Probably with BSR in the inner loop


.print_and_exit:
    ;print ans
;    PRINT_DEC 4,ans
    push  esi
    push  print_fmt
extern printf
    call  printf
    add   esp, 8

    pop  ebx
    pop  esi
    pop  edi
    pop  ebp
    xor  eax, eax
    ret

align 16
%assign i 0
%rep 4
;align 4
.yesk%+i:
   inc  esi
   jmp  .done_inc%+i
%assign i  i+1
%endrep
  ; could use a similar %rep block for the inner loop

     ; attempt tail duplication?
     ; TODO: skip the next cmp/jcc when jumping back.
     ; Two in a row will never both be equal

;   dec  ecx
;   jnz  .loop
;   jmp  .print_and_exit

Skylake 性能结果：

(update after outer-loop over-count on first iter bugfix, ans: 12497876)

ans: 12498239        # This is too low by a bit vs. 12509316
                     # looks reasonable given skipping cleanup

            209.46 msec task-clock                #    0.992 CPUs utilized          
                 0      context-switches          #    0.000 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                62      page-faults               #    0.296 K/sec                  
       813,311,333      cycles                    #    3.883 GHz                    
     1,263,086,089      branches                  # 6030.123 M/sec                  
           824,103      branch-misses             #    0.07% of all branches        
     2,527,743,287      instructions              #    3.11  insn per cycle         
     1,300,567,770      uops_issued.any           # 6209.065 M/sec                  
     2,299,321,355      uops_executed.thread      # 10977.234 M/sec                 

(from another run)
        37,150,918      idq.dsb_uops              #  174.330 M/sec                  
     1,266,487,977      idq.mite_uops             # 5942.976 M/sec

       0.211235157 seconds time elapsed

       0.209838000 seconds user
       0.000000000 seconds sys

请注意， uops_issued.any 与 idq.DSB_uops + idq.MITE_uops 大致相同 - 如果我们使用 EDI 作为指针来保存代码大小，则 uops_issued.any 会高得多，因为从 micro 中取消了索引寻址模式+ 宏融合 cmp+jcc。

同样有趣的是，分支未命中率更低；也许展开有助于在 IT-TAGE 预测器表中更好地分布历史。

SSE2 SIMD

也未完成，没有处理角落案例或清理，但我认为做的工作量大致合适。

与如何使用 SIMD 计算字符出现次数不同，我们匹配的数组对匹配发生的频率有已知限制，因此（大部分？）不做嵌套循环是安全的，只有 2^14 (16384 ) 在将字节计数器扩大到 dword 之前，迭代循环展开 2。至少对于 k=8。

总计数为 12507677，略低于 12509316（对于 N=1000000000，k=8 正确），但我没有检查这是否是因为没有执行 1..16384，或者我是否丢失了任何在任何地方都很重要。

您可以展开外部循环迭代，以对每个负载使用每个 XMM 向量两次或 4 次。（通过对 L1d 缓存中的数组进行顺序访问，这可能会让我们通过每次加载执行更多 ALU 工作来稍微快一点，但速度不会快很多。）通过设置 2 或 4 个向量来匹配 2 或 4 个不同的高半部分，您可以在内部循环中花费更长的时间。可能我们可以比每个时钟 1 次比较/累加快一点。但这可能会在 Core 2 上遇到（冷）寄存器读取瓶颈。

下面的版本只是进行标准展开。

;;;;;  Just the loop from main_SSE2, same init stuff and print as main_1lookup
align 32
.outer:
    mov    eax, ecx         ; using an extra register (EBP) to schedule instructions better(?) for Core2 decode
    shr    eax, 16-2

;    xor eax, eax        ; break false dependency, or just let OoO exec hide it after breaking once per iter
    bsr    ebx, ecx         ; eax = 31-lzcnt for non-zero ecx
         ;; want: k == 32-__builtin_clz(x)-_mm_popcnt_u32(x)
         ; 31-clz+(1-k)  == popcount.  or  31-clz == popcnt - (1-k)
         ; 31-cls+(1-k) - popcount(hi(x)) == popcount(lo(x))
    add    ebx, ebp
    movzx  edx, al
;    movzx  edx, byte [wordbits + edx]
    sub    bl, byte [wordbits + edx]
    shr    eax, 8            ; high part is more than 16 bits if low is 14, needs to be broken up
    sub    bl, byte [wordbits + eax]
;    movzx  eax, byte [wordbits + eax]
;    add    eax, edx
;    sub    ebx, eax

    movzx  eax,  bl
    movd   xmm7, eax
    pxor   xmm0, xmm0
    pxor   xmm1, xmm1    ; 2 accumulators
    pshufb xmm7, xmm0    ; broadcast byte to search for.
      ;;  Actually SSSE3, but it only takes a few more insns to broadcast a byte with just SSE2.  
      ;; e.g. imul eax, 0x01010101 / movd / pshufd

    ;movzx  edx, cx
;    lea    edx, [ecx - 4]   ; TODO: handle cx < 4 making this wrap
;    movzx  edx, dx
    and    ecx, -16384      ; clear low bits, which we're processing with the inner loop.

    mov    edx, wordbits     ; quick and dirty, just loop forward over the array
     ;; FIXME: handle non-zero CX on first outer loop iteration, maybe loop backwards so we can go downwards toward 0,
     ;; or calculate an end-pointer if we can use that without register-read stalls on Core 2.
     ;; Also need to handle the leftover part not being a multiple of 32 in size
     ;; So maybe just make a more-flexible copy of this loop and peel the first outer iteration (containing that inner loop)
     ;;  if the cleanup for that slows down the common case of doing exactly 16K 
align 16
  .low14:
    movdqa  xmm2, [edx]
    movdqa  xmm3, [edx + 16]
 ds   pcmpeqb xmm2, xmm7           ; extra prefixes for padding for Skylake JCC erratum: 18ms vs. 25ms
 ds   psubb   xmm0, xmm2
    ds add     edx, 32
 cs   pcmpeqb xmm3, xmm7
 cs   psubb   xmm1, xmm3

  ; hits are rare enough to not wrap counters?
  ; TODO: may need an inner loop to accumulate after 256 steps if every other 32nd element is a match overflowing some SIMD element
    cmp    edx, wordbits + 16384
    jb   .low14

   pxor   xmm7, xmm7
   psadbw xmm0, xmm7
   psadbw xmm1, xmm7       ; byte -> qword horizontal sum
   paddd  xmm0, xmm1       ; reduce to 1 vector
   movhlps xmm1, xmm0
   paddd  xmm0, xmm1       ; hsum the low/high counts
   movd   eax, xmm0
   add    esi, eax         ; sum in scalar (could sink this out)

   sub   ecx, 16384
   ja   .outer
; TODO: handle ECX < 65536 initially or after handling leading bits.  Probably with BSR in the inner loop

可能可以只将 PADDD 放入向量累加器中，并且只能将 hsum 用于循环外的标量，但我们可能想要更多的免费向量 regs？

2022-02-28