From 20ca271fdd651f12793f6e2580b8d19f627d80f7 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 9 May 2018 15:16:25 +0000
Subject: [PATCH] amd64: depessimize bcmp for small buffers

Adapt assembly generated by clang for memcmp and use it for <= 64 sized
compares (which are the vast majority).

Sample result of doing stats on Broadwell (% of samples):

before: 4.0 kernel bcmp cache_lookup
after : 0.7 kernel bcmp cache_lookup

The routine is most definitely still not optimal. Anyone interested in
spending time improving it is welcome to take over.

Reviewed by:	kib
---
 sys/amd64/amd64/support.S | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index 79b9db3d6283..7c1b52cdd0de 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -98,17 +98,40 @@ END(sse2_pagezero)
 ENTRY(bcmp)
 	PUSH_FRAME_POINTER
+	test	%rdx,%rdx
+	je	1f
+	cmpq	$64,%rdx
+	jg	4f
+
+	xor	%ecx,%ecx
+2:
+	movzbl	(%rdi,%rcx,1),%eax
+	movzbl	(%rsi,%rcx,1),%r8d
+	cmp	%r8b,%al
+	jne	3f
+	add	$0x1,%rcx
+	cmp	%rcx,%rdx
+	jne	2b
+1:
+	xor	%eax,%eax
+	POP_FRAME_POINTER
+	retq
+3:
+	mov	$1,%eax
+	POP_FRAME_POINTER
+	retq
+4:
 	movq	%rdx,%rcx
 	shrq	$3,%rcx
 	repe cmpsq
-	jne	1f
+	jne	5f
 
 	movq	%rdx,%rcx
 	andq	$7,%rcx
 	repe cmpsb
-1:
+5:
 	setne	%al
 	movsbl	%al,%eax
 	POP_FRAME_POINTER