eal/x86: revert select optimized memcpy at run-time

Revert the run-time linking support patchset, which consists of the following 3 commits: Fixes: 84cc318424 ("eal/x86: select optimized memcpy at run-time") Fixes: c7fbc80fe6 ("test: select memcpy alignment unit at run-time") Fixes: 5f180ae329 ("efd: move AVX2 lookup in its own compilation unit") The patchset causes a performance drop in the vhost/virtio loopback performance test, because run-time dispatch costs at least one extra function call compared to compile-time dispatch, and the reference CPU cycle count is small. In the test, 128-256 byte packets show a 16%-20% performance drop with the mergeable path, and 256-byte packets show a 13% performance drop with the vector path. Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
2017-11-03 20:47:23 +08:00 · 2017-11-03 20:47:23 +08:00 · d35cc1fe6a
commit d35cc1fe6a
parent e3a64deae2
13 changed files with 905 additions and 1283 deletions
--- a/lib/librte_eal/bsdapp/eal/Makefile
+++ b/lib/librte_eal/bsdapp/eal/Makefile
@ -91,24 +91,6 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_cpuflags.c
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
 SRCS-y += rte_cycles.c
 # for run-time dispatch of memcpy
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy.c
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_sse.c
 # if the compiler supports AVX512, add avx512 file
 ifneq ($(findstring CC_SUPPORT_AVX512F,$(MACHINE_CFLAGS)),)
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx512f.c
 CFLAGS_rte_memcpy_avx512f.o += -mavx512f
 CFLAGS_rte_memcpy_avx512f.o += -DRTE_MACHINE_CPUFLAG_AVX512F
 endif
 # if the compiler supports AVX2, add avx2 file
 ifneq ($(findstring CC_SUPPORT_AVX2,$(MACHINE_CFLAGS)),)
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx2.c
 CFLAGS_rte_memcpy_avx2.o += -mavx2
 CFLAGS_rte_memcpy_avx2.o += -DRTE_MACHINE_CPUFLAG_AVX2
 endif
 CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
 CFLAGS_eal.o := -D_GNU_SOURCE
--- a/lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy_avx2.c
@ -1,44 +0,0 @@
 /*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <rte_memcpy.h>
 #ifndef RTE_MACHINE_CPUFLAG_AVX2
 #error RTE_MACHINE_CPUFLAG_AVX2 not defined
 #endif
 /**
 * AVX2 entry point: a thin wrapper that hands the copy to
 * rte_memcpy_internal(). This file refuses to build unless
 * RTE_MACHINE_CPUFLAG_AVX2 is defined.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Whatever rte_memcpy_internal() returns for the same arguments.
 */
 void *
 rte_memcpy_avx2(void *dst, const void *src, size_t n)
 {
 	void *copied = rte_memcpy_internal(dst, src, n);
 	return copied;
 }
--- a/lib/librte_eal/common/arch/x86/rte_memcpy_avx512f.c
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy_avx512f.c
@ -1,44 +0,0 @@
 /*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <rte_memcpy.h>
 #ifndef RTE_MACHINE_CPUFLAG_AVX512F
 #error RTE_MACHINE_CPUFLAG_AVX512F not defined
 #endif
 /**
 * AVX512F entry point: a thin wrapper that hands the copy to
 * rte_memcpy_internal(). This file refuses to build unless
 * RTE_MACHINE_CPUFLAG_AVX512F is defined.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Whatever rte_memcpy_internal() returns for the same arguments.
 */
 void *
 rte_memcpy_avx512f(void *dst, const void *src, size_t n)
 {
 	void *copied = rte_memcpy_internal(dst, src, n);
 	return copied;
 }
--- a/lib/librte_eal/common/arch/x86/rte_memcpy_sse.c
+++ b/lib/librte_eal/common/arch/x86/rte_memcpy_sse.c
@ -1,40 +0,0 @@
 /*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <rte_memcpy.h>
 /**
 * SSE entry point: a thin wrapper that hands the copy to
 * rte_memcpy_internal().
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Whatever rte_memcpy_internal() returns for the same arguments.
 */
 void *
 rte_memcpy_sse(void *dst, const void *src, size_t n)
 {
 	void *copied = rte_memcpy_internal(dst, src, n);
 	return copied;
 }
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@ -1,7 +1,7 @@
 /*-
 *   BSD LICENSE
 *
- *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
@ -34,36 +34,867 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_
-#include <rte_memcpy_internal.h>
+/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
-#define RTE_X86_MEMCPY_THRESH 128
+/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
-extern void *
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
-(*rte_memcpy_ptr)(void *dst, const void *src, size_t n);
+
 #define ALIGNMENT_MASK 0x3F
 /**
- * Different implementations of memcpy.
+ * AVX512 implementation below
 */
 extern void*
 rte_memcpy_avx512f(void *dst, const void *src, size_t n);
-extern void *
+/**
-rte_memcpy_avx2(void *dst, const void *src, size_t n);
+ * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
-extern void *
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
-rte_memcpy_sse(void *dst, const void *src, size_t n);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
 }
 /**
 * Move one 32-byte chunk from src to dst using a single unaligned
 * 256-bit load followed by a single unaligned 256-bit store,
 * locations should not overlap.
 */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
 	const __m256i chunk = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, chunk);
 }
 /**
 * Move one 64-byte chunk from src to dst using a single unaligned
 * 512-bit load followed by a single unaligned 512-bit store,
 * locations should not overlap.
 */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
 	const __m512i chunk = _mm512_loadu_si512(src);
 	_mm512_storeu_si512(dst, chunk);
 }
 /**
 * Move 128 bytes from src to dst as two consecutive 64-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov64(dst, src);
 	rte_mov64(dst + 64, src + 64);
 }
 /**
 * Move 256 bytes from src to dst as four consecutive 64-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov64(dst, src);
 	rte_mov64(dst + 64, src + 64);
 	rte_mov64(dst + 128, src + 128);
 	rte_mov64(dst + 192, src + 192);
 }
 /**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 *
 * Only whole 128-byte blocks are copied; the loop stops as soon as fewer
 * than 128 bytes remain, so a remainder of (n modulo 128) bytes is left
 * for the caller. dst, src and n are passed by value, so the caller's
 * own pointers and length are not advanced.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes available to copy.
 */
 static inline void
 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1;
 	while (n >= 128) {
 		/* Both 64-byte loads are issued before either store. */
 		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 		n -= 128;
 		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 		src = src + 128;
 		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 		dst = dst + 128;
 	}
 }
 /**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 *
 * Only whole 512-byte blocks are copied; the loop stops as soon as fewer
 * than 512 bytes remain, so a remainder of (n modulo 512) bytes is left
 * for the caller. dst, src and n are passed by value, so the caller's
 * own pointers and length are not advanced.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes available to copy.
 */
 static inline void
 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 	while (n >= 512) {
 		/* All eight 64-byte loads are issued before the first store. */
 		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 		n -= 512;
 		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 		src = src + 512;
 		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 		dst = dst + 512;
 	}
 }
 /**
 * Copy n bytes from src to dst (AVX512 build),
 * locations should not overlap.
 *
 * The routine picks a strategy from the size:
 * - n < 16: up to four scalar copies selected by the low bits of n;
 * - 16 <= n <= 64: two fixed-size moves, one anchored at the start and
 *   one anchored at the end, which may overlap each other;
 * - 64 < n <= 512: optional 256- and 128-byte moves, then the shared tail;
 * - n > 512: an initial 64-byte move that brings dst to a 64-byte
 *   boundary, then 512-byte blocks, 128-byte blocks, and the shared tail.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   The dst pointer as it was passed in.
 */
 static inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits; /* holds a byte count despite the name */
 	/**
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
 		/*
 		 * Each set bit of n contributes one copy of that width.
 		 * NOTE(review): the 2/4/8-byte accesses go through casted
 		 * pointers and no alignment check is visible here.
 		 */
 		if (n & 0x01) {
 			*(uint8_t *)dstu = *(const uint8_t *)srcu;
 			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dstu = *(const uint16_t *)srcu;
 			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dstu = *(const uint32_t *)srcu;
 			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08)
 			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		return ret;
 	}
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	if (n <= 32) {
 		/* First 16 bytes plus last 16 bytes; they overlap when n < 32. */
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		/* First 32 bytes plus last 32 bytes; they overlap when n < 64. */
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
 				  (const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
 			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 		/* Shared tail; the large-copy path below also jumps here. */
 COPY_BLOCK_128_BACK63:
 		if (n > 64) {
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov64((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 			return ret;
 		}
 		/*
 		 * 1..64 bytes left: copy the 64 bytes that end at dst + n.
 		 * This starts before the current dst whenever n < 64; every
 		 * path reaching this point began with more than 64 bytes.
 		 */
 		if (n > 0)
 			rte_mov64((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 		return ret;
 	}
 	/**
 	 * Make store aligned when copy size exceeds 512 bytes
 	 */
 	dstofss = ((uintptr_t)dst & 0x3F);
 	if (dstofss > 0) {
 		/*
 		 * Copy a full 64 bytes but advance only up to the next
 		 * 64-byte boundary of dst; later stores overlap this one.
 		 */
 		dstofss = 64 - dstofss;
 		n -= dstofss;
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
 	/**
 	 * Copy 512-byte blocks.
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
 	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	/* Advance past the whole blocks just copied; keep the remainder in n. */
 	bits = n;
 	n = n & 511;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 	/**
 	 * Copy 128-byte blocks.
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
 	if (n >= 128) {
 		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 		bits = n;
 		n = n & 127;
 		bits -= n;
 		src = (const uint8_t *)src + bits;
 		dst = (uint8_t *)dst + bits;
 	}
 	/**
 	 * Copy whatever left
 	 */
 	goto COPY_BLOCK_128_BACK63;
 }
 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 #define ALIGNMENT_MASK 0x1F
 /**
 * AVX2 implementation below
 */
 /**
 * Move one 16-byte chunk from src to dst using a single unaligned
 * 128-bit load followed by a single unaligned 128-bit store,
 * locations should not overlap.
 */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	const __m128i chunk = _mm_loadu_si128((const __m128i *)src);
 	_mm_storeu_si128((__m128i *)dst, chunk);
 }
 /**
 * Move one 32-byte chunk from src to dst using a single unaligned
 * 256-bit load followed by a single unaligned 256-bit store,
 * locations should not overlap.
 */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
 	const __m256i chunk = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, chunk);
 }
 /**
 * Move 64 bytes from src to dst as two consecutive 32-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov32(dst, src);
 	rte_mov32(dst + 32, src + 32);
 }
 /**
 * Move 128 bytes from src to dst as four consecutive 32-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov32(dst, src);
 	rte_mov32(dst + 32, src + 32);
 	rte_mov32(dst + 64, src + 64);
 	rte_mov32(dst + 96, src + 96);
 }
 /**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 *
 * Only whole 128-byte blocks are copied; the loop stops as soon as fewer
 * than 128 bytes remain, so a remainder of (n modulo 128) bytes is left
 * for the caller. dst, src and n are passed by value, so the caller's
 * own pointers and length are not advanced.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes available to copy.
 */
 static inline void
 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 	while (n >= 128) {
 		/* All four 32-byte loads are issued before the first store. */
 		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 		n -= 128;
 		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 		src = (const uint8_t *)src + 128;
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 		dst = (uint8_t *)dst + 128;
 	}
 }
 /**
 * Copy n bytes from src to dst (AVX2 build),
 * locations should not overlap.
 *
 * The routine picks a strategy from the size:
 * - n < 16: up to four scalar copies selected by the low bits of n;
 * - 16 <= n <= 64: fixed-size moves anchored at the start and at the
 *   end, which may overlap each other;
 * - 64 < n <= 256: an optional 128-byte move, then the shared tail;
 * - n > 256: an initial 32-byte move that brings dst to a 32-byte
 *   boundary, then 128-byte blocks, and the shared tail.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   The dst pointer as it was passed in.
 */
 static inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits; /* holds a byte count despite the name */
 	/**
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
 		/*
 		 * Each set bit of n contributes one copy of that width.
 		 * NOTE(review): the 2/4/8-byte accesses go through casted
 		 * pointers and no alignment check is visible here.
 		 */
 		if (n & 0x01) {
 			*(uint8_t *)dstu = *(const uint8_t *)srcu;
 			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dstu = *(const uint16_t *)srcu;
 			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dstu = *(const uint32_t *)srcu;
 			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08) {
 			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		}
 		return ret;
 	}
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	if (n <= 32) {
 		/* First 16 bytes plus last 16 bytes; they overlap when n < 32. */
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
 		/* First 32 bytes as two 16-byte moves, then the last 16 bytes. */
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		/* First 32 bytes plus last 32 bytes; they overlap when n < 64. */
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 		/* Shared tail; the large-copy path below also jumps here. */
 COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 		if (n > 32) {
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov32((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		/*
 		 * 1..32 bytes left: copy the 32 bytes that end at dst + n.
 		 * This starts before the current dst whenever n < 32; every
 		 * path reaching this point began with more than 64 bytes.
 		 */
 		if (n > 0) {
 			rte_mov32((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
 	}
 	/**
 	 * Make store aligned when copy size exceeds 256 bytes
 	 */
 	dstofss = (uintptr_t)dst & 0x1F;
 	if (dstofss > 0) {
 		/*
 		 * Copy a full 32 bytes but advance only up to the next
 		 * 32-byte boundary of dst; later stores overlap this one.
 		 */
 		dstofss = 32 - dstofss;
 		n -= dstofss;
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
 	/**
 	 * Copy 128-byte blocks
 	 */
 	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	/* Advance past the whole blocks just copied; keep the remainder in n. */
 	bits = n;
 	n = n & 127;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 	/**
 	 * Copy whatever left
 	 */
 	goto COPY_BLOCK_128_BACK31;
 }
 #else /* RTE_MACHINE_CPUFLAG */
 #define ALIGNMENT_MASK 0x0F
 /**
 * SSE & AVX implementation below
 */
 /**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 *
 * Uses one unaligned 128-bit load and one unaligned 128-bit store, so
 * neither pointer has to be 16-byte aligned.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm0;
 	/* One cast to the vector pointer type is all the intrinsic needs. */
 	xmm0 = _mm_loadu_si128((const __m128i *)src);
 	_mm_storeu_si128((__m128i *)dst, xmm0);
 }
 /**
 * Move 32 bytes from src to dst as two consecutive 16-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov16(dst, src);
 	rte_mov16(dst + 16, src + 16);
 }
 /**
 * Move 64 bytes from src to dst as four consecutive 16-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov16(dst, src);
 	rte_mov16(dst + 16, src + 16);
 	rte_mov16(dst + 32, src + 32);
 	rte_mov16(dst + 48, src + 48);
 }
 /**
 * Move 128 bytes from src to dst as eight consecutive 16-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov16(dst, src);
 	rte_mov16(dst + 16, src + 16);
 	rte_mov16(dst + 32, src + 32);
 	rte_mov16(dst + 48, src + 48);
 	rte_mov16(dst + 64, src + 64);
 	rte_mov16(dst + 80, src + 80);
 	rte_mov16(dst + 96, src + 96);
 	rte_mov16(dst + 112, src + 112);
 }
 /**
 * Move 256 bytes from src to dst as sixteen consecutive 16-byte moves,
 * locations should not overlap.
 */
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov16(dst, src);
 	rte_mov16(dst + 16, src + 16);
 	rte_mov16(dst + 32, src + 32);
 	rte_mov16(dst + 48, src + 48);
 	rte_mov16(dst + 64, src + 64);
 	rte_mov16(dst + 80, src + 80);
 	rte_mov16(dst + 96, src + 96);
 	rte_mov16(dst + 112, src + 112);
 	rte_mov16(dst + 128, src + 128);
 	rte_mov16(dst + 144, src + 144);
 	rte_mov16(dst + 160, src + 160);
 	rte_mov16(dst + 176, src + 176);
 	rte_mov16(dst + 192, src + 192);
 	rte_mov16(dst + 208, src + 208);
 	rte_mov16(dst + 224, src + 224);
 	rte_mov16(dst + 240, src + 240);
 }
 /**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be immediate value within [1, 15]
 * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
 __extension__ ({                                                                                            \
    int tmp;                                                                                                \
    while (len >= 128 + 16 - offset) {                                                                      \
        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
        len -= 128;                                                                                         \
        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
        xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
        xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
        xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
        xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
        xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
        xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
        src = (const uint8_t *)src + 128;                                                                   \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
        dst = (uint8_t *)dst + 128;                                                                         \
    }                                                                                                       \
    tmp = len;                                                                                              \
    len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
    tmp -= len;                                                                                             \
    src = (const uint8_t *)src + tmp;                                                                       \
    dst = (uint8_t *)dst + tmp;                                                                             \
    if (len >= 32 + 16 - offset) {                                                                          \
        while (len >= 32 + 16 - offset) {                                                                   \
            xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
            len -= 32;                                                                                      \
            xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
            xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
            src = (const uint8_t *)src + 32;                                                                \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
            dst = (uint8_t *)dst + 32;                                                                      \
        }                                                                                                   \
        tmp = len;                                                                                          \
        len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
        tmp -= len;                                                                                         \
        src = (const uint8_t *)src + tmp;                                                                   \
        dst = (uint8_t *)dst + tmp;                                                                         \
    }                                                                                                       \
 })
 /**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires immediate value for shift count.
 *
 * Each case forwards <dst>, <src> and <len> to MOVEUNALIGNED_LEFT47_IMM with
 * the matching literal offset; an <offset> outside [1, 15] does nothing.
 * The <len> argument itself is forwarded, so the caller's length variable
 * may have any name.
 *
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 __extension__ ({                                                      \
    switch (offset) {                                                 \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break;  \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break;  \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break;  \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break;  \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break;  \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break;  \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break;  \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break;  \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break;  \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break;  \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break;  \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break;  \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break;  \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break;  \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break;  \
    default:;                                                         \
    }                                                                 \
 })
 /**
  * Copy n bytes from src to dst with 16-byte (__m128i) moves, without
  * requiring either pointer to be aligned.
  *
  * Sizes below 16 use scalar stores, sizes up to 512 use fixed-size block
  * moves, and larger sizes first make the store address 16-byte aligned.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes to copy.
  * @return
  *   The dst pointer as it was passed in.
  */
 static inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	/* xmm0..xmm8 are the scratch registers MOVEUNALIGNED_LEFT47 expects */
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 	/**
 	 * Copy less than 16 bytes
 	 */
 	/* Each set bit of n selects one 1/2/4/8-byte store; together they cover n bytes */
 	if (n < 16) {
 		if (n & 0x01) {
 			*(uint8_t *)dstu = *(const uint8_t *)srcu;
 			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dstu = *(const uint16_t *)srcu;
 			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dstu = *(const uint32_t *)srcu;
 			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08) {
 			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		}
 		return ret;
 	}
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	/* 16..32 bytes: one move at the start and one ending at byte n (they may overlap) */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	/* 65..128 bytes: enter the block ladder below at its 64-byte step */
 	if (n <= 128) {
 		goto COPY_BLOCK_128_BACK15;
 	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		/* Each label is also a goto target for the paths further down */
 COPY_BLOCK_255_BACK15:
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK15:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 COPY_BLOCK_64_BACK15:
 		if (n >= 32) {
 			n -= 32;
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 32;
 			dst = (uint8_t *)dst + 32;
 		}
 		if (n > 16) {
 			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 			return ret;
 		}
 		/* 1..16 bytes left: this move starts up to 15 bytes before the current position */
 		if (n > 0) {
 			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		}
 		return ret;
 	}
 	/**
 	 * Make store aligned when copy size exceeds 512 bytes,
 	 * and make sure the first 15 bytes are copied, because
 	 * unaligned copy functions require up to 15 bytes
 	 * backwards access.
 	 */
 	dstofss = (uintptr_t)dst & 0x0F;
 	if (dstofss > 0) {
 		/* Skip (16 - dstofss) + 16 bytes; the 32-byte move below covers all of them */
 		dstofss = 16 - dstofss + 16;
 		n -= dstofss;
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
 	/* Misalignment of the (possibly advanced) source relative to 16 bytes */
 	srcofs = ((uintptr_t)src & 0x0F);
 	/**
 	 * For aligned copy
 	 */
 	if (srcofs == 0) {
 		/**
 		 * Copy 256-byte blocks
 		 */
 		for (; n >= 256; n -= 256) {
 			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 			dst = (uint8_t *)dst + 256;
 			src = (const uint8_t *)src + 256;
 		}
 		/**
 		 * Copy whatever left
 		 */
 		goto COPY_BLOCK_255_BACK15;
 	}
 	/**
 	 * For copy with unaligned load
 	 */
 	/* The macro advances dst/src and reduces n in place */
 	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 	/**
 	 * Copy whatever left
 	 */
 	goto COPY_BLOCK_64_BACK15;
 }
 #endif /* RTE_MACHINE_CPUFLAG */
 /**
  * Copy n bytes from src to dst; this is the routine rte_memcpy() picks
  * when no ALIGNMENT_MASK bit is set in either pointer.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes to copy.
  * @return
  *   The dst pointer as it was passed in.
  */
 static inline void *
 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 {
 	uint8_t *out = (uint8_t *)dst;
 	const uint8_t *in = (const uint8_t *)src;
 	/* Fewer than 16 bytes: one scalar store per set bit of n */
 	if (n < 16) {
 		if (n & 0x01) {
 			*out = *in;
 			in += sizeof(uint8_t);
 			out += sizeof(uint8_t);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)out = *(const uint16_t *)in;
 			in += sizeof(uint16_t);
 			out += sizeof(uint16_t);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)out = *(const uint32_t *)in;
 			in += sizeof(uint32_t);
 			out += sizeof(uint32_t);
 		}
 		if (n & 0x08)
 			*(uint64_t *)out = *(const uint64_t *)in;
 		return dst;
 	}
 	/* 16..32 bytes: head and tail 16-byte moves, overlapping when n < 32 */
 	if (n <= 32) {
 		rte_mov16(out, in);
 		rte_mov16(out + n - 16, in + n - 16);
 		return dst;
 	}
 	/* 33..64 bytes: head and tail 32-byte moves */
 	if (n <= 64) {
 		rte_mov32(out, in);
 		rte_mov32(out + n - 32, in + n - 32);
 		return dst;
 	}
 	/* More than 64 bytes: whole 64-byte blocks first */
 	while (n >= 64) {
 		rte_mov64(out, in);
 		out += 64;
 		in += 64;
 		n -= 64;
 	}
 	/* Final 64-byte move ending exactly at the last byte to copy */
 	rte_mov64(out + n - 64, in + n - 64);
 	return dst;
 }
 /*
  * Entry point for copying n bytes from src to dst.
  * With this change the routine is chosen from the pointer alignment:
  * when no ALIGNMENT_MASK bit is set in either dst or src the copy goes to
  * rte_memcpy_aligned(), otherwise to rte_memcpy_generic(). The removed
  * lines dispatched through the rte_memcpy_ptr function pointer for sizes
  * above RTE_X86_MEMCPY_THRESH.
  */
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
-	if (n <= RTE_X86_MEMCPY_THRESH)
+	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
-		return rte_memcpy_internal(dst, src, n);
+		return rte_memcpy_aligned(dst, src, n);
 	else
-		return (*rte_memcpy_ptr)(dst, src, n);
+		return rte_memcpy_generic(dst, src, n);
 }
 #ifdef __cplusplus
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h
@ -1,966 +0,0 @@
 /*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef _RTE_MEMCPY_INTERNAL_X86_64_H_
 #define _RTE_MEMCPY_INTERNAL_X86_64_H_
 /**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
 /* Low address bits (64-byte granularity) that must be zero in both dst and src for rte_memcpy_internal() to use rte_memcpy_aligned() */
 #define ALIGNMENT_MASK 0x3F
 /**
 * AVX512 implementation below
 */
 /**
  * Move one 16-byte chunk from src to dst using unaligned
  * 128-bit load/store; the two locations should not overlap.
  */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	const __m128i *from = (const __m128i *)src;
 	__m128i *to = (__m128i *)dst;
 	_mm_storeu_si128(to, _mm_loadu_si128(from));
 }
 /**
  * Move one 32-byte chunk from src to dst using unaligned
  * 256-bit load/store; the two locations should not overlap.
  */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
 	const __m256i chunk = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, chunk);
 }
 /**
  * Move one 64-byte chunk from src to dst using unaligned
  * 512-bit load/store; the two locations should not overlap.
  */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
 	const __m512i chunk = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, chunk);
 }
 /**
  * Move 128 bytes from src to dst as two consecutive 64-byte moves;
  * the two locations should not overlap.
  */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov64(dst, src);
 	rte_mov64(dst + 64, src + 64);
 }
 /**
  * Move 256 bytes from src to dst, lower 128-byte half first
  * (four 64-byte moves in ascending address order);
  * the two locations should not overlap.
  */
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov128(dst, src);
 	rte_mov128(dst + 128, src + 128);
 }
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  *
  * Copies as many whole 128-byte blocks as fit in n; the remaining
  * n % 128 bytes are not touched and are left to the caller.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes available; only whole 128-byte blocks are copied.
  */
 static inline void
 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1;
 	while (n >= 128) {
 		/* Both loads of a block are issued before either store */
 		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 		n -= 128;
 		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 		src = src + 128;
 		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 		dst = dst + 128;
 	}
 }
 /**
  * Copy 512-byte blocks from one location to another,
  * locations should not overlap.
  *
  * Copies as many whole 512-byte blocks as fit in n; the remaining
  * n % 512 bytes are not touched and are left to the caller.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes available; only whole 512-byte blocks are copied.
  */
 static inline void
 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 	while (n >= 512) {
 		/* All eight loads of a block are issued before any store */
 		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 		n -= 512;
 		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 		src = src + 512;
 		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 		dst = dst + 512;
 	}
 }
 /**
  * Copy n bytes from src to dst using up to 64-byte moves, without
  * requiring either pointer to be aligned.
  *
  * Sizes below 16 use scalar stores, sizes up to 512 use fixed-size block
  * moves, and larger sizes first make the store address 64-byte aligned
  * and then copy in 512-byte and 128-byte blocks.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes to copy.
  * @return
  *   The dst pointer as it was passed in.
  */
 static inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 	/**
 	 * Copy less than 16 bytes
 	 */
 	/* Each set bit of n selects one 1/2/4/8-byte store; together they cover n bytes */
 	if (n < 16) {
 		if (n & 0x01) {
 			*(uint8_t *)dstu = *(const uint8_t *)srcu;
 			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dstu = *(const uint16_t *)srcu;
 			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dstu = *(const uint32_t *)srcu;
 			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08)
 			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		return ret;
 	}
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	/* 16..32 bytes: one move at the start and one ending at byte n (they may overlap) */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
 				  (const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
 			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 		/* Also the goto target for the large-copy path; n is below 128 here */
 COPY_BLOCK_128_BACK63:
 		if (n > 64) {
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov64((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 			return ret;
 		}
 		/* 1..64 bytes left: this move starts up to 63 bytes before the current position */
 		if (n > 0)
 			rte_mov64((uint8_t *)dst - 64 + n,
 					  (const uint8_t *)src - 64 + n);
 		return ret;
 	}
 	/**
 	 * Make store aligned when copy size exceeds 512 bytes
 	 */
 	dstofss = ((uintptr_t)dst & 0x3F);
 	if (dstofss > 0) {
 		/* Skip the bytes up to the next 64-byte boundary; the 64-byte move below covers them */
 		dstofss = 64 - dstofss;
 		n -= dstofss;
 		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
 	/**
 	 * Copy 512-byte blocks.
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
 	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	/* bits = bytes handled by the block copy (n rounded down to a multiple of 512) */
 	bits = n;
 	n = n & 511;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 	/**
 	 * Copy 128-byte blocks.
 	 * Use copy block function for better instruction order control,
 	 * which is important when load is unaligned.
 	 */
 	if (n >= 128) {
 		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 		/* Same bookkeeping as above, now for multiples of 128 */
 		bits = n;
 		n = n & 127;
 		bits -= n;
 		src = (const uint8_t *)src + bits;
 		dst = (uint8_t *)dst + bits;
 	}
 	/**
 	 * Copy whatever left
 	 */
 	goto COPY_BLOCK_128_BACK63;
 }
 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 /* Low address bits (32-byte granularity) that must be zero in both dst and src for rte_memcpy_internal() to use rte_memcpy_aligned() */
 #define ALIGNMENT_MASK 0x1F
 /**
 * AVX2 implementation below
 */
 /**
  * Transfer 16 bytes from src to dst through a single unaligned
  * 128-bit load followed by an unaligned store;
  * locations should not overlap.
  */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	const __m128i block = _mm_loadu_si128((const __m128i *)src);
 	_mm_storeu_si128((__m128i *)dst, block);
 }
 /**
  * Transfer 32 bytes from src to dst through a single unaligned
  * 256-bit load followed by an unaligned store;
  * locations should not overlap.
  */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
 	const __m256i *from = (const __m256i *)src;
 	__m256i *to = (__m256i *)dst;
 	_mm256_storeu_si256(to, _mm256_loadu_si256(from));
 }
 /**
  * Move 64 bytes from src to dst as two consecutive 32-byte moves;
  * locations should not overlap.
  */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov32(dst, src);
 	rte_mov32(dst + 32, src + 32);
 }
 /**
  * Move 128 bytes from src to dst, lower 64-byte half first
  * (four 32-byte moves in ascending address order);
  * locations should not overlap.
  */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov64(dst, src);
 	rte_mov64(dst + 64, src + 64);
 }
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
  *
  * Copies as many whole 128-byte blocks as fit in n; the remaining
  * n % 128 bytes are not touched and are left to the caller.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes available; only whole 128-byte blocks are copied.
  */
 static inline void
 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 {
 	__m256i ymm0, ymm1, ymm2, ymm3;
 	while (n >= 128) {
 		/* All four loads of a block are issued before any store */
 		ymm0 = _mm256_loadu_si256((const __m256i *)
 				((const uint8_t *)src + 0 * 32));
 		n -= 128;
 		ymm1 = _mm256_loadu_si256((const __m256i *)
 				((const uint8_t *)src + 1 * 32));
 		ymm2 = _mm256_loadu_si256((const __m256i *)
 				((const uint8_t *)src + 2 * 32));
 		ymm3 = _mm256_loadu_si256((const __m256i *)
 				((const uint8_t *)src + 3 * 32));
 		src = (const uint8_t *)src + 128;
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 		dst = (uint8_t *)dst + 128;
 	}
 }
 /**
  * Copy n bytes from src to dst using up to 32-byte moves, without
  * requiring either pointer to be aligned.
  *
  * Sizes below 16 use scalar stores, sizes up to 256 use fixed-size block
  * moves, and larger sizes first make the store address 32-byte aligned
  * and then copy in 128-byte blocks.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes to copy.
  * @return
  *   The dst pointer as it was passed in.
  */
 static inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
 	/**
 	 * Copy less than 16 bytes
 	 */
 	/* Each set bit of n selects one 1/2/4/8-byte store; together they cover n bytes */
 	if (n < 16) {
 		if (n & 0x01) {
 			*(uint8_t *)dstu = *(const uint8_t *)srcu;
 			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dstu = *(const uint16_t *)srcu;
 			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dstu = *(const uint32_t *)srcu;
 			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08)
 			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		return ret;
 	}
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
 	/* 16..32 bytes: one move at the start and one ending at byte n (they may overlap) */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
 				(const uint8_t *)src - 32 + n);
 		return ret;
 	}
 	if (n <= 256) {
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 		/* Also the goto target for the large-copy path; n is below 128 there */
 COPY_BLOCK_128_BACK31:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 		if (n > 32) {
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov32((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 			return ret;
 		}
 		/* 1..32 bytes left: this move starts up to 31 bytes before the current position */
 		if (n > 0) {
 			rte_mov32((uint8_t *)dst - 32 + n,
 					(const uint8_t *)src - 32 + n);
 		}
 		return ret;
 	}
 	/**
 	 * Make store aligned when copy size exceeds 256 bytes
 	 */
 	dstofss = (uintptr_t)dst & 0x1F;
 	if (dstofss > 0) {
 		/* Skip the bytes up to the next 32-byte boundary; the 32-byte move below covers them */
 		dstofss = 32 - dstofss;
 		n -= dstofss;
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
 	/**
 	 * Copy 128-byte blocks
 	 */
 	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 	/* bits = bytes handled by the block copy (n rounded down to a multiple of 128) */
 	bits = n;
 	n = n & 127;
 	bits -= n;
 	src = (const uint8_t *)src + bits;
 	dst = (uint8_t *)dst + bits;
 	/**
 	 * Copy whatever left
 	 */
 	goto COPY_BLOCK_128_BACK31;
 }
 #else /* RTE_MACHINE_CPUFLAG */
 /* Low address bits (16-byte granularity) that must be zero in both dst and src for rte_memcpy_internal() to use rte_memcpy_aligned() */
 #define ALIGNMENT_MASK 0x0F
 /**
 * SSE & AVX implementation below
 */
 /**
  * Copy a single 16-byte chunk with one unaligned 128-bit load
  * and one unaligned 128-bit store;
  * locations should not overlap.
  */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
 	__m128i *store_at = (__m128i *)dst;
 	const __m128i *load_at = (const __m128i *)src;
 	_mm_storeu_si128(store_at, _mm_loadu_si128(load_at));
 }
 /**
  * Move 32 bytes from src to dst as two consecutive 16-byte moves;
  * locations should not overlap.
  */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov16(dst, src);
 	rte_mov16(dst + 16, src + 16);
 }
 /**
  * Move 64 bytes from src to dst, lower 32-byte half first
  * (four 16-byte moves in ascending address order);
  * locations should not overlap.
  */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov32(dst, src);
 	rte_mov32(dst + 32, src + 32);
 }
 /**
  * Move 128 bytes from src to dst, lower 64-byte half first
  * (eight 16-byte moves in ascending address order);
  * locations should not overlap.
  */
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov64(dst, src);
 	rte_mov64(dst + 64, src + 64);
 }
 /**
  * Move 256 bytes from src to dst, lower 128-byte half first
  * (sixteen 16-byte moves in ascending address order);
  * locations should not overlap.
  */
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
 	rte_mov128(dst, src);
 	rte_mov128(dst + 128, src + 128);
 }
 /**
  * Macro for copying unaligned block from one location to another with constant
  * load offset, 47 bytes leftover maximum,
  * locations should not overlap.
  *
  * On exit <dst> and <src> have been advanced past the copied bytes and <len>
  * holds the number of bytes still to be copied by the caller.
  *
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be immediate value within [1, 15]
  * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes
  *   forwards are available for loading
  * - <dst>, <src>, <len> must be variables
  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
  */
 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)(		      \
 __extension__ ({							      \
 	/* size_t, like <len>: avoids narrowing the length into an int */    \
 	size_t tmp;							      \
 	/* 128 bytes per iteration: nine loads, eight realigned stores */    \
 	while (len >= 128 + 16 - offset) {				      \
 		xmm0 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 0 * 16));	      \
 		len -= 128;						      \
 		xmm1 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 1 * 16));	      \
 		xmm2 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 2 * 16));	      \
 		xmm3 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 3 * 16));	      \
 		xmm4 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 4 * 16));	      \
 		xmm5 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 5 * 16));	      \
 		xmm6 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 6 * 16));	      \
 		xmm7 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 7 * 16));	      \
 		xmm8 = _mm_loadu_si128((const __m128i *)		      \
 			((const uint8_t *)src - offset + 8 * 16));	      \
 		src = (const uint8_t *)src + 128;			      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16),        \
 			_mm_alignr_epi8(xmm1, xmm0, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16),        \
 			_mm_alignr_epi8(xmm2, xmm1, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16),        \
 			_mm_alignr_epi8(xmm3, xmm2, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16),        \
 			_mm_alignr_epi8(xmm4, xmm3, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16),        \
 			_mm_alignr_epi8(xmm5, xmm4, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16),        \
 			_mm_alignr_epi8(xmm6, xmm5, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16),        \
 			_mm_alignr_epi8(xmm7, xmm6, offset));		      \
 		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16),        \
 			_mm_alignr_epi8(xmm8, xmm7, offset));		      \
 		dst = (uint8_t *)dst + 128;				      \
 	}								      \
 	tmp = len;							      \
 	len = ((len - 16 + offset) & 127) + 16 - offset;		      \
 	tmp -= len;							      \
 	src = (const uint8_t *)src + tmp;				      \
 	dst = (uint8_t *)dst + tmp;					      \
 	/* 32 bytes per iteration: three loads, two realigned stores */      \
 	if (len >= 32 + 16 - offset) {					      \
 		while (len >= 32 + 16 - offset) {			      \
 			xmm0 = _mm_loadu_si128((const __m128i *)	      \
 				((const uint8_t *)src - offset + 0 * 16));    \
 			len -= 32;					      \
 			xmm1 = _mm_loadu_si128((const __m128i *)	      \
 				((const uint8_t *)src - offset + 1 * 16));    \
 			xmm2 = _mm_loadu_si128((const __m128i *)	      \
 				((const uint8_t *)src - offset + 2 * 16));    \
 			src = (const uint8_t *)src + 32;		      \
 			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16),\
 				_mm_alignr_epi8(xmm1, xmm0, offset));	      \
 			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16),\
 				_mm_alignr_epi8(xmm2, xmm1, offset));	      \
 			dst = (uint8_t *)dst + 32;			      \
 		}							      \
 		tmp = len;						      \
 		len = ((len - 16 + offset) & 31) + 16 - offset;		      \
 		tmp -= len;						      \
 		src = (const uint8_t *)src + tmp;			      \
 		dst = (uint8_t *)dst + tmp;				      \
 	}								      \
 }))
 /**
  * Macro for copying unaligned block from one location to another,
  * 47 bytes leftover maximum,
  * locations should not overlap.
  * Use switch here because the aligning instruction requires immediate value
  * for shift count.
  * <len> is forwarded to MOVEUNALIGNED_LEFT47_IMM exactly as passed, so the
  * caller's length variable may have any name. An <offset> outside [1, 15]
  * copies nothing.
  * Requirements:
  * - Store is aligned
  * - Load offset is <offset>, which must be within [1, 15]
  * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes
  *   forwards are available for loading
  * - <dst>, <src>, <len> must be variables
  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be
  *   pre-defined
  */
 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)(			    \
 __extension__ ({							    \
 	switch (offset) {						    \
 	case 0x01:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01);		    \
 		break;							    \
 	case 0x02:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02);		    \
 		break;							    \
 	case 0x03:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03);		    \
 		break;							    \
 	case 0x04:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04);		    \
 		break;							    \
 	case 0x05:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05);		    \
 		break;							    \
 	case 0x06:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06);		    \
 		break;							    \
 	case 0x07:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07);		    \
 		break;							    \
 	case 0x08:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08);		    \
 		break;							    \
 	case 0x09:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09);		    \
 		break;							    \
 	case 0x0A:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A);		    \
 		break;							    \
 	case 0x0B:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B);		    \
 		break;							    \
 	case 0x0C:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C);		    \
 		break;							    \
 	case 0x0D:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D);		    \
 		break;							    \
 	case 0x0E:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E);		    \
 		break;							    \
 	case 0x0F:							    \
 		MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F);		    \
 		break;							    \
 	default:							    \
 		break;							    \
 	}								    \
 }))
 /**
  * Copy n bytes from src to dst with 16-byte (__m128i) moves, without
  * requiring either pointer to be aligned.
  *
  * Sizes below 16 use scalar stores, sizes up to 512 use fixed-size block
  * moves, and larger sizes first make the store address 16-byte aligned.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes to copy.
  * @return
  *   The dst pointer as it was passed in.
  */
 static inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	/* xmm0..xmm8 are the scratch registers MOVEUNALIGNED_LEFT47 requires */
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 	uintptr_t dstu = (uintptr_t)dst;
 	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
 	/**
 	 * Copy less than 16 bytes
 	 */
 	/* Each set bit of n selects one 1/2/4/8-byte store; together they cover n bytes */
 	if (n < 16) {
 		if (n & 0x01) {
 			*(uint8_t *)dstu = *(const uint8_t *)srcu;
 			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
 			*(uint16_t *)dstu = *(const uint16_t *)srcu;
 			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
 			*(uint32_t *)dstu = *(const uint32_t *)srcu;
 			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08)
 			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		return ret;
 	}
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
 	/* 16..32 bytes: one move at the start and one ending at byte n (they may overlap) */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 48) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	/* 65..128 bytes: enter the block ladder below at its 64-byte step */
 	if (n <= 128)
 		goto COPY_BLOCK_128_BACK15;
 	if (n <= 512) {
 		if (n >= 256) {
 			n -= 256;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov128((uint8_t *)dst + 128,
 					(const uint8_t *)src + 128);
 			src = (const uint8_t *)src + 256;
 			dst = (uint8_t *)dst + 256;
 		}
 		/* Each label is also a goto target for the paths further down */
 COPY_BLOCK_255_BACK15:
 		if (n >= 128) {
 			n -= 128;
 			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 128;
 			dst = (uint8_t *)dst + 128;
 		}
 COPY_BLOCK_128_BACK15:
 		if (n >= 64) {
 			n -= 64;
 			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 64;
 			dst = (uint8_t *)dst + 64;
 		}
 COPY_BLOCK_64_BACK15:
 		if (n >= 32) {
 			n -= 32;
 			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 			src = (const uint8_t *)src + 32;
 			dst = (uint8_t *)dst + 32;
 		}
 		if (n > 16) {
 			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 			rte_mov16((uint8_t *)dst - 16 + n,
 					(const uint8_t *)src - 16 + n);
 			return ret;
 		}
 		/* 1..16 bytes left: this move starts up to 15 bytes before the current position */
 		if (n > 0) {
 			rte_mov16((uint8_t *)dst - 16 + n,
 					(const uint8_t *)src - 16 + n);
 		}
 		return ret;
 	}
 	/**
 	 * Make store aligned when copy size exceeds 512 bytes,
 	 * and make sure the first 15 bytes are copied, because
 	 * unaligned copy functions require up to 15 bytes
 	 * backwards access.
 	 */
 	dstofss = (uintptr_t)dst & 0x0F;
 	if (dstofss > 0) {
 		/* Skip (16 - dstofss) + 16 bytes; the 32-byte move below covers all of them */
 		dstofss = 16 - dstofss + 16;
 		n -= dstofss;
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		src = (const uint8_t *)src + dstofss;
 		dst = (uint8_t *)dst + dstofss;
 	}
 	/* Misalignment of the (possibly advanced) source relative to 16 bytes */
 	srcofs = ((uintptr_t)src & 0x0F);
 	/**
 	 * For aligned copy
 	 */
 	if (srcofs == 0) {
 		/**
 		 * Copy 256-byte blocks
 		 */
 		for (; n >= 256; n -= 256) {
 			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 			dst = (uint8_t *)dst + 256;
 			src = (const uint8_t *)src + 256;
 		}
 		/**
 		 * Copy whatever left
 		 */
 		goto COPY_BLOCK_255_BACK15;
 	}
 	/**
 	 * For copy with unaligned load
 	 */
 	/* The macro advances dst/src and reduces n in place, leaving at most 47 bytes */
 	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 	/**
 	 * Copy whatever left
 	 */
 	goto COPY_BLOCK_64_BACK15;
 }
 #endif /* RTE_MACHINE_CPUFLAG */
 /**
  * Copy n bytes from src to dst; rte_memcpy_internal() selects this
  * routine when no ALIGNMENT_MASK bit is set in either pointer.
  *
  * @param dst
  *   Pointer to the destination of the data.
  * @param src
  *   Pointer to the source data.
  * @param n
  *   Number of bytes to copy.
  * @return
  *   The dst pointer as it was passed in.
  */
 static inline void *
 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 {
 	uint8_t *wr = (uint8_t *)dst;
 	const uint8_t *rd = (const uint8_t *)src;
 	/* Copy size < 16 bytes: one scalar store per set bit of n */
 	if (n < 16) {
 		if (n & 0x01) {
 			*wr = *rd;
 			rd += 1;
 			wr += 1;
 		}
 		if (n & 0x02) {
 			*(uint16_t *)wr = *(const uint16_t *)rd;
 			rd += 2;
 			wr += 2;
 		}
 		if (n & 0x04) {
 			*(uint32_t *)wr = *(const uint32_t *)rd;
 			rd += 4;
 			wr += 4;
 		}
 		if (n & 0x08)
 			*(uint64_t *)wr = *(const uint64_t *)rd;
 		return dst;
 	}
 	/* Copy 16 <= size <= 32 bytes: first and last 16 bytes */
 	if (n <= 32) {
 		rte_mov16(wr, rd);
 		rte_mov16(wr + n - 16, rd + n - 16);
 		return dst;
 	}
 	/* Copy 32 < size <= 64 bytes: first and last 32 bytes */
 	if (n <= 64) {
 		rte_mov32(wr, rd);
 		rte_mov32(wr + n - 32, rd + n - 32);
 		return dst;
 	}
 	/* Copy whole 64-byte blocks */
 	while (n >= 64) {
 		rte_mov64(wr, rd);
 		wr += 64;
 		rd += 64;
 		n -= 64;
 	}
 	/* Last 64-byte move, ending exactly at the final byte to copy */
 	rte_mov64(wr + n - 64, rd + n - 64);
 	return dst;
 }
 /*
  * Pick the copy routine from the pointer alignment: if any ALIGNMENT_MASK
  * bit is set in dst or src the generic routine is used, otherwise the
  * aligned one. Returns whatever the selected routine returns.
  */
 static inline void *
 rte_memcpy_internal(void *dst, const void *src, size_t n)
 {
 	const uintptr_t addr_bits = (uintptr_t)dst | (uintptr_t)src;
 	if ((addr_bits & ALIGNMENT_MASK) != 0)
 		return rte_memcpy_generic(dst, src, n);
 	return rte_memcpy_aligned(dst, src, n);
 }
 #ifdef __cplusplus
 }
 #endif
 #endif /* _RTE_MEMCPY_INTERNAL_X86_64_H_ */
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@ -98,24 +98,6 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
 SRCS-y += rte_cycles.c
 # for run-time dispatch of memcpy
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy.c
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_sse.c
 # if the compiler supports AVX512, add avx512 file
 ifneq ($(findstring CC_SUPPORT_AVX512F,$(MACHINE_CFLAGS)),)
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx512f.c
 CFLAGS_rte_memcpy_avx512f.o += -mavx512f
 CFLAGS_rte_memcpy_avx512f.o += -DRTE_MACHINE_CPUFLAG_AVX512F
 endif
 # if the compiler supports AVX2, add avx2 file
 ifneq ($(findstring CC_SUPPORT_AVX2,$(MACHINE_CFLAGS)),)
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_memcpy_avx2.c
 CFLAGS_rte_memcpy_avx2.o += -mavx2
 CFLAGS_rte_memcpy_avx2.o += -DRTE_MACHINE_CPUFLAG_AVX2
 endif
 CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
 CFLAGS_eal.o := -D_GNU_SOURCE
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@ -196,7 +196,6 @@ DPDK_17.11 {
 	rte_lcore_has_role;
 	rte_malloc_virt2iova;
 	rte_mem_virt2iova;
 	rte_memcpy_ptr;
 	rte_vfio_enable;
 	rte_vfio_is_enabled;
 	rte_vfio_noiommu_is_enabled;
--- a/lib/librte_efd/Makefile
+++ b/lib/librte_efd/Makefile
@ -45,12 +45,6 @@ LIBABIVER := 1
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_EFD) := rte_efd.c
 # if the compiler supports AVX2, add efd x86 file
 ifneq ($(findstring CC_SUPPORT_AVX2,$(MACHINE_CFLAGS)),)
 SRCS-$(CONFIG_RTE_ARCH_X86) += rte_efd_x86.c
 CFLAGS_rte_efd_x86.o += -mavx2
 endif
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_EFD)-include := rte_efd.h
--- a/lib/librte_efd/rte_efd_x86.c
+++ b/lib/librte_efd/rte_efd_x86.c
@ -1,77 +0,0 @@
 /*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /* rte_efd_x86.c
 * This file holds all x86 specific EFD functions
 */
 #include <rte_efd.h>
 #include <rte_efd_x86.h>
 #if (RTE_EFD_VALUE_NUM_BITS == 8 || RTE_EFD_VALUE_NUM_BITS == 16 || \
 	RTE_EFD_VALUE_NUM_BITS == 24 || RTE_EFD_VALUE_NUM_BITS == 32)
 #define EFD_LOAD_SI128(val) _mm_load_si128(val)
 #else
 #define EFD_LOAD_SI128(val) _mm_lddqu_si128(val)
 #endif
 /*
  * AVX2 lookup of one value, eight result bits per loop iteration.
  *
  * For each bit position j the code computes
  *   hash   = hash_val_a + group_hash_idx[j] * hash_val_b
  *   bucket = hash >> EFD_LOOKUPTBL_SHIFT
  *   bit    = (group_lookup_table[j] >> bucket) & 1
  * and stores that bit at position j of the returned value.
  *
  * Each iteration loads 128 bits from both tables and widens eight
  * 16-bit entries to 32-bit lanes.
  */
 efd_value_t
 efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx,
 		const efd_lookuptbl_t *group_lookup_table,
 		const uint32_t hash_val_a, const uint32_t hash_val_b)
 {
 	efd_value_t value = 0;
 	uint32_t i = 0;
 	__m256i vhash_val_a = _mm256_set1_epi32(hash_val_a);
 	__m256i vhash_val_b = _mm256_set1_epi32(hash_val_b);
 	for (; i < RTE_EFD_VALUE_NUM_BITS; i += 8) {
 		__m256i vhash_idx =
 				_mm256_cvtepu16_epi32(EFD_LOAD_SI128(
 				(__m128i const *) &group_hash_idx[i]));
 		__m256i vlookup_table = _mm256_cvtepu16_epi32(
 				EFD_LOAD_SI128((__m128i const *)
 				&group_lookup_table[i]));
 		__m256i vhash = _mm256_add_epi32(vhash_val_a,
 				_mm256_mullo_epi32(vhash_idx, vhash_val_b));
 		__m256i vbucket_idx = _mm256_srli_epi32(vhash,
 				EFD_LOOKUPTBL_SHIFT);
 		__m256i vresult = _mm256_srlv_epi32(vlookup_table,
 				vbucket_idx);
 		/* Bits still to be produced; the loop condition keeps this > 0 */
 		uint32_t bits_left = RTE_EFD_VALUE_NUM_BITS - i;
 		/*
 		 * movemask yields at most 8 bits, so the mask never needs to
 		 * be wider. Clamping here avoids shifting 1 by 31 or more
 		 * positions, which is undefined for int.
 		 */
 		uint32_t lane_mask = bits_left < 8 ?
 				(1u << bits_left) - 1u : 0xFFu;
 		/* Move each lane's bit 0 to the sign bit and gather them */
 		uint32_t lane_bits = (uint32_t)_mm256_movemask_ps(
 				(__m256) _mm256_slli_epi32(vresult, 31));
 		value |= (lane_bits & lane_mask) << i;
 	}
 	return value;
 }
--- a/lib/librte_efd/rte_efd_x86.h
+++ b/lib/librte_efd/rte_efd_x86.h
@ -36,7 +36,51 @@
 */
 #include <immintrin.h>
-extern efd_value_t
+#if (RTE_EFD_VALUE_NUM_BITS == 8 || RTE_EFD_VALUE_NUM_BITS == 16 || \
 	RTE_EFD_VALUE_NUM_BITS == 24 || RTE_EFD_VALUE_NUM_BITS == 32)
 #define EFD_LOAD_SI128(val) _mm_load_si128(val)
 #else
 #define EFD_LOAD_SI128(val) _mm_lddqu_si128(val)
 #endif
 /*
  * AVX2 lookup of one value, eight result bits per loop iteration.
  *
  * When RTE_MACHINE_CPUFLAG_AVX2 is defined, for each bit position j:
  *   hash   = hash_val_a + group_hash_idx[j] * hash_val_b
  *   bucket = hash >> EFD_LOOKUPTBL_SHIFT
  *   bit    = (group_lookup_table[j] >> bucket) & 1
  * and that bit is stored at position j of the returned value.
  *
  * Without RTE_MACHINE_CPUFLAG_AVX2 the function is a stub that returns 0.
  */
 static inline efd_value_t
 efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx,
 		const efd_lookuptbl_t *group_lookup_table,
-		const uint32_t hash_val_a, const uint32_t hash_val_b);
+		const uint32_t hash_val_a, const uint32_t hash_val_b)
 {
 #ifdef RTE_MACHINE_CPUFLAG_AVX2
 	efd_value_t value = 0;
 	uint32_t i = 0;
 	__m256i vhash_val_a = _mm256_set1_epi32(hash_val_a);
 	__m256i vhash_val_b = _mm256_set1_epi32(hash_val_b);
 	for (; i < RTE_EFD_VALUE_NUM_BITS; i += 8) {
 		__m256i vhash_idx =
 				_mm256_cvtepu16_epi32(EFD_LOAD_SI128(
 				(__m128i const *) &group_hash_idx[i]));
 		__m256i vlookup_table = _mm256_cvtepu16_epi32(
 				EFD_LOAD_SI128((__m128i const *)
 				&group_lookup_table[i]));
 		__m256i vhash = _mm256_add_epi32(vhash_val_a,
 				_mm256_mullo_epi32(vhash_idx, vhash_val_b));
 		__m256i vbucket_idx = _mm256_srli_epi32(vhash,
 				EFD_LOOKUPTBL_SHIFT);
 		__m256i vresult = _mm256_srlv_epi32(vlookup_table,
 				vbucket_idx);
 		/* Bits still to be produced; the loop condition keeps this > 0 */
 		uint32_t bits_left = RTE_EFD_VALUE_NUM_BITS - i;
 		/*
 		 * movemask yields at most 8 bits, so the mask never needs to
 		 * be wider. Clamping here avoids shifting 1 by 31 or more
 		 * positions, which is undefined for int.
 		 */
 		uint32_t lane_mask = bits_left < 8 ?
 				(1u << bits_left) - 1u : 0xFFu;
 		/* Move each lane's bit 0 to the sign bit and gather them */
 		uint32_t lane_bits = (uint32_t)_mm256_movemask_ps(
 				(__m256) _mm256_slli_epi32(vresult, 31));
 		value |= (lane_bits & lane_mask) << i;
 	}
 	return value;
 #else
 	RTE_SET_USED(group_hash_idx);
 	RTE_SET_USED(group_lookup_table);
 	RTE_SET_USED(hash_val_a);
 	RTE_SET_USED(hash_val_b);
 	/* Return dummy value, only to avoid compilation breakage */
 	return 0;
 #endif
 }
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@ -134,20 +134,6 @@ endif
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 # Check if the compiler supports AVX512
 CC_SUPPORT_AVX512F := $(shell $(CC) -mavx512f -dM -E - < /dev/null 2>&1 | grep -q AVX512 && echo 1)
 ifeq ($(CC_SUPPORT_AVX512F),1)
 ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
 MACHINE_CFLAGS += -DCC_SUPPORT_AVX512F
 endif
 endif
 # Check if the compiler supports AVX2
 CC_SUPPORT_AVX2 := $(shell $(CC) -mavx2 -dM -E - < /dev/null 2>&1 | grep -q AVX2 && echo 1)
 ifeq ($(CC_SUPPORT_AVX2),1)
 MACHINE_CFLAGS += -DCC_SUPPORT_AVX2
 endif
 # To strip whitespace
 comma:= ,
 empty:=
--- a/test/test/test_memcpy_perf.c
+++ b/test/test/test_memcpy_perf.c
@ -42,7 +42,6 @@
 #include <rte_malloc.h>
 #include <rte_memcpy.h>
 #include <rte_cpuflags.h>
 #include "test.h"
@ -80,7 +79,13 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE         100
 /* Data is aligned on this many bytes (power of 2) */
-static uint8_t alignment_unit = 16;
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
 #define ALIGNMENT_UNIT          64
 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 #define ALIGNMENT_UNIT          32
 #else /* RTE_MACHINE_CPUFLAG */
 #define ALIGNMENT_UNIT          16
 #endif /* RTE_MACHINE_CPUFLAG */
 /*
 * Pointers used in performance tests. The two large buffers are for uncached
@ -90,54 +95,25 @@ static uint8_t alignment_unit = 16;
 static uint8_t *large_buf_read, *large_buf_write;
 static uint8_t *small_buf_read, *small_buf_write;
 /*
  * Initialise alignment_unit based on machine at run-time.
  *
  * Sets alignment_unit to 64 when AVX512F is usable, otherwise to 32 when
  * AVX2 is usable, otherwise to 16. A wider value is chosen only when the
  * matching CC_SUPPORT_* macro was defined at build time and
  * rte_cpu_get_flag_enabled() reports the CPU flag.
  */
 static void
 init_alignment_unit(void)
 {
 	/* The build system defines CC_SUPPORT_AVX512F (with the trailing F) */
 #ifdef CC_SUPPORT_AVX512F
 	/* Only a positive return counts; a negative value is an error code */
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) > 0) {
 		alignment_unit = 64;
 		return;
 	}
 #endif
 #ifdef CC_SUPPORT_AVX2
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) > 0) {
 		alignment_unit = 32;
 		return;
 	}
 #endif
 	alignment_unit = 16;
 }
 /* Initialise data buffers. */
 static int
 init_buffers(void)
 {
 	unsigned i;
-	init_alignment_unit();
+	large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 	large_buf_read = rte_malloc("memcpy",
 				    LARGE_BUFFER_SIZE + alignment_unit,
 				    alignment_unit);
 	if (large_buf_read == NULL)
 		goto error_large_buf_read;
-	large_buf_write = rte_malloc("memcpy",
+	large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 				     LARGE_BUFFER_SIZE + alignment_unit,
 				     alignment_unit);
 	if (large_buf_write == NULL)
 		goto error_large_buf_write;
-	small_buf_read = rte_malloc("memcpy",
+	small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 				    SMALL_BUFFER_SIZE + alignment_unit,
 				    alignment_unit);
 	if (small_buf_read == NULL)
 		goto error_small_buf_read;
-	small_buf_write = rte_malloc("memcpy",
+	small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 				     SMALL_BUFFER_SIZE + alignment_unit,
 				     alignment_unit);
 	if (small_buf_write == NULL)
 		goto error_small_buf_write;
@ -177,7 +153,7 @@ static inline size_t
 get_rand_offset(size_t uoffset)
 {
 	return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
-			~(alignment_unit - 1)) + uoffset;
+			~(ALIGNMENT_UNIT - 1)) + uoffset;
 }
 /* Fill in source and destination addresses. */
@ -345,8 +321,7 @@ perf_test(void)
 		   "(bytes)        (ticks)        (ticks)        (ticks)        (ticks)\n"
 		   "------- -------------- -------------- -------------- --------------");
-	printf("\n========================= %2dB aligned ============================",
+	printf("\n========================== %2dB aligned ============================", ALIGNMENT_UNIT);
 		alignment_unit);
 	/* Do aligned tests where size is a variable */
 	perf_test_variable_aligned();
 	printf("\n------- -------------- -------------- -------------- --------------");