acl: check max SIMD bitwidth

When choosing a vector path to take, an extra condition must be
satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
path. These checks are added in the check alg helper functions.

Signed-off-by: Ciara Power <ciara.power@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Tested-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
This commit is contained in:
Ciara Power 2020-10-19 15:48:58 +02:00 committed by David Marchand
parent 13facf47d6
commit 1e6a661302
3 changed files with 48 additions and 24 deletions

View File

@ -368,24 +368,27 @@ After rte_acl_build() over given AC context has finished successfully, it can be
There are several implementations of classify algorithm:
* **RTE_ACL_CLASSIFY_SCALAR**: generic implementation, doesn't require any specific HW support.
Requires max SIMD bitwidth to be at least 64.
* **RTE_ACL_CLASSIFY_SSE**: vector implementation, can process up to 8 flows in parallel. Requires SSE 4.1 support.
Requires max SIMD bitwidth to be at least 128.
* **RTE_ACL_CLASSIFY_AVX2**: vector implementation, can process up to 16 flows in parallel. Requires AVX2 support.
Requires max SIMD bitwidth to be at least 256.
* **RTE_ACL_CLASSIFY_NEON**: vector implementation, can process up to 8 flows
in parallel. Requires NEON support.
in parallel. Requires NEON support. Requires max SIMD bitwidth to be at least 128.
* **RTE_ACL_CLASSIFY_ALTIVEC**: vector implementation, can process up to 8
flows in parallel. Requires ALTIVEC support.
flows in parallel. Requires ALTIVEC support. Requires max SIMD bitwidth to be at least 128.
* **RTE_ACL_CLASSIFY_AVX512X16**: vector implementation, can process up to 16
flows in parallel. Uses 256-bit width SIMD registers.
Requires AVX512 support.
Requires AVX512 support. Requires max SIMD bitwidth to be at least 256.
* **RTE_ACL_CLASSIFY_AVX512X32**: vector implementation, can process up to 32
flows in parallel. Uses 512-bit width SIMD registers.
Requires AVX512 support.
Requires AVX512 support. Requires max SIMD bitwidth to be at least 512.
It is purely a runtime decision which method to choose; there is no build-time difference.
All implementations operate over the same internal RT structures and use similar principles. The main difference is that vector implementations can manually exploit IA SIMD instructions and process several input data flows in parallel.
@ -393,9 +396,8 @@ At startup ACL library determines the highest available classify method for the
.. note::
Right now ``RTE_ACL_CLASSIFY_AVX512X32`` is not selected by default
(due to possible frequency level change), but it can be selected at
runtime by apps through the use of ACL API: ``rte_acl_set_ctx_classify``.
Runtime algorithm selection obeys EAL max SIMD bitwidth parameter.
For more details about expected behaviour, please see :ref:`max_simd_bitwidth`.
Application Programming Interface (API) Usage
---------------------------------------------

View File

@ -6,6 +6,7 @@
#include <rte_string_fns.h>
#include <rte_acl.h>
#include <rte_tailq.h>
#include <rte_vect.h>
#include "acl.h"
@ -114,14 +115,14 @@ acl_check_alg_arm(enum rte_acl_classify_alg alg)
{
if (alg == RTE_ACL_CLASSIFY_NEON) {
#if defined(RTE_ARCH_ARM64)
return 0;
#elif defined(RTE_ARCH_ARM)
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
return 0;
#elif defined(RTE_ARCH_ARM)
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
return 0;
return -ENOTSUP;
#else
return -ENOTSUP;
#endif
return -ENOTSUP;
}
return -EINVAL;
@ -136,15 +137,26 @@ acl_check_alg_ppc(enum rte_acl_classify_alg alg)
{
if (alg == RTE_ACL_CLASSIFY_ALTIVEC) {
#if defined(RTE_ARCH_PPC_64)
return 0;
#else
return -ENOTSUP;
if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
return 0;
#endif
return -ENOTSUP;
}
return -EINVAL;
}
#ifdef CC_AVX512_SUPPORT
/*
 * Query the CPU flags needed before an AVX512 classify method is accepted.
 * Only built when CC_AVX512_SUPPORT is defined.
 *
 * The flags are probed in a fixed order (AVX512F, AVX512VL, AVX512CD,
 * AVX512BW) and probing stops at the first query that yields zero.
 *
 * Returns 1 when every rte_cpu_get_flag_enabled() query yields a non-zero
 * value, 0 otherwise.
 */
static int
acl_check_avx512_cpu_flags(void)
{
	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F))
		return 0;

	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL))
		return 0;

	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512CD))
		return 0;

	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW))
		return 0;

	return 1;
}
#endif
/*
* Helper function for acl_check_alg.
* Check support for x86 specific classify methods.
@ -152,13 +164,19 @@ acl_check_alg_ppc(enum rte_acl_classify_alg alg)
static int
acl_check_alg_x86(enum rte_acl_classify_alg alg)
{
if (alg == RTE_ACL_CLASSIFY_AVX512X16 ||
alg == RTE_ACL_CLASSIFY_AVX512X32) {
if (alg == RTE_ACL_CLASSIFY_AVX512X32) {
#ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512CD) &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW))
if (acl_check_avx512_cpu_flags() != 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
return 0;
#endif
return -ENOTSUP;
}
if (alg == RTE_ACL_CLASSIFY_AVX512X16) {
#ifdef CC_AVX512_SUPPORT
if (acl_check_avx512_cpu_flags() != 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
return 0;
#endif
return -ENOTSUP;
@ -166,7 +184,8 @@ acl_check_alg_x86(enum rte_acl_classify_alg alg)
if (alg == RTE_ACL_CLASSIFY_AVX2) {
#ifdef CC_AVX2_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
return 0;
#endif
return -ENOTSUP;
@ -174,7 +193,8 @@ acl_check_alg_x86(enum rte_acl_classify_alg alg)
if (alg == RTE_ACL_CLASSIFY_SSE) {
#ifdef RTE_ARCH_X86
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
return 0;
#endif
return -ENOTSUP;
@ -226,6 +246,7 @@ acl_get_best_alg(void)
#elif defined(RTE_ARCH_PPC_64)
RTE_ACL_CLASSIFY_ALTIVEC,
#elif defined(RTE_ARCH_X86)
RTE_ACL_CLASSIFY_AVX512X32,
RTE_ACL_CLASSIFY_AVX512X16,
RTE_ACL_CLASSIFY_AVX2,
RTE_ACL_CLASSIFY_SSE,

View File

@ -329,6 +329,7 @@ rte_acl_classify_alg(const struct rte_acl_ctx *ctx,
* New default classify algorithm for given ACL context.
* It is the caller's responsibility to ensure that the value refers to the
* existing algorithm, and that it could be run on the given CPU.
* The max SIMD bitwidth value in EAL must also allow for the chosen algorithm.
* @return
* - -EINVAL if the parameters are invalid.
* - -ENOTSUP requested algorithm is not supported by given platform.