lib/util: optimize base64 encode and decode using ARM SVE intrinsics

Optimize base64 encode and decode using ARM SVE intrinsics. Signed-off-by: Rui Chang <rui.chang@arm.com> Change-Id: I63ae5b9105aef690e1fad8ec1ef98c8339758e9f Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/8858 Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com> Community-CI: Mellanox Build Bot Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Ziye Yang <ziye.yang@intel.com> Reviewed-by: Paul Luse <paul.e.luse@intel.com> Reviewed-by: Aleksey Marchuk <alexeymar@mellanox.com> Reviewed-by: Jim Harris <james.r.harris@intel.com>
2021-07-18 20:36:18 +08:00 · 2021-07-18 20:36:18 +08:00 · d483d8a4f1
commit d483d8a4f1
parent 849f05c9df
3 changed files with 522 additions and 3 deletions
--- a/lib/util/base64.c
+++ b/lib/util/base64.c
@ -2,6 +2,7 @@
 *   BSD LICENSE
 *
 *   Copyright(c) Intel Corporation. All rights reserved.
+ *   Copyright(c) ARM Limited. 2021 All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
@ -36,8 +37,13 @@
 #include "spdk/base64.h"

 #ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+#include "base64_sve.c"
+#else
 #include "base64_neon.c"
 #endif
+#endif
+

 #define BASE64_ENC_BITMASK 0x3FUL
 #define BASE64_PADDING_CHAR '='
@ -102,8 +108,13 @@ base64_encode(char *dst, const char *enc_table, const void *src, size_t src_len)
 	}

 #ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+	base64_encode_sve(&dst, enc_table, &src, &src_len);
+#else
 	base64_encode_neon64(&dst, enc_table, &src, &src_len);
 #endif
+#endif
+

 	while (src_len >= 4) {
 		raw_u32 = from_be32(src);
@ -148,7 +159,7 @@ spdk_base64_urlsafe_encode(char *dst, const void *src, size_t src_len)
 	return base64_encode(dst, base64_urfsafe_enc_table, src, src_len);
 }

-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
 static int
 base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table,
 	      const uint8_t *dec_table_opt, const char *src)
@ -199,13 +210,18 @@ base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, const char
 	src_in = (const uint8_t *) src;

 #ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+	base64_decode_sve(&dst, dec_table, &src_in, &src_strlen);
+#else
 	base64_decode_neon64(&dst, dec_table_opt, &src_in, &src_strlen);
+#endif

 	if (src_strlen == 0) {
 		return 0;
 	}
 #endif

+
 	/* space of dst can be used by to_be32 */
 	while (src_strlen > 4) {
 		tmp[0] = dec_table[*src_in++];
@ -243,7 +259,7 @@ base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, const char
 int
 spdk_base64_decode(void *dst, size_t *dst_len, const char *src)
 {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
 	return base64_decode(dst, dst_len, base64_dec_table, base64_dec_table_neon64, src);
 #else
 	return base64_decode(dst, dst_len, base64_dec_table, src);
@ -253,7 +269,7 @@ spdk_base64_decode(void *dst, size_t *dst_len, const char *src)
 int
 spdk_base64_urlsafe_decode(void *dst, size_t *dst_len, const char *src)
 {
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
 	return base64_decode(dst, dst_len, base64_urlsafe_dec_table, base64_urlsafe_dec_table_neon64,
 			     src);
 #else
--- a/lib/util/base64_sve.c
+++ b/lib/util/base64_sve.c
@ -0,0 +1,500 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) ARM Limited. 2021 All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions are
+ *   met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ *   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ *   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ *   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __aarch64__
+#error Unsupported hardware
+#endif
+
+#include "spdk/stdinc.h"
+#include <arm_sve.h>
+
+static int
+table_lookup_8vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
+		  svuint8_t tbl_vec4, svuint8_t tbl_vec5, svuint8_t tbl_vec6, svuint8_t tbl_vec7,
+		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
+{
+	svuint8_t res2, res3, res4, res5, res6, res7;
+
+	/*
+	 * In base64 decode table, the first 32 elements are invalid value,
+	 * so skip tbl_vec0 and tbl_vec1
+	 */
+	indices = svsub_n_u8_z(p8_in, indices, 2 * vl);
+	res2 = svtbl_u8(tbl_vec2, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res3 = svtbl_u8(tbl_vec3, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res4 = svtbl_u8(tbl_vec4, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res5 = svtbl_u8(tbl_vec5, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res6 = svtbl_u8(tbl_vec6, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res7 = svtbl_u8(tbl_vec7, indices);
+
+	*output = svdup_n_u8(0);
+	*output = svadd_u8_z(p8_in, res2, *output);
+	*output = svadd_u8_z(p8_in, res3, *output);
+	*output = svadd_u8_z(p8_in, res4, *output);
+	*output = svadd_u8_z(p8_in, res5, *output);
+	*output = svadd_u8_z(p8_in, res6, *output);
+	*output = svadd_u8_z(p8_in, res7, *output);
+
+	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+table_lookup_4vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t tbl_vec3,
+		  svuint8_t indices, svuint8_t *output, svbool_t p8_in, uint64_t vl)
+{
+	svuint8_t res0, res1, res2, res3;
+
+	res0 = svtbl_u8(tbl_vec0, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res1 = svtbl_u8(tbl_vec1, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res2 = svtbl_u8(tbl_vec2, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res3 = svtbl_u8(tbl_vec3, indices);
+
+	*output = svdup_n_u8(0);
+
+	*output = svadd_u8_z(p8_in, res0, *output);
+	*output = svadd_u8_z(p8_in, res1, *output);
+	*output = svadd_u8_z(p8_in, res2, *output);
+	*output = svadd_u8_z(p8_in, res3, *output);
+
+	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+table_lookup_3vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t tbl_vec2, svuint8_t indices,
+		  svuint8_t *output, svbool_t p8_in, uint64_t vl)
+{
+	svuint8_t res0, res1, res2;
+
+	res0 = svtbl_u8(tbl_vec0, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res1 = svtbl_u8(tbl_vec1, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res2 = svtbl_u8(tbl_vec2, indices);
+
+	*output = svdup_n_u8(0);
+
+	*output = svadd_u8_z(p8_in, res0, *output);
+	*output = svadd_u8_z(p8_in, res1, *output);
+	*output = svadd_u8_z(p8_in, res2, *output);
+
+	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+table_lookup_2vec(svuint8_t tbl_vec0, svuint8_t tbl_vec1, svuint8_t indices, svuint8_t *output,
+		  svbool_t p8_in, uint64_t vl)
+{
+	svuint8_t res0, res1;
+
+	res0 = svtbl_u8(tbl_vec0, indices);
+	indices = svsub_n_u8_z(p8_in, indices, vl);
+	res1 = svtbl_u8(tbl_vec1, indices);
+
+	*output = svdup_n_u8(0);
+
+	*output = svadd_u8_z(p8_in, res0, *output);
+	*output = svadd_u8_z(p8_in, res1, *output);
+
+	if (svcntp_b8(p8_in, svcmpeq_n_u8(p8_in, *output, 255))) {
+		return -1;
+	}
+
+	return 0;
+}
+
+static inline void
+convert_6bits_to_8bits(svbool_t pred, uint8_t *src, svuint8_t *temp0, svuint8_t *temp1,
+		       svuint8_t *temp2, svuint8_t *temp3)
+{
+	svuint8_t str0, str1, str2;
+	svuint8x3_t ld_enc_input;
+
+	ld_enc_input = svld3_u8(pred, src);
+
+	str0 = svget3_u8(ld_enc_input, 0);
+	str1 = svget3_u8(ld_enc_input, 1);
+	str2 = svget3_u8(ld_enc_input, 2);
+
+
+	*temp0 = svlsr_n_u8_z(pred, str0, 2);
+	*temp1 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str1, 4), svlsl_n_u8_z(pred, str0,
+					     4)),
+			    svdup_u8(0x3F));
+	*temp2 = svand_u8_z(pred, svorr_u8_z(pred, svlsr_n_u8_z(pred, str2, 6), svlsl_n_u8_z(pred, str1,
+					     2)),
+			    svdup_u8(0x3F));
+	*temp3 = svand_u8_z(pred, str2, svdup_u8(0x3F));
+}
+
+static inline void
+convert_8bits_to_6bits(svbool_t pred, svuint8_t temp0, svuint8_t temp1, svuint8_t temp2,
+		       svuint8_t temp3, svuint8_t *output0, svuint8_t *output1, svuint8_t *output2)
+{
+	*output0 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp0, 2), svlsr_n_u8_z(pred, temp1, 4));
+	*output1 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp1, 4), svlsr_n_u8_z(pred, temp2, 2));
+	*output2 = svorr_u8_z(pred, svlsl_n_u8_z(pred, temp2, 6), temp3);
+}
+
+static void
+base64_encode_sve(char **dst, const char *enc_table, const void **src, size_t *src_len)
+{
+	uint64_t vl = svcntb();
+	svuint8_t temp0, temp1, temp2, temp3;
+	svuint8_t output0, output1, output2, output3;
+	svuint8_t tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3;
+	svuint8x4_t st_enc_output;
+	svbool_t p8_all = svptrue_b8();
+	svbool_t pred;
+	uint64_t i = 0;
+	uint64_t pred_count = 0;
+	uint64_t N = (*src_len / 3) * 3;
+
+	if (vl == 16) {
+
+		tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0);
+		tbl_enc1 = svld1_u8(p8_all, (uint8_t *)enc_table + 16);
+		tbl_enc2 = svld1_u8(p8_all, (uint8_t *)enc_table + 32);
+		tbl_enc3 = svld1_u8(p8_all, (uint8_t *)enc_table + 48);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 3, N / 3);
+
+			convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);
+
+			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp0, &output0, pred, vl);
+			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp1, &output1, pred, vl);
+			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp2, &output2, pred, vl);
+			table_lookup_4vec(tbl_enc0, tbl_enc1, tbl_enc2, tbl_enc3, temp3, &output3, pred, vl);
+
+			st_enc_output = svcreate4_u8(output0, output1, output2, output3);
+			svst4_u8(pred, (uint8_t *)*dst, st_enc_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 3;
+			*dst += pred_count * 4;
+			*src_len -= pred_count * 3;
+			i += pred_count * 3;
+
+		}
+	} else if (vl == 32 || vl == 48) {
+
+		tbl_enc0 = svld1_u8(p8_all, (uint8_t *)enc_table + 0);
+		pred = svwhilelt_b8(vl, (uint64_t)64);
+		tbl_enc1 = svld1_u8(pred, (uint8_t *)enc_table + vl);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 3, N / 3);
+
+			convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);
+
+			table_lookup_2vec(tbl_enc0, tbl_enc1, temp0, &output0, pred, vl);
+			table_lookup_2vec(tbl_enc0, tbl_enc1, temp1, &output1, pred, vl);
+			table_lookup_2vec(tbl_enc0, tbl_enc1, temp2, &output2, pred, vl);
+			table_lookup_2vec(tbl_enc0, tbl_enc1, temp3, &output3, pred, vl);
+
+			st_enc_output = svcreate4_u8(output0, output1, output2, output3);
+			svst4_u8(pred, (uint8_t *)*dst, st_enc_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 3;
+			*dst += pred_count * 4;
+			*src_len -= pred_count * 3;
+			i += pred_count * 3;
+
+		}
+	} else if (vl >= 64) {
+
+		pred = svwhilelt_b8((uint64_t)0, (uint64_t)64);
+		tbl_enc0 = svld1_u8(pred, (uint8_t *)enc_table);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 3, N / 3);
+
+			convert_6bits_to_8bits(pred, (uint8_t *)*src, &temp0, &temp1, &temp2, &temp3);
+
+			output0 = svtbl_u8(tbl_enc0, temp0);
+			output1 = svtbl_u8(tbl_enc0, temp1);
+			output2 = svtbl_u8(tbl_enc0, temp2);
+			output3 = svtbl_u8(tbl_enc0, temp3);
+
+			st_enc_output = svcreate4_u8(output0, output1, output2, output3);
+			svst4_u8(pred, (uint8_t *)*dst, st_enc_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 3;
+			*dst += pred_count * 4;
+			*src_len -= pred_count * 3;
+			i += pred_count * 3;
+
+		}
+	}
+}
+
+static void
+base64_decode_sve(void **dst, const uint8_t *dec_table, const uint8_t **src, size_t *src_len)
+{
+	uint64_t vl = svcntb();
+	svuint8_t str0, str1, str2, str3;
+	svuint8_t temp0, temp1, temp2, temp3;
+	svuint8_t output0, output1, output2;
+	svuint8_t tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6, tbl_dec7;
+	svuint8x3_t st_dec_output;
+	svbool_t p8_all = svptrue_b8();
+	svbool_t pred;
+	uint64_t i = 0;
+	uint64_t pred_count = 0;
+	uint64_t N = (*src_len / 4) * 4;
+	svuint8x4_t ld_dec_input;
+
+	if (vl == 16) {
+		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
+		tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + 16);
+		tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + 32);
+		tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + 48);
+		tbl_dec4 = svld1_u8(p8_all, (uint8_t *)dec_table + 64);
+		tbl_dec5 = svld1_u8(p8_all, (uint8_t *)dec_table + 80);
+		tbl_dec6 = svld1_u8(p8_all, (uint8_t *)dec_table + 96);
+		tbl_dec7 = svld1_u8(p8_all, (uint8_t *)dec_table + 112);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 4, N / 4);
+
+			ld_dec_input = svld4_u8(pred, *src);
+
+			str0 = svget4_u8(ld_dec_input, 0);
+			str1 = svget4_u8(ld_dec_input, 1);
+			str2 = svget4_u8(ld_dec_input, 2);
+			str3 = svget4_u8(ld_dec_input, 3);
+
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
+
+			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
+					      tbl_dec7, str0, &temp0, pred, vl)) { return; }
+			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
+					      tbl_dec7, str1, &temp1, pred, vl)) { return; }
+			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
+					      tbl_dec7, str2, &temp2, pred, vl)) { return; }
+			if (table_lookup_8vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, tbl_dec4, tbl_dec5, tbl_dec6,
+					      tbl_dec7, str3, &temp3, pred, vl)) { return; }
+
+			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
+
+			st_dec_output = svcreate3_u8(output0, output1, output2);
+			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 4;
+			*dst += pred_count * 3;
+			*src_len -= pred_count * 4;
+			i += pred_count * 4;
+
+		}
+	} else if (vl == 32) {
+		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
+		tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl);
+		tbl_dec2 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 2);
+		tbl_dec3 = svld1_u8(p8_all, (uint8_t *)dec_table + vl * 3);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 4, N / 4);
+
+			ld_dec_input = svld4_u8(pred, *src);
+
+			str0 = svget4_u8(ld_dec_input, 0);
+			str1 = svget4_u8(ld_dec_input, 1);
+			str2 = svget4_u8(ld_dec_input, 2);
+			str3 = svget4_u8(ld_dec_input, 3);
+
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
+
+			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str0, &temp0, pred, vl)) { return; }
+			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str1, &temp1, pred, vl)) { return; }
+			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str2, &temp2, pred, vl)) { return; }
+			if (table_lookup_4vec(tbl_dec0, tbl_dec1, tbl_dec2, tbl_dec3, str3, &temp3, pred, vl)) { return; }
+
+			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
+
+			st_dec_output = svcreate3_u8(output0, output1, output2);
+			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 4;
+			*dst += pred_count * 3;
+			*src_len -= pred_count * 4;
+			i += pred_count * 4;
+
+		}
+
+	} else if (vl == 48) {
+		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
+		tbl_dec1 = svld1_u8(p8_all, (uint8_t *)dec_table + vl);
+		pred = svwhilelt_b8(vl * 2, (uint64_t)128);
+		tbl_dec2 = svld1_u8(pred, (uint8_t *)dec_table + 2 * vl);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 4, N / 4);
+
+			ld_dec_input = svld4_u8(pred, *src);
+
+			str0 = svget4_u8(ld_dec_input, 0);
+			str1 = svget4_u8(ld_dec_input, 1);
+			str2 = svget4_u8(ld_dec_input, 2);
+			str3 = svget4_u8(ld_dec_input, 3);
+
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
+
+			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str0, &temp0, pred, vl)) { return; }
+			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str1, &temp1, pred, vl)) { return; }
+			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str2, &temp2, pred, vl)) { return; }
+			if (table_lookup_3vec(tbl_dec0, tbl_dec1, tbl_dec2, str3, &temp3, pred, vl)) { return; }
+
+			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
+
+			st_dec_output = svcreate3_u8(output0, output1, output2);
+			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 4;
+			*dst += pred_count * 3;
+			*src_len -= pred_count * 4;
+			i += pred_count * 4;
+
+		}
+	} else if (vl == 64 || vl == 80 || vl == 96 || vl == 112) {
+		tbl_dec0 = svld1_u8(p8_all, (uint8_t *)dec_table + 0);
+		pred = svwhilelt_b8(vl, (uint64_t)128);
+		tbl_dec1 = svld1_u8(pred, (uint8_t *)dec_table + vl);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 4, N / 4);
+
+			ld_dec_input = svld4_u8(pred, *src);
+
+			str0 = svget4_u8(ld_dec_input, 0);
+			str1 = svget4_u8(ld_dec_input, 1);
+			str2 = svget4_u8(ld_dec_input, 2);
+			str3 = svget4_u8(ld_dec_input, 3);
+
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
+
+			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str0, &temp0, pred, vl)) { return; }
+			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str1, &temp1, pred, vl)) { return; }
+			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str2, &temp2, pred, vl)) { return; }
+			if (table_lookup_2vec(tbl_dec0, tbl_dec1, str3, &temp3, pred, vl)) { return; }
+
+			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
+
+			st_dec_output = svcreate3_u8(output0, output1, output2);
+			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 4;
+			*dst += pred_count * 3;
+			*src_len -= pred_count * 4;
+			i += pred_count * 4;
+
+		}
+	} else if (vl >= 128) {
+		pred = svwhilelt_b8((uint64_t)0, (uint64_t)128);
+		tbl_dec0 = svld1_u8(pred, (uint8_t *)dec_table + 0);
+
+		while (i < N) {
+			pred = svwhilelt_b8(i / 4, N / 4);
+
+			ld_dec_input = svld4_u8(pred, *src);
+
+			str0 = svget4_u8(ld_dec_input, 0);
+			str1 = svget4_u8(ld_dec_input, 1);
+			str2 = svget4_u8(ld_dec_input, 2);
+			str3 = svget4_u8(ld_dec_input, 3);
+
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str0, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str1, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str2, 128))) { return; }
+			if (svcntp_b8(pred, svcmpge_n_u8(pred, str3, 128))) { return; }
+
+			temp0 = svtbl_u8(tbl_dec0, str0);
+			temp1 = svtbl_u8(tbl_dec0, str1);
+			temp2 = svtbl_u8(tbl_dec0, str2);
+			temp3 = svtbl_u8(tbl_dec0, str3);
+
+			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp0, 255))) { return; }
+			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp1, 255))) { return; }
+			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp2, 255))) { return; }
+			if (svcntp_b8(pred, svcmpeq_n_u8(pred, temp3, 255))) { return; }
+
+			convert_8bits_to_6bits(pred, temp0, temp1, temp2, temp3, &output0, &output1, &output2);
+
+			st_dec_output = svcreate3_u8(output0, output1, output2);
+			svst3_u8(pred, (uint8_t *)*dst, st_dec_output);
+
+			pred_count = svcntp_b8(pred, pred);
+			*src += pred_count * 4;
+			*dst += pred_count * 3;
+			*src_len -= pred_count * 4;
+			i += pred_count * 4;
+
+		}
+	}
+}
--- a/test/common/skipped_build_files.txt
+++ b/test/common/skipped_build_files.txt
@ -13,6 +13,9 @@ test/unit/lib/nvmf/fc_ls.c/fc_ls_ut
 # Not configured for Neon testing
 lib/util/base64_neon

+# Not configured for ARM SVE testing
+lib/util/base64_sve
+
 # Not configured for mlx5 dv testing
 lib/rdma/rdma_mlx5_dv