acl: new library

The ACL library is used to perform an N-tuple search over a set of rules with
multiple categories and find the best match for each category.
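
For reference, a minimal usage sketch of the API introduced by this patch
(illustrative only, not part of the patch itself). It assumes the
declarations from rte_acl.h, which the new Makefile installs but which is
not reproduced in this excerpt; SOCKET_ID_ANY comes from the EAL, and
RTE_ACL_RULE_SZ()/RTE_ACL_IPV4VLAN_NUM_FIELDS from rte_acl.h. Field values
and the build configuration are placeholders.

#include <string.h>
#include <rte_acl.h>

static int
acl_usage_sketch(const uint8_t *pkt_fields)
{
	struct rte_acl_param param = {
		.name = "sketch_acl",
		.socket_id = SOCKET_ID_ANY,	/* assumed EAL constant */
		/* rule size helper assumed from rte_acl.h (not shown here) */
		.rule_size = RTE_ACL_RULE_SZ(RTE_ACL_IPV4VLAN_NUM_FIELDS),
		.max_rule_num = 1024,
	};
	struct rte_acl_ipv4vlan_rule rule = {
		.data = {.category_mask = 1, .priority = 1, .userdata = 1},
		.proto = 6, .proto_mask = 0xff,		/* match TCP only */
		.dst_port_low = 80, .dst_port_high = 80,
	};
	struct rte_acl_config cfg;
	const uint8_t *data[1] = {pkt_fields};
	uint32_t results[1];
	struct rte_acl_ctx *ctx;

	ctx = rte_acl_create(&param);
	if (ctx == NULL)
		return -1;

	if (rte_acl_ipv4vlan_add_rules(ctx, &rule, 1) != 0)
		goto fail;

	/* cfg fields are defined in rte_acl.h and must describe the input
	 * layout and number of categories; filling them in is omitted here. */
	memset(&cfg, 0, sizeof(cfg));
	if (rte_acl_build(ctx, &cfg) != 0)
		goto fail;

	/* classify one packet against one result category */
	if (rte_acl_classify(ctx, data, results, 1, 1) != 0)
		goto fail;

	rte_acl_free(ctx);
	return 0;
fail:
	rte_acl_free(ctx);
	return -1;
}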

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Tested-by: Waterman Cao <waterman.cao@intel.com>
Acked-by: Pablo de Lara Guarch <pablo.de.lara.guarch@intel.com>
[Thomas: some code-style changes]
Konstantin Ananyev, 2014-06-13 12:26:50 +01:00, committed by Thomas Monjalon
parent 36c248ebc6
commit dc276b5780
17 changed files with 5234 additions and 2 deletions


@@ -245,6 +245,13 @@ CONFIG_RTE_LIBRTE_HASH_DEBUG=n
 CONFIG_RTE_LIBRTE_LPM=y
 CONFIG_RTE_LIBRTE_LPM_DEBUG=n
 
+#
+# Compile librte_acl
+#
+CONFIG_RTE_LIBRTE_ACL=y
+CONFIG_RTE_LIBRTE_ACL_DEBUG=n
+CONFIG_RTE_LIBRTE_ACL_STANDALONE=n
+
 #
 # Compile librte_power
 #


@@ -279,6 +279,13 @@ CONFIG_RTE_LIBRTE_HASH_DEBUG=n
 CONFIG_RTE_LIBRTE_LPM=y
 CONFIG_RTE_LIBRTE_LPM_DEBUG=n
 
+#
+# Compile librte_acl
+#
+CONFIG_RTE_LIBRTE_ACL=y
+CONFIG_RTE_LIBRTE_ACL_DEBUG=n
+CONFIG_RTE_LIBRTE_ACL_STANDALONE=n
+
 #
 # Compile librte_power
 #


@@ -78,7 +78,8 @@ There are many libraries, so their headers may be grouped by topics:
 [SCTP] (@ref rte_sctp.h),
 [TCP] (@ref rte_tcp.h),
 [UDP] (@ref rte_udp.h),
-[LPM route] (@ref rte_lpm.h)
+[LPM route] (@ref rte_lpm.h),
+[ACL] (@ref rte_acl.h)
 - **QoS**:
 [metering] (@ref rte_meter.h),


@@ -31,6 +31,7 @@
 PROJECT_NAME = DPDK
 INPUT = doc/doxy-api-index.md \
 lib/librte_eal/common/include \
+lib/librte_acl \
 lib/librte_distributor \
 lib/librte_ether \
 lib/librte_hash \


@@ -49,11 +49,11 @@ DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += librte_pmd_vmxnet3
DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += librte_pmd_xenvirt
DIRS-$(CONFIG_RTE_LIBRTE_HASH) += librte_hash
DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm
DIRS-$(CONFIG_RTE_LIBRTE_ACL) += librte_acl
DIRS-$(CONFIG_RTE_LIBRTE_NET) += librte_net
DIRS-$(CONFIG_RTE_LIBRTE_POWER) += librte_power
DIRS-$(CONFIG_RTE_LIBRTE_METER) += librte_meter
DIRS-$(CONFIG_RTE_LIBRTE_SCHED) += librte_sched
DIRS-$(CONFIG_RTE_LIBRTE_ACL) += librte_acl
DIRS-$(CONFIG_RTE_LIBRTE_KVARGS) += librte_kvargs
DIRS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += librte_distributor

lib/librte_acl/Makefile (new file, 60 lines)

@@ -0,0 +1,60 @@
# BSD LICENSE
#
# Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include $(RTE_SDK)/mk/rte.vars.mk
# library name
LIB = librte_acl.a
CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_ACL) += tb_mem.c
SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run.c
# install this header file
SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include := rte_acl_osdep.h
SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl.h
ifeq ($(CONFIG_RTE_LIBRTE_ACL_STANDALONE),y)
# standalone build
SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl_osdep_alone.h
else
# this lib needs eal
DEPDIRS-$(CONFIG_RTE_LIBRTE_ACL) += lib/librte_eal lib/librte_malloc
endif
include $(RTE_SDK)/mk/rte.lib.mk

lib/librte_acl/acl.h (new file, 182 lines)

@@ -0,0 +1,182 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _ACL_H_
#define _ACL_H_
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#define RTE_ACL_QUAD_MAX 5
#define RTE_ACL_QUAD_SIZE 4
#define RTE_ACL_QUAD_SINGLE UINT64_C(0x7f7f7f7f00000000)
#define RTE_ACL_SINGLE_TRIE_SIZE 2000
#define RTE_ACL_DFA_MAX UINT8_MAX
#define RTE_ACL_DFA_SIZE (UINT8_MAX + 1)
typedef int bits_t;
#define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))
struct rte_acl_bitset {
bits_t bits[RTE_ACL_BIT_SET_SIZE];
};
#define RTE_ACL_NODE_DFA (0 << RTE_ACL_TYPE_SHIFT)
#define RTE_ACL_NODE_SINGLE (1U << RTE_ACL_TYPE_SHIFT)
#define RTE_ACL_NODE_QEXACT (2U << RTE_ACL_TYPE_SHIFT)
#define RTE_ACL_NODE_QRANGE (3U << RTE_ACL_TYPE_SHIFT)
#define RTE_ACL_NODE_MATCH (4U << RTE_ACL_TYPE_SHIFT)
#define RTE_ACL_NODE_TYPE (7U << RTE_ACL_TYPE_SHIFT)
#define RTE_ACL_NODE_UNDEFINED UINT32_MAX
/*
* Structure of a node is a set of ptrs and each ptr has a bit map
* of values associated with this transition.
*/
struct rte_acl_ptr_set {
struct rte_acl_bitset values; /* input values associated with ptr */
struct rte_acl_node *ptr; /* transition to next node */
};
struct rte_acl_classifier_results {
int results[RTE_ACL_MAX_CATEGORIES];
};
struct rte_acl_match_results {
uint32_t results[RTE_ACL_MAX_CATEGORIES];
int32_t priority[RTE_ACL_MAX_CATEGORIES];
};
struct rte_acl_node {
uint64_t node_index; /* index for this node */
uint32_t level; /* level 0-n in the trie */
uint32_t ref_count; /* ref count for this node */
struct rte_acl_bitset values;
/* set of all values that map to another node
* (union of bits in each transition).
*/
uint32_t num_ptrs; /* number of ptr_set in use */
uint32_t max_ptrs; /* number of allocated ptr_set */
uint32_t min_add; /* number of ptr_set per allocation */
struct rte_acl_ptr_set *ptrs; /* transitions array for this node */
int32_t match_flag;
int32_t match_index; /* index to match data */
uint32_t node_type;
int32_t fanout;
/* number of ranges (transitions w/ consecutive bits) */
int32_t id;
struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
char transitions[RTE_ACL_QUAD_SIZE];
/* boundaries for ranged node */
struct rte_acl_node *next;
/* free list link or pointer to duplicate node during merge */
struct rte_acl_node *prev;
/* points to node from which this node was duplicated */
uint32_t subtree_id;
uint32_t subtree_ref_count;
};
enum {
RTE_ACL_SUBTREE_NODE = 0x80000000
};
/*
* Types of tries used to generate runtime structure(s)
*/
enum {
RTE_ACL_FULL_TRIE = 0,
RTE_ACL_NOSRC_TRIE = 1,
RTE_ACL_NODST_TRIE = 2,
RTE_ACL_NOPORTS_TRIE = 4,
RTE_ACL_NOVLAN_TRIE = 8,
RTE_ACL_UNUSED_TRIE = 0x80000000
};
/** MAX number of tries per one ACL context.*/
#define RTE_ACL_MAX_TRIES 8
/** Max number of characters in ACL context name. */
#define RTE_ACL_NAMESIZE 32
struct rte_acl_trie {
uint32_t type;
uint32_t count;
int32_t smallest; /* smallest rule in this trie */
uint32_t root_index;
const uint32_t *data_index;
uint32_t num_data_indexes;
};
struct rte_acl_bld_trie {
struct rte_acl_node *trie;
};
struct rte_acl_ctx {
TAILQ_ENTRY(rte_acl_ctx) next; /**< Next in list. */
char name[RTE_ACL_NAMESIZE];
/** Name of the ACL context. */
int32_t socket_id;
/** Socket ID to allocate memory from. */
void *rules;
uint32_t max_rules;
uint32_t rule_sz;
uint32_t num_rules;
uint32_t num_categories;
uint32_t num_tries;
uint32_t match_index;
uint64_t no_match;
uint64_t idle;
uint64_t *trans_table;
uint32_t *data_indexes;
struct rte_acl_trie trie[RTE_ACL_MAX_TRIES];
void *mem;
size_t mem_sz;
struct rte_acl_config config; /* copy of build config. */
};
int rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
uint32_t num_categories, uint32_t data_index_sz, int match_num);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* _ACL_H_ */

lib/librte_acl/acl_bld.c (new file, 2008 lines)

File diff suppressed because it is too large.

lib/librte_acl/acl_gen.c (new file, 475 lines)

@@ -0,0 +1,475 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <rte_acl.h>
#include "acl_vect.h"
#include "acl.h"
#define QRANGE_MIN ((uint8_t)INT8_MIN)
#define RTE_ACL_VERIFY(exp) do { \
if (!(exp)) \
rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \
} while (0)
struct acl_node_counters {
int match;
int match_used;
int single;
int quad;
int quad_vectors;
int dfa;
int smallest_match;
};
struct rte_acl_indices {
int dfa_index;
int quad_index;
int single_index;
int match_index;
};
static void
acl_gen_log_stats(const struct rte_acl_ctx *ctx,
const struct acl_node_counters *counts)
{
RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
"runtime memory footprint on socket %d:\n"
"single nodes/bytes used: %d/%zu\n"
"quad nodes/bytes used: %d/%zu\n"
"DFA nodes/bytes used: %d/%zu\n"
"match nodes/bytes used: %d/%zu\n"
"total: %zu bytes\n",
ctx->name, ctx->socket_id,
counts->single, counts->single * sizeof(uint64_t),
counts->quad, counts->quad_vectors * sizeof(uint64_t),
counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t),
counts->match,
counts->match * sizeof(struct rte_acl_match_results),
ctx->mem_sz);
}
/*
* Counts the number of groups of sequential bits that are
* either 0 or 1, as specified by the zero_one parameter. This is used to
* calculate the number of ranges in a node to see if it fits in a quad range
* node.
*/
static int
acl_count_sequential_groups(struct rte_acl_bitset *bits, int zero_one)
{
int n, ranges, last_bit;
ranges = 0;
last_bit = zero_one ^ 1;
for (n = QRANGE_MIN; n < UINT8_MAX + 1; n++) {
if (bits->bits[n / (sizeof(bits_t) * 8)] &
(1 << (n % (sizeof(bits_t) * 8)))) {
if (zero_one == 1 && last_bit != 1)
ranges++;
last_bit = 1;
} else {
if (zero_one == 0 && last_bit != 0)
ranges++;
last_bit = 0;
}
}
for (n = 0; n < QRANGE_MIN; n++) {
if (bits->bits[n / (sizeof(bits_t) * 8)] &
(1 << (n % (sizeof(bits_t) * 8)))) {
if (zero_one == 1 && last_bit != 1)
ranges++;
last_bit = 1;
} else {
if (zero_one == 0 && last_bit != 0)
ranges++;
last_bit = 0;
}
}
return ranges;
}
/*
* Count number of ranges spanned by the node's pointers
*/
static int
acl_count_fanout(struct rte_acl_node *node)
{
uint32_t n;
int ranges;
if (node->fanout != 0)
return node->fanout;
ranges = acl_count_sequential_groups(&node->values, 0);
for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL)
ranges += acl_count_sequential_groups(
&node->ptrs[n].values, 1);
}
node->fanout = ranges;
return node->fanout;
}
/*
* Determine the type of nodes and count each type
*/
static int
acl_count_trie_types(struct acl_node_counters *counts,
struct rte_acl_node *node, int match, int force_dfa)
{
uint32_t n;
int num_ptrs;
/* skip if this node has been counted */
if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
return match;
if (node->match_flag != 0 || node->num_ptrs == 0) {
counts->match++;
if (node->match_flag == -1)
node->match_flag = match++;
node->node_type = RTE_ACL_NODE_MATCH;
if (counts->smallest_match > node->match_flag)
counts->smallest_match = node->match_flag;
return match;
}
num_ptrs = acl_count_fanout(node);
/* Force type to dfa */
if (force_dfa)
num_ptrs = RTE_ACL_DFA_SIZE;
/* determine node type based on number of ranges */
if (num_ptrs == 1) {
counts->single++;
node->node_type = RTE_ACL_NODE_SINGLE;
} else if (num_ptrs <= RTE_ACL_QUAD_MAX) {
counts->quad++;
counts->quad_vectors += node->fanout;
node->node_type = RTE_ACL_NODE_QRANGE;
} else {
counts->dfa++;
node->node_type = RTE_ACL_NODE_DFA;
}
/*
* recursively count the types of all children
*/
for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL)
match = acl_count_trie_types(counts, node->ptrs[n].ptr,
match, 0);
}
return match;
}
static void
acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
int resolved)
{
uint32_t n, x;
int m, ranges, last_bit;
struct rte_acl_node *child;
struct rte_acl_bitset *bits;
uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
ranges = 0;
last_bit = 0;
for (n = 0; n < RTE_DIM(dfa); n++)
dfa[n] = no_match;
for (x = 0; x < node->num_ptrs; x++) {
child = node->ptrs[x].ptr;
if (child == NULL)
continue;
bits = &node->ptrs[x].values;
for (n = 0; n < RTE_DIM(dfa); n++) {
if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
dfa[n] = resolved ? child->node_index : x;
ranges += (last_bit == 0);
last_bit = 1;
} else {
last_bit = 0;
}
}
}
/*
* Rather than going from 0 to 256, the range count and
* the layout are from 80-ff then 0-7f due to signed compare
* for SSE (cmpgt).
*/
if (node->node_type == RTE_ACL_NODE_QRANGE) {
m = 0;
node_a = node_array;
index = dfa[QRANGE_MIN];
*node_a++ = index;
for (x = QRANGE_MIN + 1; x < UINT8_MAX + 1; x++) {
if (dfa[x] != index) {
index = dfa[x];
*node_a++ = index;
node->transitions[m++] = (uint8_t)(x - 1);
}
}
for (x = 0; x < INT8_MAX + 1; x++) {
if (dfa[x] != index) {
index = dfa[x];
*node_a++ = index;
node->transitions[m++] = (uint8_t)(x - 1);
}
}
/* fill unused locations with max value - nothing is greater */
for (; m < RTE_ACL_QUAD_SIZE; m++)
node->transitions[m] = INT8_MAX;
RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
for (n = 0; n < RTE_DIM(dfa); n++)
node_array[n] = dfa[n];
}
}
/*
* Routine that allocates space for this node and recursively calls
* to allocate space for each child. Once all the children are allocated,
* then resolve all transitions for this node.
*/
static void
acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
uint64_t no_match, struct rte_acl_indices *index, int num_categories)
{
uint32_t n, *qtrp;
uint64_t *array_ptr;
struct rte_acl_match_results *match;
if (node->node_index != RTE_ACL_NODE_UNDEFINED)
return;
array_ptr = NULL;
switch (node->node_type) {
case RTE_ACL_NODE_DFA:
node->node_index = index->dfa_index | node->node_type;
array_ptr = &node_array[index->dfa_index];
index->dfa_index += RTE_ACL_DFA_SIZE;
for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
array_ptr[n] = no_match;
break;
case RTE_ACL_NODE_SINGLE:
node->node_index = RTE_ACL_QUAD_SINGLE | index->single_index |
node->node_type;
array_ptr = &node_array[index->single_index];
index->single_index += 1;
array_ptr[0] = no_match;
break;
case RTE_ACL_NODE_QRANGE:
array_ptr = &node_array[index->quad_index];
acl_add_ptrs(node, array_ptr, no_match, 0);
qtrp = (uint32_t *)node->transitions;
node->node_index = qtrp[0];
node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
node->node_index |= index->quad_index | node->node_type;
index->quad_index += node->fanout;
break;
case RTE_ACL_NODE_MATCH:
match = ((struct rte_acl_match_results *)
(node_array + index->match_index));
memcpy(match + node->match_flag, node->mrt, sizeof(*node->mrt));
node->node_index = node->match_flag | node->node_type;
break;
case RTE_ACL_NODE_UNDEFINED:
RTE_ACL_VERIFY(node->node_type !=
(uint32_t)RTE_ACL_NODE_UNDEFINED);
break;
}
/* recursively allocate space for all children */
for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL)
acl_gen_node(node->ptrs[n].ptr,
node_array,
no_match,
index,
num_categories);
}
/* All children are resolved, resolve this node's pointers */
switch (node->node_type) {
case RTE_ACL_NODE_DFA:
acl_add_ptrs(node, array_ptr, no_match, 1);
break;
case RTE_ACL_NODE_SINGLE:
for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL)
array_ptr[0] = node->ptrs[n].ptr->node_index;
}
break;
case RTE_ACL_NODE_QRANGE:
acl_add_ptrs(node, array_ptr, no_match, 1);
break;
case RTE_ACL_NODE_MATCH:
break;
case RTE_ACL_NODE_UNDEFINED:
RTE_ACL_VERIFY(node->node_type !=
(uint32_t)RTE_ACL_NODE_UNDEFINED);
break;
}
}
static int
acl_calc_counts_indicies(struct acl_node_counters *counts,
struct rte_acl_indices *indices, struct rte_acl_trie *trie,
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
int match_num)
{
uint32_t n;
memset(indices, 0, sizeof(*indices));
memset(counts, 0, sizeof(*counts));
/* Get stats on nodes */
for (n = 0; n < num_tries; n++) {
counts->smallest_match = INT32_MAX;
match_num = acl_count_trie_types(counts, node_bld_trie[n].trie,
match_num, 1);
trie[n].smallest = counts->smallest_match;
}
indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
indices->quad_index = indices->dfa_index +
counts->dfa * RTE_ACL_DFA_SIZE;
indices->single_index = indices->quad_index + counts->quad_vectors;
indices->match_index = indices->single_index + counts->single + 1;
indices->match_index = RTE_ALIGN(indices->match_index,
(XMM_SIZE / sizeof(uint64_t)));
return match_num;
}
/*
* Generate the runtime structure using build structure
*/
int
rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
uint32_t num_categories, uint32_t data_index_sz, int match_num)
{
void *mem;
size_t total_size;
uint64_t *node_array, no_match;
uint32_t n, match_index;
struct rte_acl_match_results *match;
struct acl_node_counters counts;
struct rte_acl_indices indices;
/* Fill counts and indicies arrays from the nodes. */
match_num = acl_calc_counts_indicies(&counts, &indices, trie,
node_bld_trie, num_tries, match_num);
/* Allocate runtime memory (align to cache boundary) */
total_size = RTE_ALIGN(data_index_sz, CACHE_LINE_SIZE) +
indices.match_index * sizeof(uint64_t) +
(match_num + 2) * sizeof(struct rte_acl_match_results) +
XMM_SIZE;
mem = rte_zmalloc_socket(ctx->name, total_size, CACHE_LINE_SIZE,
ctx->socket_id);
if (mem == NULL) {
RTE_LOG(ERR, ACL,
"allocation of %zu bytes on socket %d for %s failed\n",
total_size, ctx->socket_id, ctx->name);
return -ENOMEM;
}
/* Fill the runtime structure */
match_index = indices.match_index;
node_array = (uint64_t *)((uintptr_t)mem +
RTE_ALIGN(data_index_sz, CACHE_LINE_SIZE));
/*
* Setup the NOMATCH node (a SINGLE at the
* highest index, that points to itself)
*/
node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;
no_match = RTE_ACL_NODE_MATCH;
for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
node_array[n] = no_match;
match = ((struct rte_acl_match_results *)(node_array + match_index));
memset(match, 0, sizeof(*match));
for (n = 0; n < num_tries; n++) {
acl_gen_node(node_bld_trie[n].trie, node_array, no_match,
&indices, num_categories);
if (node_bld_trie[n].trie->node_index == no_match)
trie[n].root_index = 0;
else
trie[n].root_index = node_bld_trie[n].trie->node_index;
}
ctx->mem = mem;
ctx->mem_sz = total_size;
ctx->data_indexes = mem;
ctx->num_tries = num_tries;
ctx->num_categories = num_categories;
ctx->match_index = match_index;
ctx->no_match = no_match;
ctx->idle = node_array[RTE_ACL_DFA_SIZE];
ctx->trans_table = node_array;
memcpy(ctx->trie, trie, sizeof(ctx->trie));
acl_gen_log_stats(ctx, &counts);
return 0;
}
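
A worked example (editorial note, not part of the patch) of how
acl_calc_counts_indicies() above lays out node_array, assuming a build that
produced 2 DFA nodes, 3 quad vectors and 4 single nodes; RTE_ALIGN and
XMM_SIZE are the EAL helpers already used in this file.

#include <rte_acl.h>
#include "acl.h"

static void
acl_layout_example(void)
{
	const int dfa = 2, quad_vectors = 3, single = 4;
	int dfa_index, quad_index, single_index, match_index;

	/* slots 0..RTE_ACL_DFA_SIZE-1 hold the shared no-match DFA,
	 * slot RTE_ACL_DFA_SIZE the idle SINGLE node (see rte_acl_gen). */
	dfa_index = RTE_ACL_DFA_SIZE + 1;                       /* 257 */
	quad_index = dfa_index + dfa * RTE_ACL_DFA_SIZE;        /* 769 */
	single_index = quad_index + quad_vectors;               /* 772 */
	match_index = single_index + single + 1;                /* 777 */
	/* match results start on an XMM-aligned 64-bit slot */
	match_index = RTE_ALIGN(match_index, XMM_SIZE / sizeof(uint64_t));
	(void)match_index;                                      /* 778 */
}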

lib/librte_acl/acl_run.c (new file, 944 lines)

@@ -0,0 +1,944 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <rte_acl.h>
#include "acl_vect.h"
#include "acl.h"
#define MAX_SEARCHES_SSE8 8
#define MAX_SEARCHES_SSE4 4
#define MAX_SEARCHES_SSE2 2
#define MAX_SEARCHES_SCALAR 2
#define GET_NEXT_4BYTES(prm, idx) \
(*((const int32_t *)((prm)[(idx)].data + *(prm)[idx].data_index++)))
#define RTE_ACL_NODE_INDEX ((uint32_t)~RTE_ACL_NODE_TYPE)
#define SCALAR_QRANGE_MULT 0x01010101
#define SCALAR_QRANGE_MASK 0x7f7f7f7f
#define SCALAR_QRANGE_MIN 0x80808080
enum {
SHUFFLE32_SLOT1 = 0xe5,
SHUFFLE32_SLOT2 = 0xe6,
SHUFFLE32_SLOT3 = 0xe7,
SHUFFLE32_SWAP64 = 0x4e,
};
/*
* Structure to manage N parallel trie traversals.
* The runtime trie traversal routines can process 8, 4, or 2 tries
* in parallel. Each packet may require multiple trie traversals (up to 4).
* This structure is used to fill the slots (0 to n-1) for parallel processing
* with the trie traversals needed for each packet.
*/
struct acl_flow_data {
uint32_t num_packets;
/* number of packets processed */
uint32_t started;
/* number of trie traversals in progress */
uint32_t trie;
/* current trie index (0 to N-1) */
uint32_t cmplt_size;
uint32_t total_packets;
/* maximum number of packets to process */
uint32_t categories;
/* number of result categories per packet. */
const uint64_t *trans;
const uint8_t **data;
uint32_t *results;
struct completion *last_cmplt;
struct completion *cmplt_array;
};
/*
* Structure to maintain running results for
* a single packet (up to 4 tries).
*/
struct completion {
uint32_t *results; /* running results. */
int32_t priority[RTE_ACL_MAX_CATEGORIES]; /* running priorities. */
uint32_t count; /* num of remaining tries */
/* true for allocated struct */
} __attribute__((aligned(XMM_SIZE)));
/*
* One parms structure for each slot in the search engine.
*/
struct parms {
const uint8_t *data;
/* input data for this packet */
const uint32_t *data_index;
/* data indirection for this trie */
struct completion *cmplt;
/* completion data for this packet */
};
/*
* Define a global idle node for unused engine slots
*/
static const uint32_t idle[UINT8_MAX + 1];
static const rte_xmm_t mm_type_quad_range = {
.u32 = {
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
},
};
static const rte_xmm_t mm_type_quad_range64 = {
.u32 = {
RTE_ACL_NODE_QRANGE,
RTE_ACL_NODE_QRANGE,
0,
0,
},
};
static const rte_xmm_t mm_shuffle_input = {
.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
};
static const rte_xmm_t mm_shuffle_input64 = {
.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},
};
static const rte_xmm_t mm_ones_16 = {
.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
};
static const rte_xmm_t mm_bytes = {
.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
};
static const rte_xmm_t mm_bytes64 = {
.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
};
static const rte_xmm_t mm_match_mask = {
.u32 = {
RTE_ACL_NODE_MATCH,
RTE_ACL_NODE_MATCH,
RTE_ACL_NODE_MATCH,
RTE_ACL_NODE_MATCH,
},
};
static const rte_xmm_t mm_match_mask64 = {
.u32 = {
RTE_ACL_NODE_MATCH,
0,
RTE_ACL_NODE_MATCH,
0,
},
};
static const rte_xmm_t mm_index_mask = {
.u32 = {
RTE_ACL_NODE_INDEX,
RTE_ACL_NODE_INDEX,
RTE_ACL_NODE_INDEX,
RTE_ACL_NODE_INDEX,
},
};
static const rte_xmm_t mm_index_mask64 = {
.u32 = {
RTE_ACL_NODE_INDEX,
RTE_ACL_NODE_INDEX,
0,
0,
},
};
/*
* Allocate a completion structure to manage the tries for a packet.
*/
static inline struct completion *
alloc_completion(struct completion *p, uint32_t size, uint32_t tries,
uint32_t *results)
{
uint32_t n;
for (n = 0; n < size; n++) {
if (p[n].count == 0) {
/* mark as allocated and set number of tries. */
p[n].count = tries;
p[n].results = results;
return &(p[n]);
}
}
/* should never get here */
return NULL;
}
/*
* Resolve priority for a single result trie.
*/
static inline void
resolve_single_priority(uint64_t transition, int n,
const struct rte_acl_ctx *ctx, struct parms *parms,
const struct rte_acl_match_results *p)
{
if (parms[n].cmplt->count == ctx->num_tries ||
parms[n].cmplt->priority[0] <=
p[transition].priority[0]) {
parms[n].cmplt->priority[0] = p[transition].priority[0];
parms[n].cmplt->results[0] = p[transition].results[0];
}
parms[n].cmplt->count--;
}
/*
* Resolve priority for multiple results. This consists of comparing
* the priority of the current traversal with the running set of
* results for the packet. For each result, keep a running array of
* the result (rule number) and its priority for each category.
*/
static inline void
resolve_priority(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
struct parms *parms, const struct rte_acl_match_results *p,
uint32_t categories)
{
uint32_t x;
xmm_t results, priority, results1, priority1, selector;
xmm_t *saved_results, *saved_priority;
for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
saved_results = (xmm_t *)(&parms[n].cmplt->results[x]);
saved_priority =
(xmm_t *)(&parms[n].cmplt->priority[x]);
/* get results and priorities for completed trie */
results = MM_LOADU((const xmm_t *)&p[transition].results[x]);
priority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);
/* if this is not the first completed trie */
if (parms[n].cmplt->count != ctx->num_tries) {
/* get running best results and their priorities */
results1 = MM_LOADU(saved_results);
priority1 = MM_LOADU(saved_priority);
/* select results that are highest priority */
selector = MM_CMPGT32(priority1, priority);
results = MM_BLENDV8(results, results1, selector);
priority = MM_BLENDV8(priority, priority1, selector);
}
/* save running best results and their priorities */
MM_STOREU(saved_results, results);
MM_STOREU(saved_priority, priority);
}
/* Count down completed tries for this search request */
parms[n].cmplt->count--;
}
/*
* Routine to fill a slot in the parallel trie traversal array (parms) from
* the list of packets (flows).
*/
static inline uint64_t
acl_start_next_trie(struct acl_flow_data *flows, struct parms *parms, int n,
const struct rte_acl_ctx *ctx)
{
uint64_t transition;
/* if there are any more packets to process */
if (flows->num_packets < flows->total_packets) {
parms[n].data = flows->data[flows->num_packets];
parms[n].data_index = ctx->trie[flows->trie].data_index;
/* if this is the first trie for this packet */
if (flows->trie == 0) {
flows->last_cmplt = alloc_completion(flows->cmplt_array,
flows->cmplt_size, ctx->num_tries,
flows->results +
flows->num_packets * flows->categories);
}
/* set completion parameters and starting index for this slot */
parms[n].cmplt = flows->last_cmplt;
transition =
flows->trans[parms[n].data[*parms[n].data_index++] +
ctx->trie[flows->trie].root_index];
/*
* if this is the last trie for this packet,
* then setup next packet.
*/
flows->trie++;
if (flows->trie >= ctx->num_tries) {
flows->trie = 0;
flows->num_packets++;
}
/* keep track of number of active trie traversals */
flows->started++;
/* no more tries to process, set slot to an idle position */
} else {
transition = ctx->idle;
parms[n].data = (const uint8_t *)idle;
parms[n].data_index = idle;
}
return transition;
}
/*
* Detect matches. If a match node transition is found, then this trie
* traversal is complete; the slot is then refilled with the next trie
* to be processed.
*/
static inline uint64_t
acl_match_check_transition(uint64_t transition, int slot,
const struct rte_acl_ctx *ctx, struct parms *parms,
struct acl_flow_data *flows)
{
const struct rte_acl_match_results *p;
p = (const struct rte_acl_match_results *)
(flows->trans + ctx->match_index);
if (transition & RTE_ACL_NODE_MATCH) {
/* Remove flags from index and decrement active traversals */
transition &= RTE_ACL_NODE_INDEX;
flows->started--;
/* Resolve priorities for this trie and running results */
if (flows->categories == 1)
resolve_single_priority(transition, slot, ctx,
parms, p);
else
resolve_priority(transition, slot, ctx, parms, p,
flows->categories);
/* Fill the slot with the next trie or idle trie */
transition = acl_start_next_trie(flows, parms, slot, ctx);
} else if (transition == ctx->idle) {
/* reset indirection table for idle slots */
parms[slot].data_index = idle;
}
return transition;
}
/*
* Extract transitions from an XMM register and check for any matches
*/
static void
acl_process_matches(xmm_t *indicies, int slot, const struct rte_acl_ctx *ctx,
struct parms *parms, struct acl_flow_data *flows)
{
uint64_t transition1, transition2;
/* extract transition from low 64 bits. */
transition1 = MM_CVT64(*indicies);
/* extract transition from high 64 bits. */
*indicies = MM_SHUFFLE32(*indicies, SHUFFLE32_SWAP64);
transition2 = MM_CVT64(*indicies);
transition1 = acl_match_check_transition(transition1, slot, ctx,
parms, flows);
transition2 = acl_match_check_transition(transition2, slot + 1, ctx,
parms, flows);
/* update indicies with new transitions. */
*indicies = MM_SET64(transition2, transition1);
}
/*
* Check for a match in 2 transitions (contained in SSE register)
*/
static inline void
acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
struct acl_flow_data *flows, xmm_t *indicies, xmm_t match_mask)
{
xmm_t temp;
temp = MM_AND(match_mask, *indicies);
while (!MM_TESTZ(temp, temp)) {
acl_process_matches(indicies, slot, ctx, parms, flows);
temp = MM_AND(match_mask, *indicies);
}
}
/*
* Check for any match in 4 transitions (contained in 2 SSE registers)
*/
static inline void
acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
struct acl_flow_data *flows, xmm_t *indicies1, xmm_t *indicies2,
xmm_t match_mask)
{
xmm_t temp;
/* put low 32 bits of each transition into one register */
temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
0x88);
/* test for match node */
temp = MM_AND(match_mask, temp);
while (!MM_TESTZ(temp, temp)) {
acl_process_matches(indicies1, slot, ctx, parms, flows);
acl_process_matches(indicies2, slot + 2, ctx, parms, flows);
temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
(__m128)*indicies2,
0x88);
temp = MM_AND(match_mask, temp);
}
}
/*
* Calculate the address of the next transition for
* all types of nodes. Note that only DFA nodes and range
* nodes actually transition to another node. Match
* nodes don't move.
*/
static inline xmm_t
acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
xmm_t *indicies1, xmm_t *indicies2)
{
xmm_t addr, node_types, temp;
/*
* Note that no transition is done for a match
* node and therefore a stream freezes when
* it reaches a match.
*/
/* Shuffle low 32 into temp and high 32 into indicies2 */
temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
0x88);
*indicies2 = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
(__m128)*indicies2, 0xdd);
/* Calc node type and node addr */
node_types = MM_ANDNOT(index_mask, temp);
addr = MM_AND(index_mask, temp);
/*
* Calc addr for DFAs - addr = dfa_index + input_byte
*/
/* mask for DFA type (0) nodes */
temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));
/* add input byte to DFA position */
temp = MM_AND(temp, bytes);
temp = MM_AND(temp, next_input);
addr = MM_ADD32(addr, temp);
/*
* Calc addr for Range nodes -> range_index + range(input)
*/
node_types = MM_CMPEQ32(node_types, type_quad_range);
/*
* Calculate number of range boundaries that are less than the
* input value. Range boundaries for each node are in signed 8 bit,
* ordered from -128 to 127 in the indicies2 register.
* This is effectively a popcnt of bytes that are greater than the
* input byte.
*/
/* shuffle input byte to all 4 positions of 32 bit value */
temp = MM_SHUFFLE8(next_input, shuffle_input);
/* check ranges */
temp = MM_CMPGT8(temp, *indicies2);
/* convert -1 to 1 (bytes greater than input byte) */
temp = MM_SIGN8(temp, temp);
/* horizontal add pairs of bytes into words */
temp = MM_MADD8(temp, temp);
/* horizontal add pairs of words into dwords */
temp = MM_MADD16(temp, ones_16);
/* mask to range type nodes */
temp = MM_AND(temp, node_types);
/* add index into node position */
return MM_ADD32(addr, temp);
}
/*
* Process 4 transitions (in 2 SIMD registers) in parallel
*/
static inline xmm_t
transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
const uint64_t *trans, xmm_t *indicies1, xmm_t *indicies2)
{
xmm_t addr;
uint64_t trans0, trans2;
/* Calculate the address (array index) for all 4 transitions. */
addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
bytes, type_quad_range, indicies1, indicies2);
/* Gather 64 bit transitions and pack back into 2 registers. */
trans0 = trans[MM_CVT32(addr)];
/* get slot 2 */
/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);
trans2 = trans[MM_CVT32(addr)];
/* get slot 1 */
/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
*indicies1 = MM_SET64(trans[MM_CVT32(addr)], trans0);
/* get slot 3 */
/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);
*indicies2 = MM_SET64(trans[MM_CVT32(addr)], trans2);
return MM_SRL32(next_input, 8);
}
static inline void
acl_set_flow(struct acl_flow_data *flows, struct completion *cmplt,
uint32_t cmplt_size, const uint8_t **data, uint32_t *results,
uint32_t data_num, uint32_t categories, const uint64_t *trans)
{
flows->num_packets = 0;
flows->started = 0;
flows->trie = 0;
flows->last_cmplt = NULL;
flows->cmplt_array = cmplt;
flows->total_packets = data_num;
flows->categories = categories;
flows->cmplt_size = cmplt_size;
flows->data = data;
flows->results = results;
flows->trans = trans;
}
/*
* Execute trie traversal with 8 traversals in parallel
*/
static inline void
search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t total_packets, uint32_t categories)
{
int n;
struct acl_flow_data flows;
uint64_t index_array[MAX_SEARCHES_SSE8];
struct completion cmplt[MAX_SEARCHES_SSE8];
struct parms parms[MAX_SEARCHES_SSE8];
xmm_t input0, input1;
xmm_t indicies1, indicies2, indicies3, indicies4;
acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
total_packets, categories, ctx->trans_table);
for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
cmplt[n].count = 0;
index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
}
/*
* indicies1 contains index_array[0,1]
* indicies2 contains index_array[2,3]
* indicies3 contains index_array[4,5]
* indicies4 contains index_array[6,7]
*/
indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
indicies2 = MM_LOADU((xmm_t *) &index_array[2]);
indicies3 = MM_LOADU((xmm_t *) &index_array[4]);
indicies4 = MM_LOADU((xmm_t *) &index_array[6]);
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
&indicies1, &indicies2, mm_match_mask.m);
acl_match_check_x4(4, ctx, parms, &flows,
&indicies3, &indicies4, mm_match_mask.m);
while (flows.started > 0) {
/* Gather 4 bytes of input data for each stream. */
input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
0);
input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
0);
input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);
input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);
input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);
input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);
input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);
/* Process the 4 bytes of input on each stream. */
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies3, &indicies4);
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies3, &indicies4);
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies3, &indicies4);
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies3, &indicies4);
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
&indicies1, &indicies2, mm_match_mask.m);
acl_match_check_x4(4, ctx, parms, &flows,
&indicies3, &indicies4, mm_match_mask.m);
}
}
/*
* Execute trie traversal with 4 traversals in parallel
*/
static inline void
search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, int total_packets, uint32_t categories)
{
int n;
struct acl_flow_data flows;
uint64_t index_array[MAX_SEARCHES_SSE4];
struct completion cmplt[MAX_SEARCHES_SSE4];
struct parms parms[MAX_SEARCHES_SSE4];
xmm_t input, indicies1, indicies2;
acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
total_packets, categories, ctx->trans_table);
for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
cmplt[n].count = 0;
index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
}
indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
indicies2 = MM_LOADU((xmm_t *) &index_array[2]);
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
&indicies1, &indicies2, mm_match_mask.m);
while (flows.started > 0) {
/* Gather 4 bytes of input data for each stream. */
input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
/* Process the 4 bytes of input on each stream. */
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indicies1, &indicies2);
/* Check for any matches. */
acl_match_check_x4(0, ctx, parms, &flows,
&indicies1, &indicies2, mm_match_mask.m);
}
}
static inline xmm_t
transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
const uint64_t *trans, xmm_t *indicies1)
{
uint64_t t;
xmm_t addr, indicies2;
indicies2 = MM_XOR(ones_16, ones_16);
addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
bytes, type_quad_range, indicies1, &indicies2);
/* Gather 64 bit transitions and pack 2 per register. */
t = trans[MM_CVT32(addr)];
/* get slot 1 */
addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
*indicies1 = MM_SET64(trans[MM_CVT32(addr)], t);
return MM_SRL32(next_input, 8);
}
/*
* Execute trie traversal with 2 traversals in parallel.
*/
static inline void
search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t total_packets, uint32_t categories)
{
int n;
struct acl_flow_data flows;
uint64_t index_array[MAX_SEARCHES_SSE2];
struct completion cmplt[MAX_SEARCHES_SSE2];
struct parms parms[MAX_SEARCHES_SSE2];
xmm_t input, indicies;
acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
total_packets, categories, ctx->trans_table);
for (n = 0; n < MAX_SEARCHES_SSE2; n++) {
cmplt[n].count = 0;
index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
}
indicies = MM_LOADU((xmm_t *) &index_array[0]);
/* Check for any matches. */
acl_match_check_x2(0, ctx, parms, &flows, &indicies, mm_match_mask64.m);
while (flows.started > 0) {
/* Gather 4 bytes of input data for each stream. */
input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
/* Process the 4 bytes of input on each stream. */
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indicies);
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indicies);
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indicies);
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indicies);
/* Check for any matches. */
acl_match_check_x2(0, ctx, parms, &flows, &indicies,
mm_match_mask64.m);
}
}
/*
* When processing a transition, rather than using an if/else
* construct, the offset is calculated for both DFA and QRANGE and
* then conditionally added to the address based on node type.
* This is done to avoid branch mis-predictions. Since the
* offset is a rather simple calculation, it is more efficient
* to do the calculation and a conditional move rather than
* a conditional branch to determine which calculation to do.
*/
static inline uint32_t
scan_forward(uint32_t input, uint32_t max)
{
return (input == 0) ? max : rte_bsf32(input);
}
static inline uint64_t
scalar_transition(const uint64_t *trans_table, uint64_t transition,
uint8_t input)
{
uint32_t addr, index, ranges, x, a, b, c;
/* break transition into component parts */
ranges = transition >> (sizeof(index) * CHAR_BIT);
/* calc address for a QRANGE node */
c = input * SCALAR_QRANGE_MULT;
a = ranges | SCALAR_QRANGE_MIN;
index = transition & ~RTE_ACL_NODE_INDEX;
a -= (c & SCALAR_QRANGE_MASK);
b = c & SCALAR_QRANGE_MIN;
addr = transition ^ index;
a &= SCALAR_QRANGE_MIN;
a ^= (ranges ^ b) & (a ^ b);
x = scan_forward(a, 32) >> 3;
addr += (index == RTE_ACL_NODE_DFA) ? input : x;
/* pickup next transition */
transition = *(trans_table + addr);
return transition;
}
int
rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories)
{
int n;
uint64_t transition0, transition1;
uint32_t input0, input1;
struct acl_flow_data flows;
uint64_t index_array[MAX_SEARCHES_SCALAR];
struct completion cmplt[MAX_SEARCHES_SCALAR];
struct parms parms[MAX_SEARCHES_SCALAR];
if (categories != 1 &&
((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
return -EINVAL;
acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, num,
categories, ctx->trans_table);
for (n = 0; n < MAX_SEARCHES_SCALAR; n++) {
cmplt[n].count = 0;
index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
}
transition0 = index_array[0];
transition1 = index_array[1];
while (flows.started > 0) {
input0 = GET_NEXT_4BYTES(parms, 0);
input1 = GET_NEXT_4BYTES(parms, 1);
for (n = 0; n < 4; n++) {
if (likely((transition0 & RTE_ACL_NODE_MATCH) == 0))
transition0 = scalar_transition(flows.trans,
transition0, (uint8_t)input0);
input0 >>= CHAR_BIT;
if (likely((transition1 & RTE_ACL_NODE_MATCH) == 0))
transition1 = scalar_transition(flows.trans,
transition1, (uint8_t)input1);
input1 >>= CHAR_BIT;
}
if ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
transition0 = acl_match_check_transition(transition0,
0, ctx, parms, &flows);
transition1 = acl_match_check_transition(transition1,
1, ctx, parms, &flows);
}
}
return 0;
}
int
rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories)
{
if (categories != 1 &&
((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
return -EINVAL;
if (likely(num >= MAX_SEARCHES_SSE8))
search_sse_8(ctx, data, results, num, categories);
else if (num >= MAX_SEARCHES_SSE4)
search_sse_4(ctx, data, results, num, categories);
else
search_sse_2(ctx, data, results, num, categories);
return 0;
}
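
As a reading aid (editorial note, not part of the patch), the 64-bit
transition words produced by rte_acl_gen() and consumed by the routines
above can be unpacked as sketched below; RTE_ACL_NODE_TYPE and
RTE_ACL_QUAD_SINGLE come from acl.h, RTE_ACL_NODE_INDEX from the top of
this file.

/* editorial sketch: unpack one transition word from ctx->trans_table */
static void
acl_transition_sketch(uint64_t transition)
{
	/* low 32 bits: table index (for a MATCH node, an index into the
	 * match-results array), with the node type kept in the
	 * RTE_ACL_NODE_TYPE bits (DFA/SINGLE/QRANGE/MATCH). */
	uint32_t type = (uint32_t)transition & RTE_ACL_NODE_TYPE;
	uint32_t addr = (uint32_t)transition & RTE_ACL_NODE_INDEX;
	/* high 32 bits: the four signed 8-bit QRANGE boundaries packed by
	 * acl_gen_node(); a SINGLE node carries 0x7f7f7f7f here
	 * (RTE_ACL_QUAD_SINGLE), DFA and MATCH nodes leave it zero. */
	uint32_t quad_bounds = (uint32_t)(transition >> 32);

	(void)type; (void)addr; (void)quad_bounds;
}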

lib/librte_acl/acl_vect.h (new file, 132 lines)

@@ -0,0 +1,132 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_ACL_VECT_H_
#define _RTE_ACL_VECT_H_
/**
* @file
*
* RTE ACL SSE/AVX related header.
*/
#ifdef __cplusplus
extern "C" {
#endif
#define MM_ADD16(a, b) _mm_add_epi16(a, b)
#define MM_ADD32(a, b) _mm_add_epi32(a, b)
#define MM_ALIGNR8(a, b, c) _mm_alignr_epi8(a, b, c)
#define MM_AND(a, b) _mm_and_si128(a, b)
#define MM_ANDNOT(a, b) _mm_andnot_si128(a, b)
#define MM_BLENDV8(a, b, c) _mm_blendv_epi8(a, b, c)
#define MM_CMPEQ16(a, b) _mm_cmpeq_epi16(a, b)
#define MM_CMPEQ32(a, b) _mm_cmpeq_epi32(a, b)
#define MM_CMPEQ8(a, b) _mm_cmpeq_epi8(a, b)
#define MM_CMPGT32(a, b) _mm_cmpgt_epi32(a, b)
#define MM_CMPGT8(a, b) _mm_cmpgt_epi8(a, b)
#define MM_CVT(a) _mm_cvtsi32_si128(a)
#define MM_CVT32(a) _mm_cvtsi128_si32(a)
#define MM_CVTU32(a) _mm_cvtsi32_si128(a)
#define MM_INSERT16(a, c, b) _mm_insert_epi16(a, c, b)
#define MM_INSERT32(a, c, b) _mm_insert_epi32(a, c, b)
#define MM_LOAD(a) _mm_load_si128(a)
#define MM_LOADH_PI(a, b) _mm_loadh_pi(a, b)
#define MM_LOADU(a) _mm_loadu_si128(a)
#define MM_MADD16(a, b) _mm_madd_epi16(a, b)
#define MM_MADD8(a, b) _mm_maddubs_epi16(a, b)
#define MM_MOVEMASK8(a) _mm_movemask_epi8(a)
#define MM_OR(a, b) _mm_or_si128(a, b)
#define MM_SET1_16(a) _mm_set1_epi16(a)
#define MM_SET1_32(a) _mm_set1_epi32(a)
#define MM_SET1_64(a) _mm_set1_epi64(a)
#define MM_SET1_8(a) _mm_set1_epi8(a)
#define MM_SET32(a, b, c, d) _mm_set_epi32(a, b, c, d)
#define MM_SHUFFLE32(a, b) _mm_shuffle_epi32(a, b)
#define MM_SHUFFLE8(a, b) _mm_shuffle_epi8(a, b)
#define MM_SHUFFLEPS(a, b, c) _mm_shuffle_ps(a, b, c)
#define MM_SIGN8(a, b) _mm_sign_epi8(a, b)
#define MM_SLL64(a, b) _mm_sll_epi64(a, b)
#define MM_SRL128(a, b) _mm_srli_si128(a, b)
#define MM_SRL16(a, b) _mm_srli_epi16(a, b)
#define MM_SRL32(a, b) _mm_srli_epi32(a, b)
#define MM_STORE(a, b) _mm_store_si128(a, b)
#define MM_STOREU(a, b) _mm_storeu_si128(a, b)
#define MM_TESTZ(a, b) _mm_testz_si128(a, b)
#define MM_XOR(a, b) _mm_xor_si128(a, b)
#define MM_SET16(a, b, c, d, e, f, g, h) \
_mm_set_epi16(a, b, c, d, e, f, g, h)
#define MM_SET8(c0, c1, c2, c3, c4, c5, c6, c7, \
c8, c9, cA, cB, cC, cD, cE, cF) \
_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7, \
c8, c9, cA, cB, cC, cD, cE, cF)
#ifdef RTE_ARCH_X86_64
#define MM_CVT64(a) _mm_cvtsi128_si64(a)
#else
#define MM_CVT64(a) ({ \
rte_xmm_t m; \
m.m = (a); \
(m.u64[0]); \
})
#endif /*RTE_ARCH_X86_64 */
/*
* Prior to version 12.1 icc doesn't support _mm_set_epi64x.
*/
#if (defined(__ICC) && __ICC < 1210)
#define MM_SET64(a, b) ({ \
rte_xmm_t m; \
m.u64[0] = b; \
m.u64[1] = a; \
(m.m); \
})
#else
#define MM_SET64(a, b) _mm_set_epi64x(a, b)
#endif /* (defined(__ICC) && __ICC < 1210) */
#ifdef __cplusplus
}
#endif
#endif /* _RTE_ACL_VECT_H_ */

lib/librte_acl/rte_acl.c (new file, 415 lines)

@@ -0,0 +1,415 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <rte_acl.h>
#include "acl.h"
#define BIT_SIZEOF(x) (sizeof(x) * CHAR_BIT)
TAILQ_HEAD(rte_acl_list, rte_acl_ctx);
struct rte_acl_ctx *
rte_acl_find_existing(const char *name)
{
struct rte_acl_ctx *ctx;
struct rte_acl_list *acl_list;
/* check that we have an initialised tail queue */
acl_list = RTE_TAILQ_LOOKUP_BY_IDX(RTE_TAILQ_ACL, rte_acl_list);
if (acl_list == NULL) {
rte_errno = E_RTE_NO_TAILQ;
return NULL;
}
rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_FOREACH(ctx, acl_list, next) {
if (strncmp(name, ctx->name, sizeof(ctx->name)) == 0)
break;
}
rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
if (ctx == NULL)
rte_errno = ENOENT;
return ctx;
}
void
rte_acl_free(struct rte_acl_ctx *ctx)
{
if (ctx == NULL)
return;
RTE_EAL_TAILQ_REMOVE(RTE_TAILQ_ACL, rte_acl_list, ctx);
rte_free(ctx->mem);
rte_free(ctx);
}
struct rte_acl_ctx *
rte_acl_create(const struct rte_acl_param *param)
{
size_t sz;
struct rte_acl_ctx *ctx;
struct rte_acl_list *acl_list;
char name[sizeof(ctx->name)];
/* check that we have an initialised tail queue */
acl_list = RTE_TAILQ_LOOKUP_BY_IDX(RTE_TAILQ_ACL, rte_acl_list);
if (acl_list == NULL) {
rte_errno = E_RTE_NO_TAILQ;
return NULL;
}
/* check that input parameters are valid. */
if (param == NULL || param->name == NULL) {
rte_errno = EINVAL;
return NULL;
}
rte_snprintf(name, sizeof(name), "ACL_%s", param->name);
/* calculate amount of memory required for pattern set. */
sz = sizeof(*ctx) + param->max_rule_num * param->rule_size;
/* get EAL TAILQ lock. */
rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
/* if we already have one with that name */
TAILQ_FOREACH(ctx, acl_list, next) {
if (strncmp(param->name, ctx->name, sizeof(ctx->name)) == 0)
break;
}
/* if ACL with such name doesn't exist, then create a new one. */
if (ctx == NULL && (ctx = rte_zmalloc_socket(name, sz, CACHE_LINE_SIZE,
param->socket_id)) != NULL) {
/* init new allocated context. */
ctx->rules = ctx + 1;
ctx->max_rules = param->max_rule_num;
ctx->rule_sz = param->rule_size;
ctx->socket_id = param->socket_id;
rte_snprintf(ctx->name, sizeof(ctx->name), "%s", param->name);
TAILQ_INSERT_TAIL(acl_list, ctx, next);
} else if (ctx == NULL) {
RTE_LOG(ERR, ACL,
"allocation of %zu bytes on socket %d for %s failed\n",
sz, param->socket_id, name);
}
rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
return ctx;
}
static int
acl_add_rules(struct rte_acl_ctx *ctx, const void *rules, uint32_t num)
{
uint8_t *pos;
if (num + ctx->num_rules > ctx->max_rules)
return -ENOMEM;
pos = ctx->rules;
pos += ctx->rule_sz * ctx->num_rules;
memcpy(pos, rules, num * ctx->rule_sz);
ctx->num_rules += num;
return 0;
}
static int
acl_check_rule(const struct rte_acl_rule_data *rd)
{
if ((rd->category_mask & LEN2MASK(RTE_ACL_MAX_CATEGORIES)) == 0 ||
rd->priority > RTE_ACL_MAX_PRIORITY ||
rd->priority < RTE_ACL_MIN_PRIORITY ||
rd->userdata == RTE_ACL_INVALID_USERDATA)
return -EINVAL;
return 0;
}
int
rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules,
uint32_t num)
{
const struct rte_acl_rule *rv;
uint32_t i;
int32_t rc;
if (ctx == NULL || rules == NULL || 0 == ctx->rule_sz)
return -EINVAL;
for (i = 0; i != num; i++) {
rv = (const struct rte_acl_rule *)
((uintptr_t)rules + i * ctx->rule_sz);
rc = acl_check_rule(&rv->data);
if (rc != 0) {
RTE_LOG(ERR, ACL, "%s(%s): rule #%u is invalid\n",
__func__, ctx->name, i + 1);
return rc;
}
}
return acl_add_rules(ctx, rules, num);
}
/*
* Reset all rules.
* Note that RT structures are not affected.
*/
void
rte_acl_reset_rules(struct rte_acl_ctx *ctx)
{
if (ctx != NULL)
ctx->num_rules = 0;
}
/*
 * Reset all rules and destroy RT structures.
*/
void
rte_acl_reset(struct rte_acl_ctx *ctx)
{
if (ctx != NULL) {
rte_acl_reset_rules(ctx);
rte_acl_build(ctx, &ctx->config);
}
}
/*
 * Dump the ACL context to stdout.
*/
void
rte_acl_dump(const struct rte_acl_ctx *ctx)
{
if (!ctx)
return;
printf("acl context <%s>@%p\n", ctx->name, ctx);
printf(" max_rules=%"PRIu32"\n", ctx->max_rules);
printf(" rule_size=%"PRIu32"\n", ctx->rule_sz);
printf(" num_rules=%"PRIu32"\n", ctx->num_rules);
printf(" num_categories=%"PRIu32"\n", ctx->num_categories);
printf(" num_tries=%"PRIu32"\n", ctx->num_tries);
}
/*
 * Dump all ACL contexts to stdout.
*/
void
rte_acl_list_dump(void)
{
struct rte_acl_ctx *ctx;
struct rte_acl_list *acl_list;
/* check that we have an initialised tail queue */
acl_list = RTE_TAILQ_LOOKUP_BY_IDX(RTE_TAILQ_ACL, rte_acl_list);
if (acl_list == NULL) {
rte_errno = E_RTE_NO_TAILQ;
return;
}
rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
TAILQ_FOREACH(ctx, acl_list, next) {
rte_acl_dump(ctx);
}
rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
}
/*
* Support for legacy ipv4vlan rules.
*/
RTE_ACL_RULE_DEF(acl_ipv4vlan_rule, RTE_ACL_IPV4VLAN_NUM_FIELDS);
static int
acl_ipv4vlan_check_rule(const struct rte_acl_ipv4vlan_rule *rule)
{
if (rule->src_port_low > rule->src_port_high ||
rule->dst_port_low > rule->dst_port_high ||
rule->src_mask_len > BIT_SIZEOF(rule->src_addr) ||
rule->dst_mask_len > BIT_SIZEOF(rule->dst_addr))
return -EINVAL;
return acl_check_rule(&rule->data);
}
static void
acl_ipv4vlan_convert_rule(const struct rte_acl_ipv4vlan_rule *ri,
struct acl_ipv4vlan_rule *ro)
{
ro->data = ri->data;
ro->field[RTE_ACL_IPV4VLAN_PROTO_FIELD].value.u8 = ri->proto;
ro->field[RTE_ACL_IPV4VLAN_VLAN1_FIELD].value.u16 = ri->vlan;
ro->field[RTE_ACL_IPV4VLAN_VLAN2_FIELD].value.u16 = ri->domain;
ro->field[RTE_ACL_IPV4VLAN_SRC_FIELD].value.u32 = ri->src_addr;
ro->field[RTE_ACL_IPV4VLAN_DST_FIELD].value.u32 = ri->dst_addr;
ro->field[RTE_ACL_IPV4VLAN_SRCP_FIELD].value.u16 = ri->src_port_low;
ro->field[RTE_ACL_IPV4VLAN_DSTP_FIELD].value.u16 = ri->dst_port_low;
ro->field[RTE_ACL_IPV4VLAN_PROTO_FIELD].mask_range.u8 = ri->proto_mask;
ro->field[RTE_ACL_IPV4VLAN_VLAN1_FIELD].mask_range.u16 = ri->vlan_mask;
ro->field[RTE_ACL_IPV4VLAN_VLAN2_FIELD].mask_range.u16 =
ri->domain_mask;
ro->field[RTE_ACL_IPV4VLAN_SRC_FIELD].mask_range.u32 =
ri->src_mask_len;
ro->field[RTE_ACL_IPV4VLAN_DST_FIELD].mask_range.u32 = ri->dst_mask_len;
ro->field[RTE_ACL_IPV4VLAN_SRCP_FIELD].mask_range.u16 =
ri->src_port_high;
ro->field[RTE_ACL_IPV4VLAN_DSTP_FIELD].mask_range.u16 =
ri->dst_port_high;
}
int
rte_acl_ipv4vlan_add_rules(struct rte_acl_ctx *ctx,
const struct rte_acl_ipv4vlan_rule *rules,
uint32_t num)
{
int32_t rc;
uint32_t i;
struct acl_ipv4vlan_rule rv;
if (ctx == NULL || rules == NULL || ctx->rule_sz != sizeof(rv))
return -EINVAL;
/* check input rules. */
for (i = 0; i != num; i++) {
rc = acl_ipv4vlan_check_rule(rules + i);
if (rc != 0) {
RTE_LOG(ERR, ACL, "%s(%s): rule #%u is invalid\n",
__func__, ctx->name, i + 1);
return rc;
}
}
if (num + ctx->num_rules > ctx->max_rules)
return -ENOMEM;
/* perform conversion to the internal format and add to the context. */
for (i = 0, rc = 0; i != num && rc == 0; i++) {
acl_ipv4vlan_convert_rule(rules + i, &rv);
rc = acl_add_rules(ctx, &rv, 1);
}
return rc;
}
static void
acl_ipv4vlan_config(struct rte_acl_config *cfg,
const uint32_t layout[RTE_ACL_IPV4VLAN_NUM],
uint32_t num_categories)
{
static const struct rte_acl_field_def
ipv4_defs[RTE_ACL_IPV4VLAN_NUM_FIELDS] = {
{
.type = RTE_ACL_FIELD_TYPE_BITMASK,
.size = sizeof(uint8_t),
.field_index = RTE_ACL_IPV4VLAN_PROTO_FIELD,
.input_index = RTE_ACL_IPV4VLAN_PROTO,
},
{
.type = RTE_ACL_FIELD_TYPE_BITMASK,
.size = sizeof(uint16_t),
.field_index = RTE_ACL_IPV4VLAN_VLAN1_FIELD,
.input_index = RTE_ACL_IPV4VLAN_VLAN,
},
{
.type = RTE_ACL_FIELD_TYPE_BITMASK,
.size = sizeof(uint16_t),
.field_index = RTE_ACL_IPV4VLAN_VLAN2_FIELD,
.input_index = RTE_ACL_IPV4VLAN_VLAN,
},
{
.type = RTE_ACL_FIELD_TYPE_MASK,
.size = sizeof(uint32_t),
.field_index = RTE_ACL_IPV4VLAN_SRC_FIELD,
.input_index = RTE_ACL_IPV4VLAN_SRC,
},
{
.type = RTE_ACL_FIELD_TYPE_MASK,
.size = sizeof(uint32_t),
.field_index = RTE_ACL_IPV4VLAN_DST_FIELD,
.input_index = RTE_ACL_IPV4VLAN_DST,
},
{
.type = RTE_ACL_FIELD_TYPE_RANGE,
.size = sizeof(uint16_t),
.field_index = RTE_ACL_IPV4VLAN_SRCP_FIELD,
.input_index = RTE_ACL_IPV4VLAN_PORTS,
},
{
.type = RTE_ACL_FIELD_TYPE_RANGE,
.size = sizeof(uint16_t),
.field_index = RTE_ACL_IPV4VLAN_DSTP_FIELD,
.input_index = RTE_ACL_IPV4VLAN_PORTS,
},
};
memcpy(&cfg->defs, ipv4_defs, sizeof(ipv4_defs));
cfg->num_fields = RTE_DIM(ipv4_defs);
cfg->defs[RTE_ACL_IPV4VLAN_PROTO_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_PROTO];
cfg->defs[RTE_ACL_IPV4VLAN_VLAN1_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_VLAN];
cfg->defs[RTE_ACL_IPV4VLAN_VLAN2_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_VLAN] +
cfg->defs[RTE_ACL_IPV4VLAN_VLAN1_FIELD].size;
cfg->defs[RTE_ACL_IPV4VLAN_SRC_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_SRC];
cfg->defs[RTE_ACL_IPV4VLAN_DST_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_DST];
cfg->defs[RTE_ACL_IPV4VLAN_SRCP_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_PORTS];
cfg->defs[RTE_ACL_IPV4VLAN_DSTP_FIELD].offset =
layout[RTE_ACL_IPV4VLAN_PORTS] +
cfg->defs[RTE_ACL_IPV4VLAN_SRCP_FIELD].size;
cfg->num_categories = num_categories;
}
int
rte_acl_ipv4vlan_build(struct rte_acl_ctx *ctx,
const uint32_t layout[RTE_ACL_IPV4VLAN_NUM],
uint32_t num_categories)
{
struct rte_acl_config cfg;
if (ctx == NULL || layout == NULL)
return -EINVAL;
acl_ipv4vlan_config(&cfg, layout, num_categories);
return rte_acl_build(ctx, &cfg);
}

453
lib/librte_acl/rte_acl.h Normal file
View File

@ -0,0 +1,453 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_ACL_H_
#define _RTE_ACL_H_
/**
* @file
*
* RTE Classifier.
*/
#include <rte_acl_osdep.h>
#ifdef __cplusplus
extern "C" {
#endif
#define RTE_ACL_MAX_CATEGORIES 16
#define RTE_ACL_RESULTS_MULTIPLIER (XMM_SIZE / sizeof(uint32_t))
#define RTE_ACL_MAX_LEVELS 64
#define RTE_ACL_MAX_FIELDS 64
union rte_acl_field_types {
uint8_t u8;
uint16_t u16;
uint32_t u32;
uint64_t u64;
};
enum {
RTE_ACL_FIELD_TYPE_MASK = 0,
RTE_ACL_FIELD_TYPE_RANGE,
RTE_ACL_FIELD_TYPE_BITMASK
};
/**
 * ACL Field definition.
 * Each field in the ACL rule has an associated definition.
* It defines the type of field, its size, its offset in the input buffer,
* the field index, and the input index.
* For performance reasons, the inner loop of the search function is unrolled
* to process four input bytes at a time. This requires the input to be grouped
* into sets of 4 consecutive bytes. The loop processes the first input byte as
* part of the setup and then subsequent bytes must be in groups of 4
* consecutive bytes.
*/
struct rte_acl_field_def {
uint8_t type; /**< type - RTE_ACL_FIELD_TYPE_*. */
	uint8_t size; /**< size of field: 1, 2, 4, or 8 bytes. */
uint8_t field_index; /**< index of field inside the rule. */
uint8_t input_index; /**< 0-N input index. */
uint32_t offset; /**< offset to start of field. */
};
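/*
 * Illustrative sketch: a minimal field set that respects the
 * 4-consecutive-byte grouping rule above - a 1-byte protocol field as
 * input 0, followed by a 4-byte IPv4 source address as input 1.
 * The names and offsets are hypothetical positions in the caller's
 * input buffer, not values defined by this header.
 *
 *	enum { PROTO_FIELD, SRC_FIELD, NUM_FIELDS };
 *
 *	struct rte_acl_field_def field_defs[NUM_FIELDS] = {
 *		{
 *			.type = RTE_ACL_FIELD_TYPE_BITMASK,
 *			.size = sizeof(uint8_t),
 *			.field_index = PROTO_FIELD,
 *			.input_index = 0,
 *			.offset = 0,
 *		},
 *		{
 *			.type = RTE_ACL_FIELD_TYPE_MASK,
 *			.size = sizeof(uint32_t),
 *			.field_index = SRC_FIELD,
 *			.input_index = 1,
 *			.offset = 4,
 *		},
 *	};
 */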
/**
* ACL build configuration.
 * Defines the fields of an ACL trie and the number of categories to build with.
*/
struct rte_acl_config {
uint32_t num_categories; /**< Number of categories to build with. */
uint32_t num_fields; /**< Number of field definitions. */
struct rte_acl_field_def defs[RTE_ACL_MAX_FIELDS];
/**< array of field definitions. */
};
/**
* Defines the value of a field for a rule.
*/
struct rte_acl_field {
union rte_acl_field_types value;
	/**< a 1, 2, 4, or 8 byte value of the field. */
union rte_acl_field_types mask_range;
/**<
* depending on field type:
* mask -> 1.2.3.4/32 value=0x1020304, mask_range=32,
* range -> 0 : 65535 value=0, mask_range=65535,
* bitmask -> 0x06/0xff value=6, mask_range=0xff.
*/
};
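/*
 * Sketch (illustrative only): filling the two fields of a rule that matches
 * protocol 0x06 (TCP) exactly and the 1.2.3.0/24 source network, using the
 * hypothetical PROTO_FIELD/SRC_FIELD indexes from the sketch above:
 *
 *	rule.field[PROTO_FIELD].value.u8 = 0x06;
 *	rule.field[PROTO_FIELD].mask_range.u8 = 0xff;
 *	rule.field[SRC_FIELD].value.u32 = 0x01020300;
 *	rule.field[SRC_FIELD].mask_range.u32 = 24;
 */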
enum {
RTE_ACL_TYPE_SHIFT = 29,
RTE_ACL_MAX_INDEX = LEN2MASK(RTE_ACL_TYPE_SHIFT),
RTE_ACL_MAX_PRIORITY = RTE_ACL_MAX_INDEX,
RTE_ACL_MIN_PRIORITY = 0,
};
#define RTE_ACL_INVALID_USERDATA 0
/**
* Miscellaneous data for ACL rule.
*/
struct rte_acl_rule_data {
uint32_t category_mask; /**< Mask of categories for that rule. */
int32_t priority; /**< Priority for that rule. */
	uint32_t userdata; /**< User data associated with the rule. */
};
/**
 * Defines a single ACL rule.
* data - miscellaneous data for the rule.
* field[] - value and mask or range for each field.
*/
#define RTE_ACL_RULE_DEF(name, fld_num) struct name {\
struct rte_acl_rule_data data; \
struct rte_acl_field field[fld_num]; \
}
RTE_ACL_RULE_DEF(rte_acl_rule, 0);
#define RTE_ACL_RULE_SZ(fld_num) \
(sizeof(struct rte_acl_rule) + sizeof(struct rte_acl_field) * (fld_num))
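/*
 * Sketch: an application would typically declare its own rule type with
 * RTE_ACL_RULE_DEF and size it with RTE_ACL_RULE_SZ; NUM_FIELDS is the
 * hypothetical field count from the sketches above:
 *
 *	RTE_ACL_RULE_DEF(app_acl_rule, NUM_FIELDS);
 *
 * RTE_ACL_RULE_SZ(NUM_FIELDS) would then be passed as the rule_size at
 * context creation time.
 */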
/** Max number of characters in name.*/
#define RTE_ACL_NAMESIZE 32
/**
* Parameters used when creating the ACL context.
*/
struct rte_acl_param {
const char *name; /**< Name of the ACL context. */
int socket_id; /**< Socket ID to allocate memory for. */
uint32_t rule_size; /**< Size of each rule. */
uint32_t max_rule_num; /**< Maximum number of rules. */
};
/**
* Create a new ACL context.
*
* @param param
* Parameters used to create and initialise the ACL context.
* @return
* Pointer to ACL context structure that is used in future ACL
* operations, or NULL on error, with error code set in rte_errno.
* Possible rte_errno errors include:
 *   - E_RTE_NO_TAILQ - no tailq list could be obtained for the ACL context list
* - EINVAL - invalid parameter passed to function
*/
struct rte_acl_ctx *
rte_acl_create(const struct rte_acl_param *param);
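/*
 * Creation sketch (illustrative; error handling trimmed, name and sizes are
 * made up for the example, NUM_FIELDS as in the sketches above):
 *
 *	struct rte_acl_param prm = {
 *		.name = "app_acl",
 *		.socket_id = SOCKET_ID_ANY,
 *		.rule_size = RTE_ACL_RULE_SZ(NUM_FIELDS),
 *		.max_rule_num = 1024,
 *	};
 *	struct rte_acl_ctx *acx = rte_acl_create(&prm);
 *	if (acx == NULL)
 *		handle the error reported in rte_errno
 */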
/**
* Find an existing ACL context object and return a pointer to it.
*
* @param name
* Name of the ACL context as passed to rte_acl_create()
* @return
* Pointer to ACL context or NULL if object not found
* with rte_errno set appropriately. Possible rte_errno values include:
 *   - ENOENT - no ACL context with the given name was found
*/
struct rte_acl_ctx *
rte_acl_find_existing(const char *name);
/**
* De-allocate all memory used by ACL context.
*
* @param ctx
* ACL context to free
*/
void
rte_acl_free(struct rte_acl_ctx *ctx);
/**
* Add rules to an existing ACL context.
* This function is not multi-thread safe.
*
* @param ctx
* ACL context to add patterns to.
* @param rules
* Array of rules to add to the ACL context.
* Note that all fields in rte_acl_rule structures are expected
* to be in host byte order.
 *   Each rule is expected to be in the same format and must not exceed the
 *   size specified at ACL context creation time.
* @param num
* Number of elements in the input array of rules.
* @return
* - -ENOMEM if there is no space in the ACL context for these rules.
* - -EINVAL if the parameters are invalid.
* - Zero if operation completed successfully.
*/
int
rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules,
uint32_t num);
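/*
 * Sketch of adding a single rule (illustrative; app_acl_rule, acx and the
 * field indexes come from the earlier sketches, ret is an int):
 *
 *	struct app_acl_rule r;
 *	memset(&r, 0, sizeof(r));
 *	r.data.category_mask = 1;	(match in category 0 only)
 *	r.data.priority = 1;
 *	r.data.userdata = 100;		(must not be RTE_ACL_INVALID_USERDATA)
 *	fill r.field[] as shown for struct rte_acl_field
 *	ret = rte_acl_add_rules(acx, (const struct rte_acl_rule *)&r, 1);
 */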
/**
* Delete all rules from the ACL context.
* This function is not multi-thread safe.
* Note that internal run-time structures are not affected.
*
* @param ctx
* ACL context to delete rules from.
*/
void
rte_acl_reset_rules(struct rte_acl_ctx *ctx);
/**
 * Analyze the set of rules and build the required internal run-time structures.
* This function is not multi-thread safe.
*
* @param ctx
* ACL context to build.
* @param cfg
* Pointer to struct rte_acl_config - defines build parameters.
* @return
* - -ENOMEM if couldn't allocate enough memory.
* - -EINVAL if the parameters are invalid.
* - Negative error code if operation failed.
* - Zero if operation completed successfully.
*/
int
rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg);
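/*
 * Build sketch (illustrative): one category, using the hypothetical
 * field_defs[] array from the field definition sketch above.
 *
 *	struct rte_acl_config cfg;
 *	memset(&cfg, 0, sizeof(cfg));
 *	cfg.num_categories = 1;
 *	cfg.num_fields = RTE_DIM(field_defs);
 *	memcpy(cfg.defs, field_defs, sizeof(field_defs));
 *	ret = rte_acl_build(acx, &cfg);
 */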
/**
* Delete all rules from the ACL context and
* destroy all internal run-time structures.
* This function is not multi-thread safe.
*
* @param ctx
* ACL context to reset.
*/
void
rte_acl_reset(struct rte_acl_ctx *ctx);
/**
* Search for a matching ACL rule for each input data buffer.
* Each input data buffer can have up to *categories* matches.
 * That implies that the results array should be big enough to hold
 * (categories * num) elements.
 * Also, the categories parameter should be either one or a multiple of
 * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES.
 * If more than one rule is applicable for a given input buffer and a given
 * category, then the rule with the highest priority is returned as the match.
 * Note that it is the caller's responsibility to ensure that the input
 * parameters are valid and point to correct memory locations.
*
* @param ctx
* ACL context to search with.
* @param data
* Array of pointers to input data buffers to perform search.
 *   Note that all fields in input data buffers are supposed to be in network
* byte order (MSB).
* @param results
* Array of search results, *categories* results per each input data buffer.
* @param num
* Number of elements in the input data buffers array.
* @param categories
 *   Maximum number of possible matches for each input buffer, one possible
* match per category.
* @return
* zero on successful completion.
* -EINVAL for incorrect arguments.
*/
int
rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories);
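/*
 * Classification sketch (illustrative): BURST input buffers, a single
 * category, results array sized num * categories. On return, results[i]
 * holds the userdata of the highest priority rule matching buffer i, or
 * RTE_ACL_INVALID_USERDATA when nothing matched.
 *
 *	const uint8_t *data[BURST];
 *	uint32_t results[BURST];
 *	point data[i] at the search key of packet i, in network byte order
 *	ret = rte_acl_classify(acx, data, results, BURST, 1);
 */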
/**
* Perform scalar search for a matching ACL rule for each input data buffer.
 * Note that while the search itself will avoid explicit use of SSE/AVX
 * intrinsics, code for comparing matching results/priorities still might use
* vector intrinsics (for categories > 1).
* Each input data buffer can have up to *categories* matches.
 * That implies that the results array should be big enough to hold
 * (categories * num) elements.
 * Also, the categories parameter should be either one or a multiple of
 * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES.
 * If more than one rule is applicable for a given input buffer and a given
 * category, then the rule with the highest priority is returned as the match.
 * Note that it is the caller's responsibility to ensure that the input
 * parameters are valid and point to correct memory locations.
*
* @param ctx
* ACL context to search with.
* @param data
* Array of pointers to input data buffers to perform search.
 *   Note that all fields in input data buffers are supposed to be in network
* byte order (MSB).
* @param results
* Array of search results, *categories* results per each input data buffer.
* @param num
* Number of elements in the input data buffers array.
* @param categories
 *   Maximum number of possible matches for each input buffer, one possible
* match per category.
* @return
* zero on successful completion.
* -EINVAL for incorrect arguments.
*/
int
rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data,
uint32_t *results, uint32_t num, uint32_t categories);
/**
* Dump an ACL context structure to the console.
*
* @param ctx
* ACL context to dump.
*/
void
rte_acl_dump(const struct rte_acl_ctx *ctx);
/**
* Dump all ACL context structures to the console.
*/
void
rte_acl_list_dump(void);
/**
* Legacy support for 7-tuple IPv4 and VLAN rule.
 * This structure and the corresponding API are deprecated.
*/
struct rte_acl_ipv4vlan_rule {
struct rte_acl_rule_data data; /**< Miscellaneous data for the rule. */
uint8_t proto; /**< IPv4 protocol ID. */
uint8_t proto_mask; /**< IPv4 protocol ID mask. */
uint16_t vlan; /**< VLAN ID. */
uint16_t vlan_mask; /**< VLAN ID mask. */
uint16_t domain; /**< VLAN domain. */
uint16_t domain_mask; /**< VLAN domain mask. */
uint32_t src_addr; /**< IPv4 source address. */
uint32_t src_mask_len; /**< IPv4 source address mask. */
uint32_t dst_addr; /**< IPv4 destination address. */
uint32_t dst_mask_len; /**< IPv4 destination address mask. */
uint16_t src_port_low; /**< L4 source port low. */
uint16_t src_port_high; /**< L4 source port high. */
uint16_t dst_port_low; /**< L4 destination port low. */
uint16_t dst_port_high; /**< L4 destination port high. */
};
/**
* Specifies fields layout inside rte_acl_rule for rte_acl_ipv4vlan_rule.
*/
enum {
RTE_ACL_IPV4VLAN_PROTO_FIELD,
RTE_ACL_IPV4VLAN_VLAN1_FIELD,
RTE_ACL_IPV4VLAN_VLAN2_FIELD,
RTE_ACL_IPV4VLAN_SRC_FIELD,
RTE_ACL_IPV4VLAN_DST_FIELD,
RTE_ACL_IPV4VLAN_SRCP_FIELD,
RTE_ACL_IPV4VLAN_DSTP_FIELD,
RTE_ACL_IPV4VLAN_NUM_FIELDS
};
/**
* Macro to define rule size for rte_acl_ipv4vlan_rule.
*/
#define RTE_ACL_IPV4VLAN_RULE_SZ \
RTE_ACL_RULE_SZ(RTE_ACL_IPV4VLAN_NUM_FIELDS)
/*
 * This effectively defines the order of IPV4VLAN classification inputs:
* - PROTO
* - VLAN (TAG and DOMAIN)
* - SRC IP ADDRESS
* - DST IP ADDRESS
* - PORTS (SRC and DST)
*/
enum {
RTE_ACL_IPV4VLAN_PROTO,
RTE_ACL_IPV4VLAN_VLAN,
RTE_ACL_IPV4VLAN_SRC,
RTE_ACL_IPV4VLAN_DST,
RTE_ACL_IPV4VLAN_PORTS,
RTE_ACL_IPV4VLAN_NUM
};
/**
* Add ipv4vlan rules to an existing ACL context.
* This function is not multi-thread safe.
*
* @param ctx
* ACL context to add patterns to.
* @param rules
* Array of rules to add to the ACL context.
* Note that all fields in rte_acl_ipv4vlan_rule structures are expected
* to be in host byte order.
* @param num
* Number of elements in the input array of rules.
* @return
* - -ENOMEM if there is no space in the ACL context for these rules.
* - -EINVAL if the parameters are invalid.
* - Zero if operation completed successfully.
*/
int
rte_acl_ipv4vlan_add_rules(struct rte_acl_ctx *ctx,
const struct rte_acl_ipv4vlan_rule *rules,
uint32_t num);
/**
 * Analyze the set of ipv4vlan rules and build the required internal
* run-time structures.
* This function is not multi-thread safe.
*
* @param ctx
* ACL context to build.
* @param layout
* Layout of input data to search through.
* @param num_categories
* Maximum number of categories to use in that build.
* @return
* - -ENOMEM if couldn't allocate enough memory.
* - -EINVAL if the parameters are invalid.
* - Negative error code if operation failed.
* - Zero if operation completed successfully.
*/
int
rte_acl_ipv4vlan_build(struct rte_acl_ctx *ctx,
const uint32_t layout[RTE_ACL_IPV4VLAN_NUM],
uint32_t num_categories);
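/*
 * Legacy API sketch (illustrative; the layout assumes a packed key with the
 * protocol at offset 0, VLAN tag and domain at 1, source/destination
 * addresses at 5 and 9, and the ports at 13):
 *
 *	uint32_t layout[RTE_ACL_IPV4VLAN_NUM] = {
 *		[RTE_ACL_IPV4VLAN_PROTO] = 0,
 *		[RTE_ACL_IPV4VLAN_VLAN] = 1,
 *		[RTE_ACL_IPV4VLAN_SRC] = 5,
 *		[RTE_ACL_IPV4VLAN_DST] = 9,
 *		[RTE_ACL_IPV4VLAN_PORTS] = 13,
 *	};
 *	ret = rte_acl_ipv4vlan_add_rules(acx, rules, num_rules);
 *	ret = rte_acl_ipv4vlan_build(acx, layout, 1);
 */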
#ifdef __cplusplus
}
#endif
#endif /* _RTE_ACL_H_ */

92
lib/librte_acl/rte_acl_osdep.h Normal file
View File

@ -0,0 +1,92 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_ACL_OSDEP_H_
#define _RTE_ACL_OSDEP_H_
/**
* @file
*
* RTE ACL DPDK/OS dependent file.
*/
#include <stdint.h>
#include <stddef.h>
#include <inttypes.h>
#include <limits.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <sys/queue.h>
/*
* Common defines.
*/
#define LEN2MASK(ln) ((uint32_t)(((uint64_t)1 << (ln)) - 1))
#define DIM(x) RTE_DIM(x)
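/*
 * For example (sketch): LEN2MASK(4) == 0xf and LEN2MASK(16) == 0xffff -
 * a mask with the 'ln' least significant bits set.
 */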
/*
* To build ACL standalone.
*/
#ifdef RTE_LIBRTE_ACL_STANDALONE
#include <rte_acl_osdep_alone.h>
#else
#include <rte_common.h>
#include <rte_common_vect.h>
#include <rte_memory.h>
#include <rte_log.h>
#include <rte_memcpy.h>
#include <rte_prefetch.h>
#include <rte_byteorder.h>
#include <rte_branch_prediction.h>
#include <rte_memzone.h>
#include <rte_malloc.h>
#include <rte_tailq.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_errno.h>
#include <rte_string_fns.h>
#include <rte_cpuflags.h>
#include <rte_log.h>
#include <rte_debug.h>
#endif /* RTE_LIBRTE_ACL_STANDALONE */
#endif /* _RTE_ACL_OSDEP_H_ */

278
lib/librte_acl/rte_acl_osdep_alone.h Normal file
View File

@ -0,0 +1,278 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _RTE_ACL_OSDEP_ALONE_H_
#define _RTE_ACL_OSDEP_ALONE_H_
/**
* @file
*
* RTE ACL OS dependent file.
 * An example of how to build/use the ACL library standalone
 * (without the rest of the DPDK).
 * Don't include this file on its own; use <rte_acl_osdep.h>.
*/
#if (defined(__ICC) || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#if defined(__SSE4_2__) || defined(__SSE4_1__)
#include <smmintrin.h>
#endif
#else
#include <x86intrin.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define DUMMY_MACRO do {} while (0)
/*
* rte_common related.
*/
#define __rte_unused __attribute__((__unused__))
#define RTE_PTR_ADD(ptr, x) ((typeof(ptr))((uintptr_t)(ptr) + (x)))
#define RTE_PTR_ALIGN_FLOOR(ptr, align) \
(typeof(ptr))((uintptr_t)(ptr) & ~((uintptr_t)(align) - 1))
#define RTE_PTR_ALIGN_CEIL(ptr, align) \
RTE_PTR_ALIGN_FLOOR(RTE_PTR_ADD(ptr, (align) - 1), align)
#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align)
#define RTE_ALIGN_FLOOR(val, align) \
(typeof(val))((val) & (~((typeof(val))((align) - 1))))
#define RTE_ALIGN_CEIL(val, align) \
RTE_ALIGN_FLOOR(((val) + ((typeof(val))(align) - 1)), align)
#define RTE_ALIGN(ptr, align) RTE_ALIGN_CEIL(ptr, align)
#define RTE_MIN(a, b) ({ \
typeof(a) _a = (a); \
typeof(b) _b = (b); \
_a < _b ? _a : _b; \
})
#define RTE_DIM(a) (sizeof(a) / sizeof((a)[0]))
/**
* Searches the input parameter for the least significant set bit
* (starting from zero).
* If a least significant 1 bit is found, its bit index is returned.
 * If the content of the input parameter is zero, then the content of the return
* value is undefined.
* @param v
* input parameter, should not be zero.
* @return
* least significant set bit in the input parameter.
*/
static inline uint32_t
rte_bsf32(uint32_t v)
{
asm("bsf %1,%0"
: "=r" (v)
: "rm" (v));
return v;
}
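/*
 * For example (sketch): rte_bsf32(0x8) == 3 and rte_bsf32(0x6) == 1;
 * rte_bsf32(0) is undefined, as noted above.
 */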
/*
* rte_common_vect related.
*/
typedef __m128i xmm_t;
#define XMM_SIZE (sizeof(xmm_t))
#define XMM_MASK (XMM_SIZE - 1)
typedef union rte_mmsse {
xmm_t m;
uint8_t u8[XMM_SIZE / sizeof(uint8_t)];
uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
double pd[XMM_SIZE / sizeof(double)];
} rte_xmm_t;
/*
* rte_cycles related.
*/
static inline uint64_t
rte_rdtsc(void)
{
union {
uint64_t tsc_64;
struct {
uint32_t lo_32;
uint32_t hi_32;
};
} tsc;
asm volatile("rdtsc" :
"=a" (tsc.lo_32),
"=d" (tsc.hi_32));
return tsc.tsc_64;
}
/*
* rte_lcore related.
*/
#define rte_lcore_id() (0)
/*
* rte_errno related.
*/
#define rte_errno errno
#define E_RTE_NO_TAILQ (-1)
/*
* rte_rwlock related.
*/
#define rte_rwlock_read_lock(x) DUMMY_MACRO
#define rte_rwlock_read_unlock(x) DUMMY_MACRO
#define rte_rwlock_write_lock(x) DUMMY_MACRO
#define rte_rwlock_write_unlock(x) DUMMY_MACRO
/*
* rte_memory related.
*/
#define SOCKET_ID_ANY -1 /**< Any NUMA socket. */
#define CACHE_LINE_SIZE 64 /**< Cache line size. */
#define CACHE_LINE_MASK (CACHE_LINE_SIZE-1) /**< Cache line mask. */
/**
* Force alignment to cache line.
*/
#define __rte_cache_aligned __attribute__((__aligned__(CACHE_LINE_SIZE)))
/*
* rte_byteorder related.
*/
#define rte_le_to_cpu_16(x) (x)
#define rte_le_to_cpu_32(x) (x)
#define rte_cpu_to_be_16(x) \
(((x) & UINT8_MAX) << CHAR_BIT | ((x) >> CHAR_BIT & UINT8_MAX))
#define rte_cpu_to_be_32(x) __builtin_bswap32(x)
/*
* rte_branch_prediction related.
*/
#ifndef likely
#define likely(x) __builtin_expect((x), 1)
#endif /* likely */
#ifndef unlikely
#define unlikely(x) __builtin_expect((x), 0)
#endif /* unlikely */
/*
* rte_tailq related.
*/
static inline void *
rte_dummy_tailq(void)
{
static __thread TAILQ_HEAD(rte_dummy_head, rte_dummy) dummy_head;
TAILQ_INIT(&dummy_head);
return &dummy_head;
}
#define RTE_TAILQ_LOOKUP_BY_IDX(idx, struct_name) rte_dummy_tailq()
#define RTE_EAL_TAILQ_REMOVE(idx, type, elm) DUMMY_MACRO
/*
* rte_string related
*/
#define rte_snprintf(str, len, frmt, args...) snprintf(str, len, frmt, ##args)
/*
* rte_log related
*/
#define RTE_LOG(l, t, fmt, args...) printf(fmt, ##args)
/*
* rte_malloc related
*/
#define rte_free(x) free(x)
static inline void *
rte_zmalloc_socket(__rte_unused const char *type, size_t size, unsigned align,
__rte_unused int socket)
{
void *ptr;
int rc;
rc = posix_memalign(&ptr, align, size);
if (rc != 0) {
rte_errno = rc;
return NULL;
}
memset(ptr, 0, size);
return ptr;
}
/*
* rte_debug related
*/
#define rte_panic(fmt, args...) do { \
RTE_LOG(CRIT, EAL, fmt, ##args); \
abort(); \
} while (0)
#define rte_exit(err, fmt, args...) do { \
RTE_LOG(CRIT, EAL, fmt, ##args); \
exit(err); \
} while (0)
#ifdef __cplusplus
}
#endif
#endif /* _RTE_ACL_OSDEP_ALONE_H_ */

104
lib/librte_acl/tb_mem.c Normal file
View File

@ -0,0 +1,104 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tb_mem.h"
/*
 * Memory management routines for temporary memory.
 * That memory is used only during the build phase and is released after
 * the build is finished.
*/
static struct tb_mem_block *
tb_pool(struct tb_mem_pool *pool, size_t sz)
{
struct tb_mem_block *block;
uint8_t *ptr;
size_t size;
size = sz + pool->alignment - 1;
block = calloc(1, size + sizeof(*pool->block));
if (block == NULL) {
		RTE_LOG(ERR, MALLOC, "%s(%zu) failed, currently allocated "
			"by pool: %zu bytes\n", __func__, sz, pool->alloc);
return NULL;
}
block->pool = pool;
block->next = pool->block;
pool->block = block;
pool->alloc += size;
ptr = (uint8_t *)(block + 1);
block->mem = RTE_PTR_ALIGN_CEIL(ptr, pool->alignment);
block->size = size - (block->mem - ptr);
return block;
}
void *
tb_alloc(struct tb_mem_pool *pool, size_t size)
{
struct tb_mem_block *block;
void *ptr;
size_t new_sz;
size = RTE_ALIGN_CEIL(size, pool->alignment);
block = pool->block;
if (block == NULL || block->size < size) {
new_sz = (size > pool->min_alloc) ? size : pool->min_alloc;
block = tb_pool(pool, new_sz);
if (block == NULL)
return NULL;
}
ptr = block->mem;
block->size -= size;
block->mem += size;
return ptr;
}
void
tb_free_pool(struct tb_mem_pool *pool)
{
struct tb_mem_block *next, *block;
for (block = pool->block; block != NULL; block = next) {
next = block->next;
free(block);
}
pool->block = NULL;
pool->alloc = 0;
}

73
lib/librte_acl/tb_mem.h Normal file
View File

@ -0,0 +1,73 @@
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _TB_MEM_H_
#define _TB_MEM_H_
/**
* @file
*
 * RTE ACL temporary (build phase) memory management.
 * Contains structures and functions to manage temporary (used by build only)
 * memory. Memory is allocated in large blocks to speed up 'free' when the
 * trie is destroyed (at the end of the build phase).
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <rte_acl_osdep.h>
struct tb_mem_block {
struct tb_mem_block *next;
struct tb_mem_pool *pool;
size_t size;
uint8_t *mem;
};
struct tb_mem_pool {
struct tb_mem_block *block;
size_t alignment;
size_t min_alloc;
size_t alloc;
};
void *tb_alloc(struct tb_mem_pool *pool, size_t size);
void tb_free_pool(struct tb_mem_pool *pool);
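/*
 * Usage sketch (illustrative values): the pool is zero-initialised by the
 * caller except for alignment and min_alloc; every tb_alloc() result is
 * released in one go by tb_free_pool().
 *
 *	struct tb_mem_pool pool = {
 *		.alignment = 32,
 *		.min_alloc = 4096,
 *	};
 *	void *p = tb_alloc(&pool, 128);
 *	use p during the build phase
 *	tb_free_pool(&pool);
 */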
#ifdef __cplusplus
}
#endif
#endif /* _TB_MEM_H_ */