Compare commits

...

50 Commits
dev ... master

Author SHA1 Message Date
3320852dd5 sandybridge doesn't support clflushopt 2023-12-06 04:22:46 +08:00
76a41666a0 fix dpdk 2023-12-06 03:38:32 +08:00
b57fe6e5ea akh morn 2023-12-06 03:23:00 +08:00
oscar
fc687426ae stuff 2023-05-01 15:28:51 -04:00
quackerd
aba80e8869 stuff 2023-05-01 21:18:34 +02:00
quackerd
1a90104d53 minor fix 2023-03-29 22:00:59 +02:00
quackerd
59b8c36ced multiarch 2023-03-17 21:13:05 +01:00
quackerd
4effb3f1bd multiarch 2023-03-16 09:43:34 +01:00
oscar
bb9792cf06 memloadgen allocate memory in thread 2023-03-15 19:44:46 -04:00
oscar
a385866002 memloadgen allocate memory in thread 2023-03-15 19:10:52 -04:00
oscar
7e4fd3d721 memloadgen allocate memory in thread 2023-03-15 19:07:36 -04:00
oscar
05965dbb94 memloadgen allocate memory in thread 2023-03-15 18:43:37 -04:00
quackerd
25c18b4fc5 stdin based pct control 2023-03-05 16:48:54 +01:00
quackerd
28d469e8ff better printing 2023-03-05 15:59:42 +01:00
quackerd
6cd0e7d12f add signal control 2023-03-05 15:58:06 +01:00
quackerd
521a49d945 add magic number 2023-03-05 15:15:13 +01:00
quackerd
a9cac61069 cleanup and stuff 2023-01-04 17:25:32 +01:00
quackerd
f20ae16e31 temp commit 2022-12-14 20:52:12 +01:00
quackerd
2a543d7e4d iperf 2022-11-30 20:37:51 +01:00
quackerd
a3b7b7db5d iperf 2022-11-26 00:08:26 +01:00
quackerd
5e76edab89 useless but useful check 2022-11-24 10:11:14 +01:00
oscar
d0c7329f9f iperf 2022-11-23 20:05:48 -05:00
quackerd
4ff2de5d1e dpdk refactor 2022-11-22 16:27:27 +01:00
quackerd
933e9708f3 refactor iperf conf to human readable 2022-11-22 13:58:33 +01:00
quackerd
e85928e3f5 iperf script change 2022-11-21 22:52:13 +01:00
quackerd
df880a453c new scripts 2022-11-18 09:27:04 +01:00
oscar
b5be9c38fe memloadgen 2022-11-16 15:37:39 -05:00
quackerd
18339fb109 memloadgen pct support 2022-11-16 08:44:43 +01:00
quackerd
1836bd89df memloadgen rate control 2022-11-11 22:11:50 +01:00
quackerd
075902ba1d add break 2022-11-01 11:27:34 +01:00
quackerd
68b621fd3c snapshot memloadgen transaction change 2022-11-01 11:01:23 +01:00
565dbca278 latest dpdk & refactoring 2022-06-22 23:40:48 +08:00
a716583b19 update various components for new machines 2022-05-25 06:55:01 -04:00
d217bde46a bug fix 2022-03-29 00:50:10 +08:00
6e7e152915 posix support 2022-03-29 00:47:46 +08:00
0d26960686 nvme support 2022-03-21 23:01:24 +08:00
186150ca00 fixed hardcoded exit 2022-03-21 19:45:42 +08:00
27c6cd188d device driver abstraction 2022-03-21 19:43:49 +08:00
2ecfacff11 spdk 2022-03-20 22:17:26 +08:00
0dc463ba35 memload generator 2022-02-21 21:41:40 +08:00
997587c519 temp save 2021-03-17 21:45:01 -04:00
cd4785f08a add mem region support for nm malloc 2021-03-04 02:25:34 -05:00
4d50e55e1e +fix workload gen 2021-03-04 01:54:13 -05:00
7fd7c7f776 +libnm refactor and numa allocator support.
+khat threads now have numa-local memory.
2021-03-03 22:22:06 -05:00
b85777e6f0 +stuff? 2021-02-23 13:12:27 -05:00
162d41a4cc + cat packet loss control and max packet loss tolerance \ + output and parse packet loss for master and slaves 2021-02-22 06:54:53 -05:00
1fd9be7f13
+ packet loss control & + packet depth control 2021-02-21 05:16:39 -05:00
d1e43dcf2f
+Bench scripts 2021-02-20 04:53:55 -05:00
06b93ddf1c memload gen
Summary: Add memload generator

Test Plan: by hand

Reviewers: ali

Differential Revision: https://review.rcs.uwaterloo.ca/D415
2021-02-16 05:15:11 -05:00
f655e5f5cb Initial commit of benchmarks
Summary:
+ UDP and PTP over UDP & hw timestamping
+ Khat protocol
+ Rat protocol
+ Nanosecond timestamping
+ Load generation
+ NUMA detection library
+ Test scripts
+ Server & Client multi threading & tx/rx queues
+ RSS on all packets w/ randomized L4 ports

Test Plan: by hand

Reviewers: ali

Reviewed By: ali

Differential Revision: https://review.rcs.uwaterloo.ca/D408
2021-02-10 14:12:47 -05:00
55 changed files with 9183 additions and 1226 deletions

3
.arcconfig Normal file
View File

@ -0,0 +1,3 @@
{
"phabricator.uri" : "https://review.rcs.uwaterloo.ca/"
}

198
.clang-format Normal file
View File

@ -0,0 +1,198 @@
# $FreeBSD$
# Basic .clang-format
---
BasedOnStyle: WebKit
AlignAfterOpenBracket: DontAlign
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: false
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: InlineOnly
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeTernaryOperators: false
# TODO: BreakStringLiterals can cause very strange formatting so turn it off?
BreakStringLiterals: false
# Prefer:
# some_var = function(arg1,
# arg2)
# over:
# some_var =
# function(arg1, arg2)
PenaltyBreakAssignment: 100
# Prefer:
# some_long_function(arg1, arg2,
# arg3)
# over:
# some_long_function(
# arg1, arg2, arg3)
PenaltyBreakBeforeFirstCallParameter: 100
CompactNamespaces: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros:
- ARB_ARRFOREACH
- ARB_ARRFOREACH_REVWCOND
- ARB_ARRFOREACH_REVERSE
- ARB_FOREACH
- ARB_FOREACH_FROM
- ARB_FOREACH_SAFE
- ARB_FOREACH_REVERSE
- ARB_FOREACH_REVERSE_FROM
- ARB_FOREACH_REVERSE_SAFE
- BIT_FOREACH_ISCLR
- BIT_FOREACH_ISSET
- CPU_FOREACH
- CPU_FOREACH_ISCLR
- CPU_FOREACH_ISSET
- FOREACH_THREAD_IN_PROC
- FOREACH_PROC_IN_SYSTEM
- FOREACH_PRISON_CHILD
- FOREACH_PRISON_DESCENDANT
- FOREACH_PRISON_DESCENDANT_LOCKED
- FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL
- MNT_VNODE_FOREACH_ALL
- MNT_VNODE_FOREACH_ACTIVE
- RB_FOREACH
- RB_FOREACH_FROM
- RB_FOREACH_SAFE
- RB_FOREACH_REVERSE
- RB_FOREACH_REVERSE_FROM
- RB_FOREACH_REVERSE_SAFE
- SLIST_FOREACH
- SLIST_FOREACH_FROM
- SLIST_FOREACH_FROM_SAFE
- SLIST_FOREACH_SAFE
- SLIST_FOREACH_PREVPTR
- SPLAY_FOREACH
- LIST_FOREACH
- LIST_FOREACH_FROM
- LIST_FOREACH_FROM_SAFE
- LIST_FOREACH_SAFE
- STAILQ_FOREACH
- STAILQ_FOREACH_FROM
- STAILQ_FOREACH_FROM_SAFE
- STAILQ_FOREACH_SAFE
- TAILQ_FOREACH
- TAILQ_FOREACH_FROM
- TAILQ_FOREACH_FROM_SAFE
- TAILQ_FOREACH_REVERSE
- TAILQ_FOREACH_REVERSE_FROM
- TAILQ_FOREACH_REVERSE_FROM_SAFE
- TAILQ_FOREACH_REVERSE_SAFE
- TAILQ_FOREACH_SAFE
- VM_MAP_ENTRY_FOREACH
- VM_PAGE_DUMP_FOREACH
IndentCaseLabels: false
IndentPPDirectives: None
Language: Cpp
NamespaceIndentation: None
PointerAlignment: Right
ContinuationIndentWidth: 4
IndentWidth: 8
TabWidth: 8
ColumnLimit: 80
UseTab: Always
SpaceAfterCStyleCast: false
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^\"opt_.*\.h\"'
Priority: 1
SortPriority: 10
- Regex: '^<sys/cdefs\.h>'
Priority: 2
SortPriority: 20
- Regex: '^<sys/types\.h>'
Priority: 2
SortPriority: 21
- Regex: '^<sys/param\.h>'
Priority: 2
SortPriority: 22
- Regex: '^<sys/systm\.h>'
Priority: 2
SortPriority: 23
- Regex: '^<sys.*/'
Priority: 2
SortPriority: 24
- Regex: '^<vm/vm\.h>'
Priority: 3
SortPriority: 30
- Regex: '^<vm/'
Priority: 3
SortPriority: 31
- Regex: '^<machine/'
Priority: 4
SortPriority: 40
- Regex: '^<(x86|amd64|i386|xen)/'
Priority: 5
SortPriority: 50
- Regex: '^<dev/'
Priority: 6
SortPriority: 60
- Regex: '^<net.*/'
Priority: 7
SortPriority: 70
- Regex: '^<protocols/'
Priority: 7
SortPriority: 71
- Regex: '^<(fs|nfs(|client|server)|ufs)/'
Priority: 8
SortPriority: 80
- Regex: '^<[^/].*\.h'
Priority: 9
SortPriority: 90
- Regex: '^\".*\.h\"'
Priority: 10
SortPriority: 100
# LLVM's header include ordering style is almost the exact opposite of ours.
# Unfortunately, they have hard-coded their preferences into clang-format.
# Clobbering this regular expression to avoid matching prevents non-system
# headers from being forcibly moved to the top of the include list.
# http://llvm.org/docs/CodingStandards.html#include-style
IncludeIsMainRegex: 'BLAH_DONT_MATCH_ANYTHING'
SortIncludes: true
KeepEmptyLinesAtTheStartOfBlocks: true
TypenameMacros:
- ARB_ELMTYPE
- ARB_HEAD
- ARB8_HEAD
- ARB16_HEAD
- ARB32_HEAD
- ARB_ENTRY
- ARB8_ENTRY
- ARB16_ENTRY
- ARB32_ENTRY
- LIST_CLASS_ENTRY
- LIST_CLASS_HEAD
- LIST_ENTRY
- LIST_HEAD
- QUEUE_TYPEOF
- RB_ENTRY
- RB_HEAD
- SLIST_CLASS_HEAD
- SLIST_CLASS_ENTRY
- SLIST_HEAD
- SLIST_ENTRY
- SMR_POINTER
- SPLAY_ENTRY
- SPLAY_HEAD
- STAILQ_CLASS_ENTRY
- STAILQ_CLASS_HEAD
- STAILQ_ENTRY
- STAILQ_HEAD
- TAILQ_CLASS_ENTRY
- TAILQ_CLASS_HEAD
- TAILQ_ENTRY
- TAILQ_HEAD

1
.clang-tidy Normal file
View File

@ -0,0 +1 @@
Checks: "-*,clang-diagnostic-*,clang-analyzer-*,modernize*,performance*,-modernize-use-trailing-return-type,-modernize-avoid-c-arrays"

274
.gitignore vendored Normal file
View File

@ -0,0 +1,274 @@
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
################ C STUFF ##########################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
# Prerequisites
*.d
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
################ PYTHON STUFF ##########################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
################ C++ STUFF ##########################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.clangd
compile_commands.json

0
.gitmodules vendored Normal file
View File

View File

@ -1,33 +1,86 @@
cmake_minimum_required(VERSION 3.0)
find_program(CC_GCC gcc)
find_program(CXX_GCC g++)
set(CMAKE_C_COMPILER ${CC_GCC})
set(CMAKE_CXX_COMPILER ${CXX_GCC})
project(khat)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}")
find_package(dpdk REQUIRED)
find_package(PkgConfig REQUIRED)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY bin)
pkg_check_modules(DPDK libdpdk)
pkg_check_modules(SPDK spdk_event_bdev spdk_env_dpdk)
pkg_check_modules(SPDK_SYS spdk_syslibs)
pkg_check_modules(UUID uuid)
pkg_check_modules(TOPO bsdtopo)
set(CC_FLAGS -O2 -g -Wall -Wextra -Werror -std=c++11
-Wno-deprecated-declarations
-Wno-packed-not-aligned
-Wno-address-of-packed-member
-msse4)
-Wno-zero-length-array
-Wno-gnu-zero-variadic-macro-arguments
-march=native)
set(C_FLAGS -O2 -g -Wall -Wextra -Werror -std=c2x
-Wno-deprecated-declarations
-Wno-address-of-packed-member
-Wno-zero-length-array
-Wno-gnu-zero-variadic-macro-arguments
-march=native)
include_directories(${CMAKE_SOURCE_DIR}/inc)
include_directories(${dpdk_INCLUDE_DIRS})
include_directories()
add_executable(khat khat/khat.cc)
add_executable(cat cat/cat.cc)
set(LIBNTR_C_FLAGS -O3 -g -Wall -Wextra -Werror -std=c2x)
set(LIBGEN_CC_FLAGS -O3 -g -Wall -Wextra -Werror -std=c++17)
set(LINK_LIBS ${dpdk_LIBRARIES} pthread)
add_library(ntr SHARED libntr/ntr.c)
target_compile_options(ntr PRIVATE ${LIBNTR_C_FLAGS})
target_link_libraries(khat ${LINK_LIBS})
target_compile_options(khat PRIVATE ${CC_FLAGS})
add_library(gen SHARED libgen/generator.cc libgen/loadgen.cc)
target_link_libraries(gen PRIVATE pthread ntr ${TOPO_LINK_LIBRARIES} nms)
target_compile_options(gen PRIVATE ${LIBGEN_CC_FLAGS} ${TOPO_CFLAGS})
target_link_libraries(cat ${LINK_LIBS})
target_compile_options(cat PRIVATE ${CC_FLAGS})
add_library(netsup SHARED net/libnetsup/dpdk.cc net/libnetsup/portconf.cc)
target_link_libraries(netsup PRIVATE ntr ${DPDK_LINK_LIBRARIES})
target_compile_options(netsup PRIVATE ${LIBGEN_CC_FLAGS} ${DPDK_CFLAGS})
add_library(nms SHARED libnms/alloc.c)
target_link_libraries(nms PRIVATE ${TOPO_LINK_LIBRARIES})
target_compile_options(nms PRIVATE ${TOPO_CFLAGS})
add_executable(khat EXCLUDE_FROM_ALL net/khat.cc)
target_link_libraries(khat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(khat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(cat EXCLUDE_FROM_ALL net/cat.cc)
target_link_libraries(cat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(cat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(rat EXCLUDE_FROM_ALL net/rat.cc)
target_link_libraries(rat PRIVATE pthread ntr gen netsup nms ${DPDK_LINK_LIBRARIES} ${TOPO_LINK_LIBRARIES})
target_compile_options(rat PRIVATE ${CC_FLAGS} ${DPDK_CFLAGS} ${TOPO_CFLAGS})
add_executable(birb EXCLUDE_FROM_ALL storage/birb.cc storage/io_gen.cc storage/drivers/bdev.cc storage/drivers/bdev_thread.cc storage/drivers/nvme.cc storage/drivers/nvme_thread.cc)
target_include_directories(birb PRIVATE ${SPDK_INCLUDE_DIRS} ${DPDK_INCLUDE_DIRS} ${UUID_INCLUDE_DIRS})
target_compile_options(birb PRIVATE ${CC_FLAGS} ${SPDK_CFLAGS} ${UUID_CFLAGS})
target_link_directories(birb PRIVATE ${SPDK_LIBRARY_DIRS} ${SPDK_SYS_STATIC_LIBRARY_DIRS} ${UUID_LIBRARY_DIRS})
target_link_libraries(birb PRIVATE pthread ntr gen -Wl,--whole-archive ${SPDK_LIBRARIES} -Wl,--no-whole-archive ${SPDK_SYS_STATIC_LIBRARIES})
add_executable(birb_posix EXCLUDE_FROM_ALL storage/birb_posix.cc storage/io_gen.cc)
target_compile_options(birb_posix PRIVATE ${CC_FLAGS})
target_link_libraries(birb_posix PRIVATE pthread ntr gen)
add_executable(memloadgen util/memloadgen.cc)
target_link_libraries(memloadgen PRIVATE pthread gen ntr nms ${TOPO_LINK_LIBRARIES})
target_compile_options(memloadgen PRIVATE ${CC_FLAGS} ${TOPO_CFLAGS})
add_executable(mornafah util/mornafah.c)
target_link_libraries(mornafah PRIVATE pthread gen ntr nms ${TOPO_LINK_LIBRARIES})
target_compile_options(mornafah PRIVATE ${C_FLAGS} ${TOPO_CFLAGS})
add_executable(nms_test tests/nms_test.c)
set_target_properties(nms_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tests)
target_link_libraries(nms_test PRIVATE nms)
target_compile_options(nms_test PRIVATE ${C_FLAGS})

View File

@ -1,142 +0,0 @@
# Try to find dpdk
#
# Once done, this will define
#
# dpdk::dpdk
# dpdk_FOUND
# dpdk_INCLUDE_DIRS
# dpdk_LIBRARIES
# Ask pkg-config for libdpdk first (quietly); when it answers, the
# dpdk_INCLUDE_DIRS / dpdk_LIBRARY_DIRS variables it sets are reused below.
find_package(PkgConfig QUIET)
if(PKG_CONFIG_FOUND)
pkg_check_modules(dpdk QUIET libdpdk)
endif()
# Determine dpdk_INCLUDE_DIRS, in order of preference:
#   1. already set (e.g. by pkg-config),
#   2. taken from an existing dpdk::dpdk target,
#   3. searched for manually, honouring the DPDK_DIR environment variable.
if(dpdk_INCLUDE_DIRS)
  # good
elseif(TARGET dpdk::dpdk)
  get_target_property(dpdk_INCLUDE_DIRS
    dpdk::dpdk INTERFACE_INCLUDE_DIRECTORIES)
else()
  find_path(dpdk_config_INCLUDE_DIR rte_config.h
    HINTS
      ENV DPDK_DIR
    PATH_SUFFIXES
      dpdk
      include)
  # "ENV DPDK_DIR" (not "ENC"): otherwise the hint is treated as two literal
  # relative paths and the environment variable is ignored for this header.
  find_path(dpdk_common_INCLUDE_DIR rte_common.h
    HINTS
      ENV DPDK_DIR
    PATH_SUFFIXES
      dpdk
      include)
  set(dpdk_INCLUDE_DIRS "${dpdk_config_INCLUDE_DIR}")
  # Paths are strings: EQUAL compares numerically and never matches a path,
  # which caused the same directory to be appended a second time.
  if(NOT dpdk_config_INCLUDE_DIR STREQUAL dpdk_common_INCLUDE_DIR)
    list(APPEND dpdk_INCLUDE_DIRS "${dpdk_common_INCLUDE_DIR}")
  endif()
endif()
# DPDK component names; the loop below looks each one up as library
# "rte_<name>" and exposes it as imported target "dpdk::<name>".
set(components
bus_pci
bus_vdev
cfgfile
cmdline
eal
ethdev
hash
kvargs
mbuf
mempool
mempool_ring
mempool_stack
net
pci
pmd_af_packet
pmd_bnxt
pmd_bond
pmd_cxgbe
pmd_e1000
pmd_ena
pmd_enic
pmd_i40e
pmd_ixgbe
pmd_mlx5
pmd_nfp
pmd_qede
pmd_ring
pmd_sfc_efx
pmd_vmxnet3_uio
ring
timer)
# for collecting dpdk library targets, it will be used when defining dpdk::dpdk
set(_dpdk_libs)
# for list of dpdk library archive paths
set(dpdk_LIBRARIES)
# Resolve every component to a library file. Components whose library cannot
# be found are silently skipped (nothing is appended for them).
foreach(c ${components})
set(dpdk_lib dpdk::${c})
if(TARGET ${dpdk_lib})
# Target already exists: reuse the location recorded on it.
get_target_property(DPDK_rte_${c}_LIBRARY
${dpdk_lib} IMPORTED_LOCATION)
else()
find_library(DPDK_rte_${c}_LIBRARY rte_${c}
HINTS
ENV DPDK_DIR
${dpdk_LIBRARY_DIRS}
PATH_SUFFIXES lib)
endif()
if(DPDK_rte_${c}_LIBRARY)
if (NOT TARGET ${dpdk_lib})
add_library(${dpdk_lib} UNKNOWN IMPORTED)
set_target_properties(${dpdk_lib} PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${dpdk_INCLUDE_DIRS}"
IMPORTED_LOCATION "${DPDK_rte_${c}_LIBRARY}")
# The mlx5 PMD additionally links IBVerbs::verbs when the verbs package is found.
if(c STREQUAL pmd_mlx5)
find_package(verbs QUIET)
if(verbs_FOUND)
target_link_libraries(${dpdk_lib} INTERFACE IBVerbs::verbs)
endif()
endif()
endif()
list(APPEND _dpdk_libs ${dpdk_lib})
list(APPEND dpdk_LIBRARIES ${DPDK_rte_${c}_LIBRARY})
endif()
endforeach()
# NOTE(review): ${dpdk_LIBRARIES} expands to library *paths*, while
# mark_as_advanced() expects variable names - confirm this is intended.
mark_as_advanced(dpdk_INCLUDE_DIRS ${dpdk_LIBRARIES})
# Sets dpdk_FOUND once both the include dirs and the library list are non-empty.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(dpdk DEFAULT_MSG
dpdk_INCLUDE_DIRS
dpdk_LIBRARIES)
# When DPDK was found, define the two aggregate interface targets
# (dpdk::cflags and dpdk::dpdk) unless they already exist.
if(dpdk_FOUND)
if(NOT TARGET dpdk::cflags)
# Pick a -march flag from the target processor; rte_cflags stays unset for
# any processor not matched here.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
set(rte_cflags "-march=core2")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|ARM")
set(rte_cflags "-march=armv7-a")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
set(rte_cflags "-march=armv8-a+crc")
endif()
add_library(dpdk::cflags INTERFACE IMPORTED)
if (rte_cflags)
set_target_properties(dpdk::cflags PROPERTIES
INTERFACE_COMPILE_OPTIONS "${rte_cflags}")
endif()
endif()
if(NOT TARGET dpdk::dpdk)
# dpdk::dpdk links every component target found above plus threads and cflags.
add_library(dpdk::dpdk INTERFACE IMPORTED)
find_package(Threads QUIET)
list(APPEND _dpdk_libs
Threads::Threads
dpdk::cflags)
set_target_properties(dpdk::dpdk PROPERTIES
INTERFACE_LINK_LIBRARIES "${_dpdk_libs}"
INTERFACE_INCLUDE_DIRECTORIES "${dpdk_INCLUDE_DIRS}")
endif()
endif()
unset(_dpdk_libs)

View File

@ -1,444 +0,0 @@
#include <cstdio>
#include <ctime>
#include <netinet/in.h>
#include <rte_config.h>
#include <rte_common.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_log.h>
#include <atomic>
#include <vector>
#include <fstream>
#include <unistd.h>
#include "ntrlog.h"
#include "pkt.h"
#include "rte_byteorder.h"
#include "rte_ip.h"
// init NTRLOG
NTR_DECL_IMPL;
// mbufs per port in each pktmbuf pool (main() multiplies this by the port count)
constexpr unsigned int MBUF_MAX_COUNT = 8191;
// cache_size argument handed to rte_pktmbuf_pool_create()
constexpr unsigned int MBUF_CACHE_SIZE = 250;
// requested RX/TX descriptor counts; port_init() lets the driver adjust them
// through rte_eth_dev_adjust_nb_rx_tx_desc()
constexpr unsigned int RX_RING_SIZE = 1024;
constexpr unsigned int TX_RING_SIZE = 1024;
// number of RX/TX queues configured on the port
constexpr unsigned int RX_RING_NUM = 1;
constexpr unsigned int TX_RING_NUM = 1;
// maximum number of packets fetched by one rte_eth_rx_burst() call
constexpr unsigned int BURST_SIZE = 32;
// value-initialized base configuration that port_init() copies and then edits
static const struct rte_eth_conf port_conf_default{};
// One latency sample recorded by locore_main() for a matched reply packet.
struct datapt{
// srv_ts_tx - srv_ts_rx as carried in the reply packet
uint64_t server_proc = 0;
// clt_ts_rx - clt_ts_tx; both are rte_rdtsc() values stamped by the RX/TX callbacks
uint64_t rtt = 0;
};
// Command-line configuration plus the run-time state shared between main()
// and the worker lcore (fields prefixed with s_).
struct options_t {
// -t: number of one-second timer ticks to run after the warmup
unsigned int run_time = 5;
// -T: number of one-second timer ticks before samples are recorded
unsigned int warmup_time = 0;
// -o: path of the file the samples are written to
char output[256] = "output.txt";
// -s: MAC address passed as the destination when building the probe packet
struct rte_ether_addr server_mac;
// states
// set by main() to make locore_main() leave its loop
std::atomic<bool> s_stop {false};
// set by main() once the warmup has elapsed; gates sample recording
std::atomic<bool> s_record {false};
// samples appended by locore_main(); written out and deleted by main()
std::vector<struct datapt *> s_stats;
// pool the probe packet (tx_buf) is allocated from
struct rte_mempool * s_mbuf_pool;
// port used for both transmit and receive
uint16_t s_portid;
// MAC address of s_portid, used as the source address of the probe packet
struct rte_ether_addr s_host_mac;
};
// single global instance used by every function in this file
struct options_t options;
/*
 * RX callback (installed by port_init() on queue 0).
 *
 * Stamps every packet of the burst that check_valid_packet() accepts with the
 * current TSC value, stored big-endian in clt_ts_rx. Packets it rejects are
 * left untouched. Always returns nb_pkts, i.e. no packet is dropped here.
 */
static uint16_t
rx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
	// XXX: need to get the timestamp in every loop?
	const uint64_t now = rte_rdtsc();

	for (uint16_t i = 0; i < nb_pkts; i++) {
		struct packet_data *pkt_data = check_valid_packet(pkts[i]);

		if (pkt_data == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_calc_latency: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
			continue;
		}

		// uint64_t is not necessarily unsigned long long; cast so the
		// argument matches the %llu conversion.
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], (unsigned long long)now);
		pkt_data->clt_ts_rx = rte_cpu_to_be_64(now);
	}

	return nb_pkts;
}
/*
 * TX callback (installed by port_init() on queue 0).
 *
 * Stamps every packet of the burst that check_valid_packet() accepts with the
 * current TSC value, stored big-endian in clt_ts_tx. Packets it rejects are
 * left untouched. Always returns nb_pkts, i.e. every packet is still sent.
 */
static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	// XXX: need to get the timestamp in every loop?
	const uint64_t now = rte_rdtsc();

	for (uint16_t i = 0; i < nb_pkts; i++) {
		struct packet_data *pkt_data = check_valid_packet(pkts[i]);

		if (pkt_data == NULL) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_add_timestamp: ignoring invalid packet 0x%p.\n", (void*)pkts[i]);
			continue;
		}

		// uint64_t is not necessarily unsigned long long; cast so the
		// argument matches the %llu conversion.
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], (unsigned long long)now);
		pkt_data->clt_ts_tx = rte_cpu_to_be_64(now);
	}

	return nb_pkts;
}
#define STATE_SEND (0)
#define STATE_RECV (1)
/*
 * Worker lcore body: a stop-and-wait probe loop.
 *
 * Builds one UDP probe packet, then alternates between
 *   STATE_SEND - transmit the probe tagged with the current epoch, and
 *   STATE_RECV - wait for a valid reply carrying that epoch.
 * A matching reply is turned into a datapt sample (only while
 * options.s_record is set), the epoch is bumped and the next probe is sent.
 * Received packets are always drained and freed, whatever the state.
 *
 * Runs until options.s_stop is set. Returns 0; unrecoverable errors end the
 * process through rte_exit().
 */
static int
locore_main(void * _unused __rte_unused)
{
	struct rte_mbuf *tx_buf;
	struct rte_mbuf *rx_bufs[BURST_SIZE];
	struct packet_data *pkt_data;
	uint32_t core_id = rte_lcore_id();
	uint32_t epoch = 0;
	int state = STATE_SEND;

	// XXX: check link status instead
	sleep(1);

	// Socket 0 is a valid NUMA node, so the test must be ">= 0"; with "> 0" a
	// port on node 0 polled from another node never produced the warning.
	const int port_socket = rte_eth_dev_socket_id(options.s_portid);
	if (port_socket >= 0 && port_socket != (int)rte_socket_id()) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to "
		    "polling thread.\n\tPerformance will "
		    "not be optimal.\n", options.s_portid);
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n", core_id);

	tx_buf = rte_pktmbuf_alloc(options.s_mbuf_pool);
	if (tx_buf == NULL) {
		rte_exit(EXIT_FAILURE, "cannot allocate tx_buf\n");
	}

	pkt_data = construct_udp_pkt_hdr(tx_buf,
	    &options.s_host_mac, &options.server_mac,
	    RTE_IPV4(192, 168, 100, 150), RTE_IPV4(192, 168, 100, 151),
	    1337, 1337);
	if (pkt_data == NULL) {
		rte_exit(EXIT_FAILURE, "cannot allocate space for packet_data in mbuf\n");
	}

	pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);

	while(!options.s_stop.load()) {
		// always pop incoming packets
		const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, rx_bufs, BURST_SIZE);

		// only process packets when we are ready to receive
		for (uint16_t i = 0; i < nb_rx; i++) {
			struct packet_data * each = check_valid_packet(rx_bufs[i]);

			if (each == NULL) {
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: ignoring invalid packet %p.\n", (void*)rx_bufs[i]);
				dump_pkt(rx_bufs[i]);
				rte_pktmbuf_free(rx_bufs[i]);
				continue;
			}

			if (rte_be_to_cpu_32(each->epoch) == epoch && state == STATE_RECV) {
				ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: received packet %p for epoch %d\n", (void*)rx_bufs[i], epoch);

				if (options.s_record.load()) {
					// keep statistics
					struct datapt * dpt = new datapt;
					dpt->rtt = rte_be_to_cpu_64(each->clt_ts_rx) - rte_be_to_cpu_64(each->clt_ts_tx);
					dpt->server_proc = rte_be_to_cpu_64(each->srv_ts_tx) - rte_be_to_cpu_64(each->srv_ts_rx);
					options.s_stats.push_back(dpt);
				}

				// bump the epoch and stop processing other packets
				state = STATE_SEND;
				epoch++;
			} else {
				ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: ignoring packet 0x%p with invalid epoch %d.\n", (void*)rx_bufs[i], epoch);
			}

			rte_pktmbuf_free(rx_bufs[i]);
		}

		if (state == STATE_SEND) {
			// set new epoch
			pkt_data->epoch = rte_cpu_to_be_32(epoch);
			ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: sending packet %p with epoch %d\n", (void*)tx_buf, epoch);

			// NOTE(review): the same mbuf is handed to rte_eth_tx_burst() for
			// every epoch and freed again after the loop. Transmitted mbufs
			// are normally released by the driver - confirm this reuse is
			// safe (e.g. hold an extra reference or allocate per send).
			const uint16_t nb_tx = rte_eth_tx_burst(options.s_portid, 0, &tx_buf, 1);
			if (nb_tx < 1) {
				rte_exit(EXIT_FAILURE, "failed to send packet 0x%p, epoch %d\n", (void*)tx_buf, epoch);
			}

			state = STATE_RECV;
		}
	}

	rte_pktmbuf_free(tx_buf);

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d successfully stopped.\n", core_id);

	return 0;
}
/*
 * Configures and starts one Ethernet port.
 *
 * Sets up RX_RING_NUM RX queues (backed by mbuf_pool) and TX_RING_NUM TX
 * queues with IPv4/UDP checksum offloads, starts the device, enables
 * promiscuous mode and installs the timestamping callbacks on queue 0.
 *
 * Returns 0 on success, -1 for an invalid port or when a callback cannot be
 * installed, otherwise the error code of the failing DPDK call.
 */
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf = port_conf_default;
	struct rte_eth_txconf txconf;
	struct rte_eth_rxconf rxconf;
	uint16_t nb_rxd = RX_RING_SIZE;
	uint16_t nb_txd = TX_RING_SIZE;

	if(!rte_eth_dev_is_valid_port(portid)) {
		return -1;
	}

	int ret = rte_eth_dev_info_get(portid, &dev_info);
	if (ret != 0) {
		return ret;
	}

	port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
	port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
	port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;

	/* Configure the Ethernet device. */
	ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
	if (ret != 0)
		return ret;

	ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
	if (ret != 0)
		return ret;

	/* Allocate and set up the RX queues of the port. */
	rxconf = dev_info.default_rxconf;
	rxconf.offloads = port_conf.rxmode.offloads;
	for (uint32_t i = 0; i < RX_RING_NUM; i++) {
		ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
		if (ret < 0)
			return ret;
	}

	/* Allocate and set up the TX queues of the port. */
	txconf = dev_info.default_txconf;
	txconf.offloads = port_conf.txmode.offloads;
	for (uint32_t i = 0; i < TX_RING_NUM; i++) {
		ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
		if (ret < 0)
			return ret;
	}

	ret = rte_eth_dev_start(portid);
	if (ret < 0)
		return ret;

	/* Make sure the port MAC address can be read; the value itself is unused here. */
	struct rte_ether_addr addr;
	ret = rte_eth_macaddr_get(portid, &addr);
	if (ret != 0)
		return ret;

	/* Enable RX in promiscuous mode for the Ethernet device. */
	ret = rte_eth_promiscuous_enable(portid);
	if (ret != 0)
		return ret;

	/*
	 * Without these callbacks the packets carry no client timestamps, so a
	 * failed installation is reported instead of being ignored.
	 */
	if (rte_eth_add_tx_callback(portid, 0, tx_add_timestamp, NULL) == NULL)
		return -1;
	if (rte_eth_add_rx_callback(portid, 0, rx_calc_latency, NULL) == NULL)
		return -1;

	return 0;
}
/*
 * Prints the effective configuration (run time, warmup time, output file and
 * server MAC address) to stdout.
 */
static void dump_options()
{
	const uint8_t *mac = options.server_mac.addr_bytes;

	// run_time/warmup_time are unsigned, hence %u; MAC bytes are printed as
	// two zero-padded hex digits each.
	fprintf(stdout, "Configuration:\n"
	    " run time = %u\n"
	    " warmup time = %u\n"
	    " output file = %s\n"
	    " server MAC = %02x:%02x:%02x:%02x:%02x:%02x\n",
	    options.run_time,
	    options.warmup_time,
	    options.output,
	    mac[0],
	    mac[1],
	    mac[2],
	    mac[3],
	    mac[4],
	    mac[5]);
}
/* Writes the command-line help text to stdout. */
static void usage()
{
	static const char help_text[] =
	    "Usage:\n "
	    " -v(vv): verbose mode\n"
	    " -h: display the information\n"
	    " -o: output filename\n"
	    " -t: run time\n"
	    " -T: warmup time\n"
	    " -s: server's mac\n\n";

	// The text contains no conversion specifiers, so it is emitted verbatim.
	fputs(help_text, stdout);
}
int main(int argc, char* argv[])
{
unsigned int nb_ports;
struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;
std::ofstream log_file;
// init dpdk
int ret = rte_eal_init(argc, argv);
if (ret < 0) {
rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
}
argc -= ret;
argv += ret;
// set warning level
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
{
int c;
// parse arguments
while((c = getopt(argc, argv, "hvo:t:T:s:")) != -1) {
switch (c) {
case 'v':
ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 's':
if (rte_ether_unformat_addr(optarg, &options.server_mac) == -1) {
rte_exit(EXIT_FAILURE, "cannot parse %s as mac address.\n", optarg);
}
break;
case 't':
options.run_time = atoi(optarg);
break;
case 'T':
options.warmup_time = atoi(optarg);
break;
case 'h':
usage();
rte_exit(EXIT_SUCCESS, NULL);
break;
case 'o':
strncpy(options.output, optarg, sizeof(options.output) - 1);
break;
default:
usage();
rte_exit(EXIT_FAILURE, "unknown argument: %c\n", c);
break;
}
}
}
// open log file for writing
log_file.open(options.output, std::ofstream::out);
if (!log_file) {
rte_exit(EXIT_FAILURE, "failed to open log file %s\n", options.output);
}
nb_ports = rte_eth_dev_count_avail();
if (nb_ports == 0) {
rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
}
// create a mbuf memory pool on the socket
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (mbuf_pool_pkt == nullptr) {
rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
}
options.s_mbuf_pool = mbuf_pool_pkt;
uint16_t portid = rte_eth_find_next(0);
if (portid == RTE_MAX_ETHPORTS) {
rte_exit(EXIT_FAILURE, "cannot find an available port\n");
}
options.s_portid = portid;
if (port_init(portid, mbuf_pool) != 0) {
rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
}
if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
options.s_host_mac.addr_bytes[0],
options.s_host_mac.addr_bytes[1],
options.s_host_mac.addr_bytes[2],
options.s_host_mac.addr_bytes[3],
options.s_host_mac.addr_bytes[4],
options.s_host_mac.addr_bytes[5]);
dump_options();
uint16_t core_id = rte_get_next_lcore(0, true, false);
if (rte_eal_remote_launch(locore_main, NULL, core_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
}
// poor man's timer
// XXX: use kqueue instead
struct timespec ts;
ts.tv_sec = 1;
ts.tv_nsec = 0;
uint32_t second = 0;
while(true) {
if (second >= options.warmup_time) {
options.s_record.store(true);
}
if (second >= options.run_time + options.warmup_time) {
options.s_stop.store(true);
break;
}
clock_nanosleep(CLOCK_REALTIME, 0, &ts, NULL);
second++;
}
if (rte_eal_wait_lcore(core_id) < 0)
rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
// dump stats
for (auto it = std::begin(options.s_stats); it != std::end(options.s_stats); ++it) {
log_file << (*it)->rtt << "," << (*it)->server_proc << std::endl;
delete *it;
}
log_file.close();
// clean up
rte_eth_dev_stop(portid);
rte_eth_dev_close(portid);
return 0;
}

View File

@ -1,9 +0,0 @@
-xc++
-O2
-std=c++11
-Wall
-Werror
-Wpedantic
-I/usr/include/dpdk
-Iinc
-Wno-deprecated-declarations

61
inc/defs.hh Normal file
View File

@ -0,0 +1,61 @@
#pragma once
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cstdio>
#include <sys/types.h>
#include <sys/cpuset.h>
// Declares TypeName's copy constructor and copy-assignment operator as
// deleted. Use inside the class body; the expansion deliberately omits the
// final ';' so the use site supplies it.
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
TypeName(const TypeName &) = delete; \
void operator=(const TypeName &) = delete
#define UNUSED __attribute__((unused))
constexpr static unsigned long S2NS = 1000000000UL;
constexpr static unsigned long S2US = 1000000UL;
constexpr static unsigned long MS2NS = 1000000UL;
constexpr static int NEXT_CPU_NULL = -1;
#if defined(__x86_64__)
// Removes the lowest set bit from *mask and returns its zero-based index.
// When *mask is 0 the mask is left untouched and -1 is returned (the same
// value as NEXT_CPU_NULL).
static inline int
cmask_get_next_cpu(uint64_t *mask)
{
	// __builtin_ffsll() is 1-based and yields 0 for an empty mask.
	const int cpu = __builtin_ffsll((long long)*mask) - 1;
	if (cpu >= 0) {
		// Shift a 64-bit one; never shift by a negative count.
		*mask &= ~((uint64_t)1 << cpu);
	}
	return cpu;
}
// Returns the number of set bits (CPUs) in the 64-bit mask.
static inline int
cmask_get_num_cpus(const uint64_t mask)
{
	// The ...ll variant counts all 64 bits; plain __builtin_popcount()
	// takes an unsigned int and would ignore the upper half of the mask.
	return __builtin_popcountll(mask);
}
#endif
// Reads CLOCK_MONOTONIC and returns the reading expressed in nanoseconds.
// The return value of clock_gettime() is not checked.
static inline uint64_t
get_uptime()
{
	struct timespec now;
	clock_gettime(CLOCK_MONOTONIC, &now);
	const uint64_t total_ns = now.tv_sec * S2NS + now.tv_nsec;
	return total_ns;
}
static inline void
cpulist_to_cpuset(char * cpulist, cpuset_t * cpuset)
{
char * cpu = strtok(cpulist, ",");
CPU_ZERO(cpuset);
while (cpu != nullptr) {
CPU_SET(atoi(cpu), cpuset);
cpu = strtok(nullptr, ",");
}
}
#define ATTR_UNUSED __attribute__((unused))

346
inc/gen.hh Normal file
View File

@ -0,0 +1,346 @@
// modified from mutilate
// -*- c++ -*-
// 1. implement "fixed" generator
// 2. implement discrete generator
// 3. implement combine generator?
#pragma once
#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <utility>
#include <vector>
#include <sys/_pthreadtypes.h>
#include <sys/param.h>
#include "defs.hh"
#define D(fmt, ...)
#define DIE(fmt, ...) (void)0;
#define FNV_64_PRIME (0x100000001b3ULL)
#define FNV1_64_INIT (0xcbf29ce484222325ULL)
// Hashes `len` bytes starting at `buf`: starting from FNV1_64_INIT, each
// byte is XOR-ed into the hash and the result multiplied by FNV_64_PRIME.
static inline uint64_t
fnv_64_buf(const void *buf, size_t len)
{
	const unsigned char *bytes = (const unsigned char *)buf;
	uint64_t hash = FNV1_64_INIT;
	for (size_t i = 0; i < len; i++) {
		hash = (hash ^ (uint64_t)bytes[i]) * FNV_64_PRIME;
	}
	return hash;
}
// Hashes the in-memory bytes of a single 64-bit value.
static inline uint64_t
fnv_64(uint64_t in)
{
	const uint64_t value = in;
	return fnv_64_buf(&value, sizeof(value));
}
// Generator syntax:
//
// \d+ == fixed
// n[ormal]:mean,sd
// e[xponential]:lambda
// p[areto]:scale,shape
// g[ev]:loc,scale,shape
// fb_value, fb_key, fb_rate
// Abstract base for all value generators.
//   generate(U): produce one sample; subclasses in this file draw their own
//                uniform value via drand48() when U is negative.
//   set_lambda(): optional rate adjustment; the default only invokes DIE().
class Generator {
public:
	Generator() = default;
	virtual ~Generator() = default;

	virtual double generate(double U = -1.0) = 0;
	virtual void set_lambda(double)
	{
		DIE("set_lambda() not implemented");
	}

protected:
	std::string type;
};
// Generator that always yields the same value.
class Fixed : public Generator {
public:
	Fixed(double _value = 1.0)
	    : value(_value)
	{
		D("Fixed(%f)", value);
	}

	// The uniform input is ignored.
	virtual double generate(double) { return value; }

	// Sets the constant to 1/lambda, or to 0 for a non-positive lambda.
	virtual void set_lambda(double lambda)
	{
		value = (lambda > 0.0) ? (1.0 / lambda) : 0.0;
	}

private:
	double value;
};
// Generator that scales a uniform input by `scale`.
class Uniform : public Generator {
public:
	Uniform(double _scale)
	    : scale(_scale)
	{
		D("Uniform(%f)", scale);
	}

	// Uses U as given, or a fresh drand48() draw when U is negative.
	virtual double generate(double U = -1.0)
	{
		const double u = (U < 0.0) ? drand48() : U;
		return scale * u;
	}

	// Sets scale to 2/lambda, or to 0 for a non-positive lambda.
	virtual void set_lambda(double lambda)
	{
		scale = (lambda > 0.0) ? (2.0 / lambda) : 0.0;
	}

private:
	double scale;
};
// Generator returning mean + sd * N, where N is computed from one uniform
// input with a Box-Muller-shaped expression.
class Normal : public Generator {
public:
	Normal(double _mean = 1.0, double _sd = 1.0)
	    : mean(_mean)
	    , sd(_sd)
	{
		D("Normal(mean=%f, sd=%f)", mean, sd);
	}

	virtual double generate(double U = -1.0)
	{
		const double u = (U < 0.0) ? drand48() : U;
		// The same uniform value feeds both the radius and the angle
		// term (no second drand48() draw), so the result is a
		// deterministic function of u.
		const double v = u;
		const double n = sqrt(-2 * log(u)) * cos(2 * M_PI * v);
		return mean + sd * n;
	}

	// Sets the mean to 1/lambda, or to 0 for a non-positive lambda;
	// sd is left unchanged.
	virtual void set_lambda(double lambda)
	{
		mean = (lambda > 0.0) ? (1.0 / lambda) : 0.0;
	}

private:
	double mean, sd;
};
// Generator returning -log(U) / lambda.
class Exponential : public Generator {
public:
	Exponential(double _lambda = 1.0)
	    : lambda(_lambda)
	{
		D("Exponential(lambda=%f)", lambda);
	}

	// Returns 0 when lambda is not positive; otherwise uses U as given
	// or a drand48() draw when U is negative.
	virtual double generate(double U = -1.0)
	{
		if (lambda <= 0.0) {
			return 0.0;
		}
		const double u = (U < 0.0) ? drand48() : U;
		return -log(u) / lambda;
	}

	virtual void set_lambda(double lambda) { this->lambda = lambda; }

private:
	double lambda;
};
// Generator returning loc + scale * (U^(-shape) - 1) / shape.
// shape must be non-zero (asserted in the constructor).
class GPareto : public Generator {
public:
	GPareto(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0)
	    : loc(_loc)
	    , scale(_scale)
	    , shape(_shape)
	{
		assert(shape != 0.0);
		D("GPareto(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
	}

	virtual double generate(double U = -1.0)
	{
		const double u = (U < 0.0) ? drand48() : U;
		return loc + scale * (pow(u, -shape) - 1) / shape;
	}

	// Recomputes scale from lambda using the current loc and shape;
	// a non-positive lambda sets scale to 0.
	virtual void set_lambda(double lambda)
	{
		if (lambda > 0.0) {
			scale = (1 - shape) / lambda - (1 - shape) * loc;
		} else {
			scale = 0.0;
		}
	}

private:
	double loc /* mu */;
	double scale /* sigma */, shape /* k */;
};
// Generator that feeds an Exponential(1.0) sample x into
// loc + scale * (x^(-shape) - 1) / shape. shape must be non-zero.
class GEV : public Generator {
public:
	GEV(double _loc = 0.0, double _scale = 1.0, double _shape = 1.0)
	    : e(1.0)
	    , loc(_loc)
	    , scale(_scale)
	    , shape(_shape)
	{
		assert(shape != 0.0);
		D("GEV(loc=%f, scale=%f, shape=%f)", loc, scale, shape);
	}

	virtual double generate(double U = -1.0)
	{
		const double x = e.generate(U);
		return loc + scale * (pow(x, -shape) - 1) / shape;
	}

private:
	Exponential e;
	double loc /* mu */, scale /* sigma */, shape /* k */;
};
// Generator over a list of (probability, value) pairs with a fallback
// generator for the remaining probability mass.
// The fallback pointer is deleted in the destructor, so this object owns it.
// NOTE(review): copying is not disabled; a copy would delete `def` twice.
class Discrete : public Generator {
public:
	~Discrete() { delete def; }

	// A null _def is replaced by Fixed(0.0).
	Discrete(Generator *_def = NULL)
	    : def(_def != NULL ? _def : new Fixed(0.0))
	{
	}

	// Walks the pairs in insertion order and returns the value of the
	// first pair whose cumulative probability exceeds U; otherwise
	// defers to the fallback, passing it the caller's original U.
	virtual double generate(double U = -1.0)
	{
		const double caller_u = U;
		if (!pv.empty() && U < 0.0) {
			U = drand48();
		}
		double cumulative = 0;
		for (const auto &entry : pv) {
			cumulative += entry.first;
			if (U < cumulative) {
				return entry.second;
			}
		}
		return def->generate(caller_u);
	}

	// Appends value v with probability p.
	void add(double p, double v)
	{
		pv.push_back(std::make_pair(p, v));
	}

private:
	Generator *def;
	std::vector<std::pair<double, double>> pv;
};
// Turns an index into a zero-padded decimal key string whose width is drawn
// from a length generator. The generator pointer is not owned here.
class KeyGenerator {
public:
	KeyGenerator(Generator *_g, double _max = 10000)
	    : g(_g)
	    , max(_max)
	{
	}

	// The width is the larger of the generator's (rounded) output and
	// the digit count of `max`; the uniform input handed to the generator
	// is derived from the FNV hash of `ind`, so a given index always maps
	// to the same key for a generator that is a pure function of U.
	std::string generate(uint64_t ind)
	{
		const uint64_t hash = fnv_64(ind);
		const double u = (double)hash / (double)ULLONG_MAX;
		const double drawn = g->generate(u);
		int keylen = MAX(round(drawn), floor(log10(max)) + 1);
		char key[256];
		snprintf(key, sizeof(key), "%0*" PRIu64, keylen, ind);
		return std::string(key);
	}

private:
	Generator *g;
	double max;
};
Generator *createGenerator(std::string str);
Generator *createFacebookKey();
Generator *createFacebookValue();
Generator *createFacebookIA();
// memload generator
// Memory load generator: declaration only; the worker and control logic are
// defined outside this header.
// NOTE(review): this class uses std::atomic, but <atomic> is not among the
// includes visible at the top of this header — confirm it arrives transitively.
class memload_generator {
public:
// Tunables copied into the generator; defaults are given inline.
struct memload_generator_options {
size_t transaction_size {4096};
size_t buffer_size {64*1024*1024};
char ia_dist[64]{"fixed"};
int verbose {0};
// NOTE(review): the only field without a default initializer; callers
// must set it before use.
uint64_t trans_per_second;
bool shared_buffer {true};
};
private:
DISALLOW_EVIL_CONSTRUCTORS(memload_generator);
// Per-worker bookkeeping; one instance per entry of thr_infos.
struct thread_info {
pthread_t pthr;
void *from_buffer;
void *to_buffer;
std::atomic<bool> reset_ts;
int tid;
int pull;
int coreid;
int target_dom;
struct memload_generator_options * opts;
Generator * ia_gen;
// stat keeping
std::atomic<uint32_t> num_trans;
std::atomic<int> * state;
std::atomic<int> init_status;
};
std::vector<struct thread_info *> thr_infos;
// Holds one of the STATE_* values below.
std::atomic<int> state;
static constexpr int STATE_RUN = 0;
static constexpr int STATE_RDY = 1;
static constexpr int STATE_END = 2;
static constexpr int STATE_INIT = 3;
// pthread-style entry point; receives a pointer in _tinfo
// (presumably a thread_info — confirm against the definition).
static void *worker_thrd(void *_tinfo);
struct memload_generator_options opts;
public:
// `success` is an out-parameter; the exact meaning of the three cpusets is
// defined by the implementation (not visible here).
memload_generator(cpuset_t * threads, cpuset_t * modes, cpuset_t * target_domain, struct memload_generator_options * opt, bool *success);
uint64_t get_transactions();
bool start();
bool stop();
bool set_transactions(uint64_t tps);
~memload_generator();
};

133
inc/net/netsup.hh Normal file
View File

@ -0,0 +1,133 @@
#pragma once
#include <cstdint>
#include "rte_ethdev.h"
#include "rte_ether.h"
#define MAX_NUMA_NODES (64)
// Per-port device settings handed to dpdk_init() and dpdk_cleanup().
// Field semantics are defined by the implementation, which is not visible in
// this header; the notes below are assumptions to confirm.
struct device_conf {
int portid;
// presumably descriptor counts for the TX/RX rings — confirm
uint16_t tx_ring_sz;
uint16_t rx_ring_sz;
cpuset_t core_affinity;
int mtu;
uint64_t rx_offloads;
uint64_t tx_offloads;
uint64_t rss_hf;
// TX/RX callbacks and the opaque user pointers that accompany them.
rte_tx_callback_fn tx_fn;
void * tx_user;
rte_rx_callback_fn rx_fn;
void * rx_user;
bool timesync;
};
// Memory-pool settings handed to dpdk_init().
// NOTE(review): field names mirror the arguments of rte_pktmbuf_pool_create();
// presumably they are forwarded to it — confirm in the implementation.
struct mem_conf {
int num_elements;
int cache_size;
int data_room_size;
int priv_size;
unsigned int max_pools;
};
constexpr static uint16_t MIN_RANDOM_PORT = 1000;
constexpr static uint16_t DEFAULT_RAT_PORT = 1234;
constexpr static unsigned int INIT_DELAY = 3;
constexpr static unsigned int MAX_NODES = 64;
void
dpdk_init(struct device_conf *dconf, struct mem_conf *mconf);
void
dpdk_cleanup(struct device_conf *dconf);
struct rte_mempool *
mempool_get(int nodeid);
// Port capability record; portconf_get() takes one of these as its output
// parameter.
struct port_conf {
const char * driver_name;
uint64_t rxoffload;
uint64_t txoffload;
uint64_t rss_hf;
bool timesync;
};
int
portconf_get(int portid, struct port_conf * out);
// constexpr static int LATENCY_MEASURE_TIMES = 10000;
// static inline void
// sync_port_clock(uint16_t portid)
//{
// int64_t lat = 0;
// int64_t get_time_lat;
// int64_t write_time_lat;
// struct timespec dum;
// struct timespec start;
// struct timespec end;
//
// // measure clock_gettime latency
// for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) {
// // end - start ~= 2x clock_gettime's latency
// clock_gettime(CLOCK_REALTIME, &start);
// clock_gettime(CLOCK_REALTIME, &dum);
// clock_gettime(CLOCK_REALTIME, &end);
//
// if (end.tv_sec != start.tv_sec) {
// rte_exit(EXIT_FAILURE, "clock_gettime too slow\n");
// }
//
// // shouldn't overflow
// lat += (end.tv_nsec - start.tv_nsec) / 2;
// }
// get_time_lat = lat / LATENCY_MEASURE_TIMES;
//
// // measure rte_eth_timesync_write_time latency
// lat = 0;
// for(int i = 0; i < LATENCY_MEASURE_TIMES; i++) {
// // end - start ~= rte_eth_timesync latency + clock_gettime's latency
// clock_gettime(CLOCK_REALTIME, &dum);
// clock_gettime(CLOCK_REALTIME, &start);
// if (rte_eth_timesync_write_time(portid, &dum) != 0) {
// rte_exit(EXIT_FAILURE, "failed to write time\n");
// }
// clock_gettime(CLOCK_REALTIME, &end);
//
// if (end.tv_sec != start.tv_sec) {
// rte_exit(EXIT_FAILURE, "clock_gettime too slow!\n");
// }
//
// // shouldn't overflow
// int64_t elat = (end.tv_nsec - start.tv_nsec) - get_time_lat;
// if (elat < 0) {
// rte_exit(EXIT_FAILURE, "something is wrong with lat \n");
// }
// lat += elat;
// }
// write_time_lat = lat / LATENCY_MEASURE_TIMES;
//
// int64_t delta = (get_time_lat + write_time_lat) / 2;
// int64_t s2ns = (int64_t)S2NS;
// // sync the clock
// while (true) {
// clock_gettime(CLOCK_REALTIME, &dum);
// dum.tv_nsec += delta;
// if (dum.tv_nsec > s2ns) {
// // try again if overflow
// continue;
// }
// if (rte_eth_timesync_write_time(portid, &dum) != 0) {
// rte_exit(EXIT_FAILURE, "failed to write time\n");
// }
// break;
// }
// rte_eth_timesync_enable(portid);
//
// printf("Sync-ed time: get lat %ld write lat %ld\n", get_time_lat,
// write_time_lat);
//}

490
inc/net/pkt.hh Normal file
View File

@ -0,0 +1,490 @@
#pragma once
#include <sys/endian.h>
#include <rte_byteorder.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_mbuf_core.h>
#include <rte_net.h>
#include <rte_udp.h>
#include <unistd.h>
#include "defs.hh"
#include <random>
#define IP_DEFTTL 64 /* from RFC 1340. */
#define IP_VERSION 0x40
#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
#define IP_ADDR_FMT_SIZE 15
constexpr static uint32_t MAX_JUMBO_MTU = 9000;
constexpr static uint32_t MAX_STANDARD_MTU = 1500;
// Returns the frame size for a given MTU: the MTU plus the Ethernet header
// and CRC lengths.
static inline int
mtu_to_pkt_size(int mtu)
{
	int frame_sz = mtu;
	frame_sz += RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
	return frame_sz;
}
// Transmits all `sz` mbufs in tx_bufs on the given port/queue, calling
// rte_eth_tx_burst() repeatedly until every buffer has been accepted.
// Busy-loops for as long as the queue accepts nothing.
static inline void
tx_burst_all(int portid, int txqid, struct rte_mbuf ** tx_bufs, int sz)
{
	int sent = 0;
	while (sent < sz) {
		sent += rte_eth_tx_burst(portid, txqid, &tx_bufs[sent], sz - sent);
	}
}
constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
const static struct rte_ether_addr POU_MAC {
0x01, 0x00, 0x5e, 0x00, 0x01, 0x81
};
const static uint32_t POU_IP = RTE_IPV4(224, 0, 1, 129);
const static uint16_t POU_PORT = 320;
/* Khat Protocol:
* khat only processes two kinds of packets - LOAD and PROBE
* rat:
* rat -> LOAD -> khat
* khat -> LOAD_RESP -> rat
* cat:
* cat -> PROBE -> khat (cat tx timestamps)
* khat -> PROBE_RESP -> cat (cat rx timestamps and khat tx/rx timestamps)
* khat -> STAT -> cat (khat sends its tx/rx timestamps)
*/
/* Rat Protocol:
* cat & rat:
* 1. both launch with full parameters
* rat with slave flag
* cat with master flag
* 2. rats create threads and wait for cat's signal
* 3. cat creates threads
* 4. cat -> rats SYNC
* 5. rats -> cat SYNC_ACK and start running
* 6. cat start running after received all SYNC_ACKs
* 7. cat stops running, cat -> rats FIN
* 8. rats stops running, rats -> cat FIN_ACK with QPS
* 9. cat exits after receiving all FIN_ACKs and flushing stats
*/
// Packed 36-byte block placed after the UDP header of every packet.
// construct_pkt_hdr() sets ptp_ver to 0x2 and ptp_msg_type to 0x0 for
// PROBE / PROBE_RESP packets and ptp_ver to 0xff for all other types.
struct ptp_hdr {
uint8_t ptp_msg_type;
uint8_t ptp_ver;
uint8_t unused[34];
} __attribute__((packed));
// Packed on-wire layout shared by all packet types: Ethernet, IPv4, UDP and
// PTP headers followed by the packet type and frame magic, then the payload.
struct pkt_hdr {
struct rte_ether_hdr eth_hdr;
struct rte_ipv4_hdr ipv4_hdr;
struct rte_udp_hdr udp_hdr;
struct ptp_hdr ptp_hdr;
// One of PKT_TYPE_*; written and read in big-endian byte order.
uint16_t type;
// ETHER_FRAME_MAGIC; written and read in big-endian byte order.
uint32_t magic;
// Zero-length trailing member marking where the payload starts.
char payload[0];
} __attribute__((packed));
// One endpoint's addresses.
struct net_spec {
// IPv4 address in host byte order (converted to/from big-endian when a
// packet header is built or parsed).
uint32_t ip;
rte_ether_addr mac_addr;
};
// Extracts source/destination addresses and UDP ports from a packet header.
// Each output pointer is optional; null outputs are skipped. IP addresses
// and ports are converted from big-endian to host byte order.
static inline void
pkt_hdr_to_netspec(struct pkt_hdr *pkt, struct net_spec *src,
    uint16_t *src_port, struct net_spec *dst, uint16_t *dst_port)
{
	if (src) {
		rte_ether_addr_copy(&pkt->eth_hdr.src_addr, &src->mac_addr);
		src->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr);
	}
	if (dst) {
		rte_ether_addr_copy(&pkt->eth_hdr.dst_addr, &dst->mac_addr);
		dst->ip = rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr);
	}
	if (src_port) {
		*src_port = rte_be_to_cpu_16(pkt->udp_hdr.src_port);
	}
	if (dst_port) {
		*dst_port = rte_be_to_cpu_16(pkt->udp_hdr.dst_port);
	}
}
// Describes one UDP flow: source/destination endpoints plus ports.
// The net_spec pointers are borrowed; ports are in host byte order
// (construct_pkt_hdr() converts them to big-endian when writing the header).
struct conn_spec {
struct net_spec *src;
uint16_t src_port;
struct net_spec *dst;
uint16_t dst_port;
};
// returns 0 on success
// Parses "a.b.c.d@xx:xx:xx:xx:xx:xx" into *out.
//
// str is modified in place (the '@' separator is overwritten).
// Returns 0 on success and -1 when an argument is null, the IPv4 part is
// not four dot-separated numbers in [0, 255], or the MAC part is missing
// or rejected by rte_ether_unformat_addr().
static inline int
str_to_netspec(char *str, struct net_spec *out)
{
	const char *delim = "@";
	char *save = nullptr;
	// unsigned int matches the "%u" conversions below.
	unsigned int a, b, c, d;

	if (str == nullptr || out == nullptr) {
		return -1;
	}

	char *token = strtok_r(str, delim, &save);
	if (token == nullptr ||
	    sscanf(token, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) {
		return -1;
	}
	// RTE_IPV4() would silently mask out-of-range octets; reject them.
	if (a > 255 || b > 255 || c > 255 || d > 255) {
		return -1;
	}
	out->ip = RTE_IPV4(a, b, c, d);

	// mac next
	token = strtok_r(nullptr, delim, &save);
	if (token == nullptr ||
	    rte_ether_unformat_addr(token, &out->mac_addr) != 0) {
		return -1;
	}
	return 0;
}
constexpr static uint16_t PKT_TYPE_LOAD = 0;
constexpr static uint32_t LOAD_TYPE_CPU = 0; // arg0 = cpu time in us. arg1 = unused
constexpr static uint32_t LOAD_TYPE_MEM = 1; // arg0 = which thread to access. arg1 = how many cachelines to access
constexpr static uint32_t LOAD_TYPE_MAX = LOAD_TYPE_MEM + 1;
// Payload of a PKT_TYPE_LOAD packet.
struct pkt_payload_load {
uint32_t epoch;
uint32_t type; // type of load
// Meaning of arg0/arg1 depends on `type`; see the LOAD_TYPE_* constants.
uint32_t arg0;
uint32_t arg1;
};
constexpr static uint16_t PKT_TYPE_PROBE = 1;
constexpr static uint16_t PKT_TYPE_LOAD_RESP = 2;
constexpr static uint16_t PKT_TYPE_PROBE_RESP = 3;
// Payload shared by PROBE, LOAD_RESP and PROBE_RESP packets
// (see expected_payload_size).
struct pkt_payload_epoch {
uint32_t epoch;
};
constexpr static uint16_t PKT_TYPE_STAT = 4;
// Payload of a PKT_TYPE_STAT packet: rx/tx timestamps for one epoch.
// NOTE(review): hw_* / sw_* presumably distinguish hardware (NIC) from
// software timestamps — confirm against the sender.
struct pkt_payload_stat {
uint32_t epoch;
uint64_t hw_rx;
uint64_t hw_tx;
uint64_t sw_rx;
uint64_t sw_tx;
};
constexpr static uint16_t PKT_TYPE_SYNC = 5;
constexpr static uint16_t PKT_TYPE_SYNC_ACK = 6;
constexpr static uint16_t PKT_TYPE_FIN = 7;
constexpr static uint16_t PKT_TYPE_FIN_ACK = 8;
// Payload of a PKT_TYPE_FIN_ACK packet (see expected_payload_size).
struct pkt_payload_qps {
uint32_t qps;
uint32_t recved_pkts;
uint32_t lost_pkts;
};
constexpr static uint16_t NUM_PKT_TYPES = PKT_TYPE_FIN_ACK + 1;
// for fast packet verification
// Minimum payload size in bytes for each packet type, indexed by PKT_TYPE_*.
// Callers must ensure the index is below NUM_PKT_TYPES.
static const uint32_t expected_payload_size[NUM_PKT_TYPES] {
sizeof(struct pkt_payload_load), // LOAD
sizeof(struct pkt_payload_epoch), // PROBE
sizeof(struct pkt_payload_epoch), // LOAD_RESP
sizeof(struct pkt_payload_epoch), // PROBE_RESP
sizeof(struct pkt_payload_stat), // STAT
0, // SYNC
0, // SYNC_ACK
0, // FIN
sizeof(struct pkt_payload_qps) // FIN_ACK
};
// Hands out port numbers starting from a random offset and then advancing
// by one per call.
class rdport_generator {
private:
DISALLOW_EVIL_CONSTRUCTORS(rdport_generator);
constexpr static uint32_t MAX_PORT = 65535;
// Declaration order matters: `dist` is initialized from min_port, so
// min_port must stay declared before it.
uint32_t min_port;
uint32_t cur;
// NOTE(review): rd is never used in the code visible here.
std::random_device rd;
std::default_random_engine gen;
std::uniform_int_distribution<uint32_t> dist;
public:
// Seeds the engine from get_uptime() and picks the initial counter value
// uniformly from [0, MAX_PORT - mport].
rdport_generator(uint32_t mport)
: min_port(mport)
, cur(0)
, dist(0, MAX_PORT - min_port)
{
gen.seed(get_uptime());
cur = dist(gen);
}
// Returns the next port and advances the counter. Because of the modulo,
// results lie in [min_port, MAX_PORT - 1].
// NOTE(review): min_port == MAX_PORT would make the modulus zero.
uint16_t next()
{
uint16_t ret = ((cur) % (MAX_PORT - min_port)) + min_port;
cur++;
return ret;
}
};
// Logs one packet header through ntr(): the caller's prefix format (plus its
// arguments) followed by source and destination as ip:port@mac and the
// packet type.
//   pkt        - pointer to a struct pkt_hdr; it is expanded many times and
//                is not parenthesized, so pass a plain variable.
//   prefix_fmt - string literal; it is concatenated with the fixed format.
#define NTR_PKT(dep, level, pkt, prefix_fmt, ...) \
ntr(dep, level, \
prefix_fmt \
"src: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x dst: %d.%d.%d.%d:%d@%02x:%02x:%02x:%02x:%02x:%02x type: %d\n", \
##__VA_ARGS__, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 24) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 16) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 8) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.src_addr) >> 0) & 0xff, \
rte_be_to_cpu_16(pkt->udp_hdr.src_port), \
pkt->eth_hdr.src_addr.addr_bytes[0], \
pkt->eth_hdr.src_addr.addr_bytes[1], \
pkt->eth_hdr.src_addr.addr_bytes[2], \
pkt->eth_hdr.src_addr.addr_bytes[3], \
pkt->eth_hdr.src_addr.addr_bytes[4], \
pkt->eth_hdr.src_addr.addr_bytes[5], \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 24) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 16) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 8) & 0xff, \
(rte_be_to_cpu_32(pkt->ipv4_hdr.dst_addr) >> 0) & 0xff, \
rte_be_to_cpu_16(pkt->udp_hdr.dst_port), \
pkt->eth_hdr.dst_addr.addr_bytes[0], \
pkt->eth_hdr.dst_addr.addr_bytes[1], \
pkt->eth_hdr.dst_addr.addr_bytes[2], \
pkt->eth_hdr.dst_addr.addr_bytes[3], \
pkt->eth_hdr.dst_addr.addr_bytes[4], \
pkt->eth_hdr.dst_addr.addr_bytes[5], rte_be_to_cpu_16(pkt->type))
// Prints a MAC address to stdout as six colon-separated, zero-padded
// two-digit hex bytes (the same byte format NTR_PKT uses). No newline.
static inline void
print_mac(struct rte_ether_addr *mac)
{
	printf("%02x:%02x:%02x:%02x:%02x:%02x", mac->addr_bytes[0],
	    mac->addr_bytes[1], mac->addr_bytes[2], mac->addr_bytes[3],
	    mac->addr_bytes[4], mac->addr_bytes[5]);
}
// Prints a host-byte-order IPv4 address to stdout in dotted-decimal form,
// most significant byte first. No newline.
static inline void
print_ipv4(uint32_t ip)
{
	const int octet[4] = {
		(int)((ip >> 24) & 0xff),
		(int)((ip >> 16) & 0xff),
		(int)((ip >> 8) & 0xff),
		(int)((ip >> 0) & 0xff),
	};
	printf("%d.%d.%d.%d", octet[0], octet[1], octet[2], octet[3]);
}
static inline void
dump_pkt(struct rte_mbuf *pkt)
{
if (rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) {
return;
}
struct rte_ether_hdr _eth_hdr;
auto eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_read(
pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr);
if (eth_hdr == nullptr) {
return;
}
// ethernet frame
printf(
"Packet %p: Length 0x%x\n", (void *)pkt, rte_pktmbuf_data_len(pkt));
printf(" Ethernet header:\n");
printf(" Src:");
print_mac(&eth_hdr->src_addr);
printf("\n");
printf(" Dst:");
print_mac(&eth_hdr->dst_addr);
printf("\n");
printf(" Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));
uint16_t ether_type = rte_be_to_cpu_16(eth_hdr->ether_type);
if (ether_type != RTE_ETHER_TYPE_IPV4) {
return;
}
if (rte_pktmbuf_data_len(pkt) <
sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) {
return;
}
// dump ip header
auto ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
printf(" IPv4 header:\n");
printf(" Src:");
print_ipv4(rte_be_to_cpu_32(ipv4_hdr->src_addr));
printf("\n");
printf(" Dst:");
print_ipv4(rte_be_to_cpu_32(ipv4_hdr->dst_addr));
printf("\n");
printf(" Protocol: 0x%x\n", ipv4_hdr->next_proto_id);
}
// True for the packet types that are sent as timestamped packets:
// PROBE and PROBE_RESP.
static inline bool
is_l2ts_pkt(uint16_t type)
{
	switch (type) {
	case PKT_TYPE_PROBE:
	case PKT_TYPE_PROBE_RESP:
		return true;
	default:
		return false;
	}
}
// fills the packet with the information except for the payload itself
static inline struct pkt_hdr *
construct_pkt_hdr(
struct rte_mbuf *buf, uint16_t type, const struct conn_spec *conn, int pkt_pad_sz)
{
rte_pktmbuf_reset(buf);
int total_sz = sizeof(struct pkt_hdr) +
expected_payload_size[type];
if (pkt_pad_sz > total_sz) {
total_sz = pkt_pad_sz;
}
auto pkt_data = (struct pkt_hdr *)rte_pktmbuf_append(buf, total_sz);
if (pkt_data == nullptr)
return nullptr;
struct rte_ether_hdr *eth_hdr;
struct rte_ipv4_hdr *ipv4_hdr;
struct rte_udp_hdr *udp_hdr;
bool is_ts_pkt = is_l2ts_pkt(type);
// single segment
buf->nb_segs = 1;
// construct l2 header
eth_hdr = &pkt_data->eth_hdr;
rte_ether_addr_copy(&conn->src->mac_addr, &eth_hdr->src_addr);
if (is_ts_pkt) {
rte_ether_addr_copy(&POU_MAC, &eth_hdr->dst_addr);
} else {
rte_ether_addr_copy(&conn->dst->mac_addr, &eth_hdr->dst_addr);
}
eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
buf->l2_len = sizeof(struct rte_ether_hdr);
// construct l3 header
ipv4_hdr = &pkt_data->ipv4_hdr;
memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
ipv4_hdr->version_ihl = IP_VHL_DEF;
ipv4_hdr->type_of_service = 0;
ipv4_hdr->fragment_offset = 0;
ipv4_hdr->time_to_live = IP_DEFTTL;
ipv4_hdr->next_proto_id = IPPROTO_UDP;
ipv4_hdr->packet_id = 0;
ipv4_hdr->src_addr = rte_cpu_to_be_32(conn->src->ip);
if (is_ts_pkt) {
ipv4_hdr->dst_addr = rte_cpu_to_be_32(POU_IP);
} else {
ipv4_hdr->dst_addr = rte_cpu_to_be_32(conn->dst->ip);
}
ipv4_hdr->total_length = rte_cpu_to_be_16(total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr));
ipv4_hdr->hdr_checksum = 0;
buf->l3_len = sizeof(struct rte_ipv4_hdr);
// construct l4 header
udp_hdr = &pkt_data->udp_hdr;
udp_hdr->src_port = rte_cpu_to_be_16(conn->src_port);
if (is_ts_pkt) {
udp_hdr->dst_port = rte_cpu_to_be_16(POU_PORT);
} else {
udp_hdr->dst_port = rte_cpu_to_be_16(conn->dst_port);
}
udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
udp_hdr->dgram_len = total_sz - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr) - sizeof(struct rte_udp_hdr);
buf->l4_len = sizeof(struct rte_udp_hdr);
buf->ol_flags |= RTE_MBUF_F_TX_IPV4;
buf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
buf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
if (is_ts_pkt) {
// set misc flags
buf->ol_flags |= RTE_MBUF_F_TX_IEEE1588_TMST;
pkt_data->ptp_hdr.ptp_ver = 0x2; // VER 2
pkt_data->ptp_hdr.ptp_msg_type = 0x0; // SYNC
} else {
pkt_data->ptp_hdr.ptp_ver = 0xff; // invalid ver
}
pkt_data->type = rte_cpu_to_be_16(type);
pkt_data->magic = rte_cpu_to_be_32(ETHER_FRAME_MAGIC);
return pkt_data;
}
// returns 0 on success
// Allocates an mbuf from `pool` and fills in its headers via
// construct_pkt_hdr(). On success stores the mbuf and header pointers in
// *mbuf_out / *hdr_out and returns 0. Returns -1 (outputs untouched, mbuf
// released) when allocation or header construction fails.
static inline int
alloc_pkt_hdr(struct rte_mempool *pool, uint16_t type,
    const struct conn_spec *conn, int pkt_pad_sz, struct rte_mbuf **mbuf_out,
    struct pkt_hdr **hdr_out)
{
	struct rte_mbuf *mbuf = rte_pktmbuf_alloc(pool);
	if (mbuf == nullptr) {
		return -1;
	}

	struct pkt_hdr *built = construct_pkt_hdr(mbuf, type, conn, pkt_pad_sz);
	if (built == nullptr) {
		rte_pktmbuf_free(mbuf);
		return -1;
	}

	*hdr_out = built;
	*mbuf_out = mbuf;
	return 0;
}
// Validates a received mbuf and returns its header, or nullptr if the packet
// should be ignored. Checks, in order: minimum length, frame magic, packet
// type range, minimum payload length for that type and, when host_mac is
// non-null, the destination MAC (POU_MAC for PROBE / PROBE_RESP packets,
// host_mac for everything else).
static inline struct pkt_hdr *
check_valid_packet(struct rte_mbuf *pkt, const struct rte_ether_addr *host_mac)
{
	const uint32_t data_len = rte_pktmbuf_data_len(pkt);
	if (data_len < sizeof(struct pkt_hdr)) {
		return nullptr;
	}

	struct pkt_hdr *hdr = rte_pktmbuf_mtod(pkt, struct pkt_hdr *);

	// check MAGIC
	if (rte_be_to_cpu_32(hdr->magic) != ETHER_FRAME_MAGIC) {
		return nullptr;
	}

	// check type before using it as a table index, then the payload size
	const uint16_t type = rte_be_to_cpu_16(hdr->type);
	if (type >= NUM_PKT_TYPES) {
		return nullptr;
	}
	if (data_len < sizeof(struct pkt_hdr) + expected_payload_size[type]) {
		return nullptr;
	}

	// strict dest mac filter (optional)
	if (host_mac == nullptr) {
		return hdr;
	}
	const struct rte_ether_addr *wanted =
	    is_l2ts_pkt(type) ? &POU_MAC : host_mac;
	if (!rte_is_same_ether_addr(wanted, &hdr->eth_hdr.dst_addr)) {
		return nullptr;
	}
	return hdr;
}

26
inc/nms.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
// Node-aware memory helpers; the implementation is not visible in this
// header, so the notes below are assumptions to confirm.
// NOTE(review): presumably must be called before the other nms_* functions;
// `verbose` likely controls diagnostic output — confirm.
int
nms_init(int verbose);
// Allocates `sz` bytes associated with node `nodeid`; release with nms_free()
// (presumably — confirm pairing in the implementation).
void *
nms_malloc(int nodeid, size_t sz);
// Counterpart allocator whose buffers are released with nms_free_static(),
// which needs the same size passed back.
void *
nms_alloc_static(int nodeid, size_t sz);
void
nms_free_static(void * buf, size_t sz);
void
nms_free(int nodeid, void * addr);
#ifdef __cplusplus
}
#endif // __cplusplus

38
inc/ntr.h Normal file
View File

@ -0,0 +1,38 @@
#pragma once
#include <stdarg.h>
#include <stdio.h>
#define NTR_LEVEL_NONE (0)
#define NTR_LEVEL_ERROR (1)
#define NTR_LEVEL_WARNING (2)
#define NTR_LEVEL_INFO (3)
#define NTR_LEVEL_DEBUG (4)
#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
#define NTR_DEP_NTR (0)
#define NTR_DEP_USER1 (1)
#define NTR_DEP_USER2 (2)
#define NTR_DEP_USER3 (3)
#define NTR_DEP_USER4 (4)
#define NTR_DEP_USER5 (5)
#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
#ifdef __cplusplus
extern "C" {
#endif
// Logging API; definitions live outside this header.
void ntr_init();
// printf-style logger: `fmt` (argument 3) and the variadic arguments are
// checked by the compiler via the format attribute.
// NOTE(review): presumably prints only when `level` is at or below the level
// configured for `dep` (one of NTR_DEP_*) — confirm in the implementation.
__attribute__((format(printf, 3, 4))) void ntr(
int dep, int level, const char *fmt, ...);
// Sets / reads the level (NTR_LEVEL_*) associated with a NTR_DEP_* slot.
void ntr_set_level(int dep, int level);
void ntr_set_output(FILE *f);
int ntr_get_level(int dep);
#ifdef __cplusplus
}
#endif

View File

@ -1,61 +0,0 @@
#pragma once
#include <stdio.h>
// Log levels, lowest verbosity first.
#define NTR_LEVEL_NONE (0)
#define NTR_LEVEL_ERROR (1)
#define NTR_LEVEL_WARNING (2)
#define NTR_LEVEL_INFO (3)
#define NTR_LEVEL_DEBUG (4)
#define NTR_LEVEL_DEFAULT (NTR_LEVEL_WARNING)
// Log "departments": each has its own level slot in ntr_log_levels[].
#define NTR_DEP_NTR (0)
#define NTR_DEP_USER1 (1)
#define NTR_DEP_USER2 (2)
#define NTR_DEP_USER3 (3)
#define NTR_DEP_USER4 (4)
#define NTR_DEP_USER5 (5)
#define NTR_DEP_MAX (NTR_DEP_USER5 + 1)
// Defines the logger's storage; expand exactly once in one translation unit.
// Every department starts at NTR_LEVEL_DEFAULT: a single-element initializer
// would set only slot 0 and zero (NTR_LEVEL_NONE) the rest. Keep one entry
// per NTR_DEP_* value (NTR_DEP_MAX entries).
#define NTR_DECL_IMPL \
int ntr_log_levels[NTR_DEP_MAX] = { \
NTR_LEVEL_DEFAULT, NTR_LEVEL_DEFAULT, NTR_LEVEL_DEFAULT, \
NTR_LEVEL_DEFAULT, NTR_LEVEL_DEFAULT, NTR_LEVEL_DEFAULT }; \
FILE * ntr_out = stdout
extern int ntr_log_levels[];
extern FILE * ntr_out;
// printf-style logger. Writes to ntr_out only when `dep` is a valid
// department index and `level` does not exceed that department's level;
// otherwise does nothing.
static inline
void ntr(int dep, int level, const char * fmt, ...)
{
	// Reject negative indices as well as those past the table.
	if (dep < 0 || dep >= NTR_DEP_MAX || level > ntr_log_levels[dep]) {
		return;
	}
	va_list vl;
	va_start(vl, fmt);
	vfprintf(ntr_out, fmt, vl);
	va_end(vl);
}
// Sets the log level of department `dep`; out-of-range departments
// (negative or >= NTR_DEP_MAX) are ignored.
static inline
void ntr_set_level(int dep, int level)
{
	if (dep >= 0 && dep < NTR_DEP_MAX) {
		ntr_log_levels[dep] = level;
	}
}
// Redirects log output to `f`; a null stream leaves the current one in place.
static inline
void ntr_set_output(FILE * f)
{
	if (f == NULL) {
		return;
	}
	ntr_out = f;
}
// Returns the log level of department `dep`, or 0 when `dep` is out of
// range (negative or >= NTR_DEP_MAX).
static inline
int ntr_get_level(int dep)
{
	if (dep < 0 || dep >= NTR_DEP_MAX) {
		return 0;
	}
	return ntr_log_levels[dep];
}

175
inc/pkt.h
View File

@ -1,175 +0,0 @@
#pragma once
#include <rte_mbuf_core.h>
#include <rte_mbuf.h>
#include <rte_udp.h>
#include <rte_byteorder.h>
#include <rte_ip.h>
#include <stdint.h>
#include <rte_flow.h>
#include <rte_ether.h>
#include <unistd.h>
#include <rte_net.h>
#include <rte_vxlan.h>
#define IP_DEFTTL 64 /* from RFC 1340. */
#define IP_VERSION 0x40
#define IP_HDRLEN 0x05 /* default IP header length == five 32-bits words. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)
#define IP_ADDR_FMT_SIZE 15
constexpr static uint32_t ETHER_FRAME_MAGIC = 0xDCDCE5E5;
// Packed Ethernet + IPv4 + UDP header stack at the start of every frame.
struct packet_hdr {
struct rte_ether_hdr eth_hdr;
struct rte_ipv4_hdr ipv4_hdr;
struct rte_udp_hdr udp_hdr;
} __attribute__((packed));
// Full frame layout: headers followed by the magic, an epoch and four
// timestamps. Unlike packet_hdr this struct is not declared packed.
// NOTE(review): clt_* / srv_* presumably hold client- and server-side tx/rx
// timestamps — confirm against the users of this struct.
struct packet_data
{
struct packet_hdr pkt_hdr;
// Compared against ETHER_FRAME_MAGIC after a big-endian to host conversion.
uint32_t magic;
uint32_t epoch;
uint64_t clt_ts_tx;
uint64_t clt_ts_rx;
uint64_t srv_ts_tx;
uint64_t srv_ts_rx;
};
// Prints a MAC address to stdout as six colon-separated, zero-padded
// two-digit hex bytes. No newline is emitted.
static inline void
print_mac(struct rte_ether_addr * mac)
{
	printf("%02x:%02x:%02x:%02x:%02x:%02x", mac->addr_bytes[0],
	    mac->addr_bytes[1],
	    mac->addr_bytes[2],
	    mac->addr_bytes[3],
	    mac->addr_bytes[4],
	    mac->addr_bytes[5]);
}
// Prints a host-byte-order IPv4 address to stdout as four dash-separated
// decimal octets, most significant first. No newline is emitted.
static inline void
print_ipv4(uint32_t ip)
{
	const int octet[4] = {
		(int)((ip >> 24) & 0xff),
		(int)((ip >> 16) & 0xff),
		(int)((ip >> 8) & 0xff),
		(int)((ip >> 0) & 0xff),
	};
	printf("%d-%d-%d-%d", octet[0], octet[1], octet[2], octet[3]);
}
static inline void
dump_pkt(struct rte_mbuf *pkt)
{
if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr)) {
return;
}
struct rte_ether_hdr _eth_hdr;
struct rte_ether_hdr * eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_read(pkt, 0, sizeof(struct rte_ether_hdr), &_eth_hdr);
if (eth_hdr == NULL) {
return;
}
// ethernet frame
printf("Packet %p: Length 0x%x\n", (void*)pkt, rte_pktmbuf_data_len(pkt));
printf(" Ethernet header:\n");
printf(" Src:");
print_mac(&eth_hdr->s_addr);
printf("\n");
printf(" Dst:");
print_mac(&eth_hdr->d_addr);
printf("\n");
printf(" Type: 0x%x\n", rte_be_to_cpu_16(eth_hdr->ether_type));
uint16_t ether_type = rte_be_to_cpu_16(eth_hdr->ether_type);
if (ether_type != RTE_ETHER_TYPE_IPV4) {
return;
}
if(rte_pktmbuf_data_len(pkt) < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)) {
return;
}
// dump ip header
struct rte_ipv4_hdr * ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
printf(" IPv4 header:\n");
printf(" Src:");
print_ipv4(rte_be_to_cpu_32(ipv4_hdr->src_addr));
printf("\n");
printf(" Dst:");
print_ipv4(rte_be_to_cpu_32(ipv4_hdr->dst_addr));
printf("\n");
printf(" Protocol: 0x%x\n", ipv4_hdr->next_proto_id);
}
// Resets `buf`, reserves sizeof(struct packet_data) bytes and writes the
// Ethernet, IPv4 and UDP headers for the given addresses and ports
// (IP addresses and ports are taken in host byte order).
// Returns the frame inside the mbuf, or NULL when the mbuf has no room.
// The magic, epoch and timestamp fields are left for the caller.
static inline
struct packet_data * construct_udp_pkt_hdr(struct rte_mbuf * buf,
    struct rte_ether_addr * src_mac, struct rte_ether_addr * dst_mac,
    uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port)
{
	rte_pktmbuf_reset(buf);

	struct packet_data * pkt_data = (struct packet_data *)rte_pktmbuf_append(buf, sizeof(struct packet_data));
	struct rte_ether_hdr * eth_hdr;
	struct rte_ipv4_hdr * ipv4_hdr;
	struct rte_udp_hdr * udp_hdr;

	if (pkt_data == NULL)
		return NULL;

	// single segment
	buf->nb_segs = 1;

	// construct l2 header
	eth_hdr = &pkt_data->pkt_hdr.eth_hdr;
	rte_ether_addr_copy(src_mac, &eth_hdr->s_addr);
	rte_ether_addr_copy(dst_mac, &eth_hdr->d_addr);
	eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
	buf->l2_len = sizeof(struct rte_ether_hdr);

	// construct l3 header
	ipv4_hdr = &pkt_data->pkt_hdr.ipv4_hdr;
	memset(ipv4_hdr, 0, sizeof(struct rte_ipv4_hdr));
	ipv4_hdr->version_ihl = IP_VHL_DEF;
	ipv4_hdr->type_of_service = 0;
	ipv4_hdr->fragment_offset = 0;
	ipv4_hdr->time_to_live = IP_DEFTTL;
	ipv4_hdr->next_proto_id = IPPROTO_UDP;
	ipv4_hdr->packet_id = 0;
	ipv4_hdr->src_addr = rte_cpu_to_be_32(src_ip);
	ipv4_hdr->dst_addr = rte_cpu_to_be_32(dst_ip);
	ipv4_hdr->total_length = rte_cpu_to_be_16(sizeof(struct packet_data) - sizeof(struct rte_ether_hdr));
	ipv4_hdr->hdr_checksum = 0;
	buf->l3_len = sizeof(struct rte_ipv4_hdr);

	// construct l4 header
	udp_hdr = &pkt_data->pkt_hdr.udp_hdr;
	udp_hdr->src_port = rte_cpu_to_be_16(src_port);
	udp_hdr->dst_port = rte_cpu_to_be_16(dst_port);
	udp_hdr->dgram_cksum = 0; /* No UDP checksum. */
	// The UDP length counts the UDP header plus data, i.e. everything
	// after the Ethernet and IPv4 headers.
	udp_hdr->dgram_len = rte_cpu_to_be_16(sizeof(struct packet_data) -
	    sizeof(struct rte_ether_hdr) -
	    sizeof(struct rte_ipv4_hdr));
	buf->l4_len = sizeof(struct rte_udp_hdr);

	return pkt_data;
}
// Returns the frame inside `pkt` when it is at least sizeof(struct
// packet_data) bytes long and carries ETHER_FRAME_MAGIC; NULL otherwise.
static inline
struct packet_data * check_valid_packet(struct rte_mbuf * pkt)
{
	if (rte_pktmbuf_data_len(pkt) < sizeof(struct packet_data)) {
		return NULL;
	}

	struct packet_data * frame = rte_pktmbuf_mtod(pkt, struct packet_data *);
	if (rte_be_to_cpu_32(frame->magic) != ETHER_FRAME_MAGIC) {
		return NULL;
	}
	return frame;
}

View File

@ -0,0 +1,56 @@
#pragma once
#include "storage/drivers/driver.hh"
#include "spdk/bdev.h"
#include "spdk/bdev_zone.h"
#include "spdk/thread.h"
// birb_driver implementation declared on top of SPDK's bdev types.
// Declaration only; method bodies are defined elsewhere. Non-copyable.
class birb_bdev_driver : public birb_driver
{
public:
// dev_name: name of the device to use (lookup semantics are defined by the
// implementation).
birb_bdev_driver(const char * dev_name);
~birb_bdev_driver() override;
size_t get_capacity() override;
birb_driver_status get_status() override;
// Accessors for the underlying SPDK handles held by this object.
struct spdk_bdev * get_bdev();
struct spdk_bdev_desc * get_bdev_desc();
birb_driver_type get_type() override;
size_t get_align() override;
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_bdev_driver);
struct spdk_bdev_desc * bdev_desc;
struct spdk_bdev * bdev;
// NOTE(review): presumably block size in bytes and number of blocks — confirm.
size_t block_sz;
size_t block_num;
birb_driver_status status;
static void print_all_bdev();
// Signature matches an SPDK bdev event callback; presumably registered when
// the device is opened — confirm in the implementation.
static void bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev * bdev,
void * event_ctx);
};
// Per-thread I/O context for a birb_bdev_driver. Declaration only;
// non-copyable.
class birb_bdev_thread_context : public birb_driver_thread_context
{
public:
birb_bdev_thread_context(birb_bdev_driver * driver);
~birb_bdev_thread_context() override;
// Issue a read/write of `size` bytes at `offset` using `buffer`; `callback`
// is invoked with a success flag and `context`. The meaning of the int
// return value is defined by the implementation.
int read(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
int write(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
void poll() override;
birb_driver::birb_driver_status get_status() override;
private:
// Pairs a user callback with its opaque argument; presumably passed through
// SPDK as cb_arg to io_callback — confirm.
struct cb_context {
callback cb;
void * ctx;
};
DISALLOW_EVIL_CONSTRUCTORS(birb_bdev_thread_context);
spdk_io_channel * io_channel;
birb_driver::birb_driver_status status;
birb_bdev_driver * driver;
static void io_callback(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
};

View File

@ -0,0 +1,47 @@
#pragma once
#include "defs.hh"
#include "spdk/thread.h"
#include <cstdlib>
// Abstract base class for storage drivers. Concrete drivers report capacity,
// alignment, status and their driver type through the pure-virtual accessors.
class birb_driver
{
private:
// Macro comes from defs.hh (not shown); presumably disables copy/assignment - TODO confirm.
DISALLOW_EVIL_CONSTRUCTORS(birb_driver);
public:
// Status values returned by get_status().
enum birb_driver_status{
BIRB_SUCCESS,
BIRB_FAIL
};
// Driver kinds returned by get_type().
enum birb_driver_type{
BIRB_DRV_NVME,
BIRB_DRV_BDEV
};
// NOTE(review): units of capacity/alignment are not visible here - presumably bytes, verify.
virtual size_t get_capacity() = 0;
virtual birb_driver_status get_status() = 0;
virtual size_t get_align() = 0;
virtual birb_driver_type get_type() = 0;
virtual ~birb_driver() = default;
protected:
// Protected so that only derived classes can construct the base.
birb_driver() = default;
};
// Abstract per-thread I/O interface: asynchronous-style read/write calls that
// take a completion callback, plus poll() and get_status().
class birb_driver_thread_context
{
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_driver_thread_context);
public:
// Completion callback type: receives a bool and the opaque pointer supplied by the caller.
// NOTE(review): the bool is presumably a success flag - confirm against the implementations.
using callback = void (*)(bool, void *);
virtual int read(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
virtual int write(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
virtual void poll() = 0;
virtual birb_driver::birb_driver_status get_status() = 0;
virtual ~birb_driver_thread_context() = default;
protected:
// Protected so that only derived classes can construct the base.
birb_driver_thread_context() = default;
};

View File

@ -0,0 +1,65 @@
#pragma once
#include "storage/drivers/driver.hh"
#include "spdk/nvme.h"
#include "spdk/thread.h"
// birb_driver implementation that keeps an SPDK NVMe controller, a namespace
// and the I/O queue-pair options. Declaration only.
class birb_nvme_driver : public birb_driver
{
public:
// NOTE(review): presumably probes/attaches the controller matching dev_name - confirm in the .cc.
birb_nvme_driver(const char * dev_name);
~birb_nvme_driver() override;
size_t get_capacity() override;
birb_driver_status get_status() override;
birb_driver_type get_type() override;
size_t get_align() override;
// Accessors for the SPDK objects stored in this driver.
spdk_nvme_ctrlr * get_ctrlr();
spdk_nvme_ns * get_ns();
spdk_nvme_io_qpair_opts * get_io_qpair_opts();
private:
// Bundle of out-pointers and the requested device name.
// NOTE(review): presumably passed as cb_ctx to probe_cb/attach_cb - verify.
struct attach_context {
spdk_nvme_ctrlr ** ctrlr;
spdk_nvme_ns ** ns;
const char * dev_name;
int valid;
};
DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_driver);
birb_driver_status status;
spdk_nvme_ctrlr * ctrlr;
spdk_nvme_ns * ns;
spdk_nvme_io_qpair_opts opts;
// Static hooks with SPDK NVMe probe/attach callback signatures.
static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts);
static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts);
};
// Thread context for birb_nvme_driver: implements the read/write/poll
// interface of birb_driver_thread_context and keeps an spdk_nvme_qpair.
class birb_nvme_thread_context : public birb_driver_thread_context
{
public:
birb_nvme_thread_context(birb_nvme_driver * driver);
~birb_nvme_thread_context() override;
int read(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
int write(size_t offset, size_t size, char * buffer, callback callback, void * context) override;
void poll() override;
birb_driver::birb_driver_status get_status() override;
private:
// Pairs a user callback with its opaque argument.
// NOTE(review): presumably handed to io_callback as arg - confirm in the .cc.
struct cb_context {
callback cb;
void * ctx;
};
DISALLOW_EVIL_CONSTRUCTORS(birb_nvme_thread_context);
birb_driver::birb_driver_status status;
birb_nvme_driver * driver;
struct spdk_nvme_qpair * qpair;
// Static completion hook with the (arg, completion) signature.
static void io_callback(void *arg, const struct spdk_nvme_cpl *completion);
// Helpers converting a size / address to LBA units given an LBA size.
// NOTE(review): rounding behaviour is defined in the .cc, not visible here.
static uint32_t size_to_lba(size_t size, int lba_size);
static uint64_t addr_to_lba(size_t addr, int lba_size);
};

View File

@ -0,0 +1,47 @@
#pragma once
#include "defs.hh"
#include "spdk/thread.h"
#include <cstdlib>
// Abstract base class for storage drivers. Concrete drivers report capacity,
// alignment, status and their driver type through the pure-virtual accessors.
class birb_driver
{
private:
// Macro comes from defs.hh (not shown); presumably disables copy/assignment - TODO confirm.
DISALLOW_EVIL_CONSTRUCTORS(birb_driver);
public:
// Status values returned by get_status().
enum birb_driver_status{
BIRB_SUCCESS,
BIRB_FAIL
};
// Driver kinds returned by get_type().
enum birb_driver_type{
BIRB_DRV_NVME,
BIRB_DRV_BDEV
};
// NOTE(review): units of capacity/alignment are not visible here - presumably bytes, verify.
virtual size_t get_capacity() = 0;
virtual birb_driver_status get_status() = 0;
virtual size_t get_align() = 0;
virtual birb_driver_type get_type() = 0;
virtual ~birb_driver() = default;
protected:
// Protected so that only derived classes can construct the base.
birb_driver() = default;
};
// Abstract per-thread I/O interface: asynchronous-style read/write calls that
// take a completion callback, plus poll() and get_status().
class birb_driver_thread_context
{
private:
DISALLOW_EVIL_CONSTRUCTORS(birb_driver_thread_context);
public:
// Completion callback type: receives a bool and the opaque pointer supplied by the caller.
// NOTE(review): the bool is presumably a success flag - confirm against the implementations.
using callback = void (*)(bool, void *);
virtual int read(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
virtual int write(size_t offset, size_t size, char * buffer, callback callback, void * context) = 0;
virtual void poll() = 0;
virtual birb_driver::birb_driver_status get_status() = 0;
virtual ~birb_driver_thread_context() = default;
protected:
// Protected so that only derived classes can construct the base.
birb_driver_thread_context() = default;
};

53
inc/storage/io_gen.hh Normal file
View File

@ -0,0 +1,53 @@
#pragma once
#include <sys/endian.h>
#include <sys/types.h>
#include "defs.hh"
#include "gen.hh"
#include <random>
// Operation kind stored in io_generator_ctx::op.
enum io_generator_opcode {
IOGEN_READ,
IOGEN_WRITE
};
// Address selection mode passed to the io_generator constructor.
enum io_generator_address_mode {
IOGEN_ADDR_MONOTONIC_INCREASING,
IOGEN_ADDR_UNIFORM_RANDOM
};
// One generated request: its size, offset and operation.
// NOTE(review): size/offset are presumably in bytes - confirm against io_generator::issue.
struct io_generator_ctx {
unsigned long size;
uint64_t offset;
io_generator_opcode op;
};
//
// cur_offset is aligned to req_size
//
// Generates I/O requests of a fixed request size over a given capacity, with
// a configured read percentage and address mode. Declaration only.
class io_generator {
public:
// Fills *ctx with the next request.
// NOTE(review): the role of buf and the meaning of the int result are defined in the .cc.
int issue(struct io_generator_ctx * ctx, char * buf);
io_generator(unsigned long req_size,
unsigned long capacity,
unsigned int read_pct,
io_generator_address_mode addr_mode);
io_generator() = delete;
private:
unsigned long cur_offset;
const unsigned long capacity;
const unsigned long req_size;
const unsigned int read_pct;
const io_generator_address_mode addr_mode;
// Two independent random_device / mt19937 / distribution triples.
// NOTE(review): presumably the first picks read-vs-write and the addr_* set picks
// offsets - confirm in the .cc. Members are initialised in declaration order.
std::random_device rd;
std::mt19937 rng;
std::uniform_int_distribution<int> dist;
std::random_device addr_rd;
std::mt19937 addr_rng;
std::uniform_int_distribution<uint64_t> addr_dist;
DISALLOW_EVIL_CONSTRUCTORS(io_generator);
};

View File

@ -1,378 +0,0 @@
#include <cstdio>
#include <cstdlib>
#include <rte_common.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_byteorder.h>
#include <rte_config.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <atomic>
#include <unistd.h>
#include "pkt.h"
#include "ntrlog.h"
#include "rte_arp.h"
#include "rte_mbuf_core.h"
NTR_DECL_IMPL;
// Per-port mbuf count and per-lcore cache size used when main() creates the mbuf pools.
constexpr unsigned int MBUF_MAX_COUNT = 8191;
constexpr unsigned int MBUF_CACHE_SIZE = 250;
// Requested RX/TX descriptor counts; port_init() lets the driver adjust them.
constexpr unsigned int RX_RING_SIZE = 1024;
constexpr unsigned int TX_RING_SIZE = 1024;
// Number of RX/TX queues configured on the port.
constexpr unsigned int RX_RING_NUM = 1;
constexpr unsigned int TX_RING_NUM = 1;
// Maximum number of packets handled per rx burst in locore_main().
constexpr unsigned int BURST_SIZE = 32;
// Value-initialised base configuration copied by port_init().
static const struct rte_eth_conf port_conf_default{};
// Process-wide runtime state filled in by main().
struct options_t {
//states
uint16_t s_portid;
struct rte_ether_addr s_host_mac;
// Pool used by locore_main() to allocate reply packets.
struct rte_mempool * s_pkt_mempool;
};
struct options_t options;
/*
 * RX callback: stamps every valid packet in the burst with the TSC value read
 * at callback entry. The value is stored big-endian in srv_ts_rx. Packets that
 * fail check_valid_packet() are left untouched.
 *
 * Returns nb_pkts unchanged (no packet is dropped here).
 */
static uint16_t
rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
    const uint64_t now = rte_rdtsc();

    for (uint16_t i = 0; i < nb_pkts; i++) {
        struct packet_data * pkt_data = check_valid_packet(pkts[i]);

        if (pkt_data == NULL) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "rx_add_timestamp: ignoring invalid packet %p.\n", (void*)pkts[i]);
            continue;
        }

        // uint64_t is not necessarily unsigned long long; cast so the argument matches %llu.
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "rx_add_timestamp: tagged packet %p with %llu.\n", (void*)pkts[i], (unsigned long long)now);
        pkt_data->srv_ts_rx = rte_cpu_to_be_64(now);
    }

    return nb_pkts;
}
/*
 * TX callback: stamps every valid packet in the burst with the TSC value read
 * at callback entry. The value is stored big-endian in srv_ts_tx. Packets that
 * fail check_valid_packet() are left untouched.
 *
 * Returns nb_pkts unchanged (no packet is dropped here).
 */
static uint16_t
tx_calc_latency(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
    const uint64_t now = rte_rdtsc();

    for (uint16_t i = 0; i < nb_pkts; i++) {
        struct packet_data * pkt_data = check_valid_packet(pkts[i]);

        if (pkt_data == NULL) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "tx_calc_latency: ignoring invalid packet %p.\n", (void*)pkts[i]);
            continue;
        }

        // uint64_t is not necessarily unsigned long long; cast so the argument matches %llu.
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "tx_calc_latency: tagged packet %p with %llu.\n", (void*)pkts[i], (unsigned long long)now);
        pkt_data->srv_ts_tx = rte_cpu_to_be_64(now);
    }

    return nb_pkts;
}
static int
locore_main(void * _unused __rte_unused)
{
struct rte_mbuf *bufs[BURST_SIZE];
struct rte_mbuf *tx_bufs[BURST_SIZE];
struct packet_data *pkt_data;
uint32_t core_id = rte_lcore_id();
if (rte_eth_dev_socket_id(options.s_portid) > 0 && rte_eth_dev_socket_id(options.s_portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "locore_main: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n", options.s_portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running.\n", core_id);
while(true) {
uint16_t nb_tx = 0;
const uint16_t nb_rx = rte_eth_rx_burst(options.s_portid, 0, bufs, BURST_SIZE);
if (nb_rx == 0) {
continue;
}
for(int i = 0; i < nb_rx; i++) {
pkt_data = check_valid_packet(bufs[i]);
if (pkt_data == NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "locore_main: core %d skipping invalid packet %p.\n", core_id, (void*)bufs[i]);
dump_pkt(bufs[i]);
rte_pktmbuf_free(bufs[i]);
continue;
}
uint32_t dst_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.dst_addr);
uint32_t src_ip = rte_be_to_cpu_32(pkt_data->pkt_hdr.ipv4_hdr.src_addr);
uint16_t src_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.src_port);
uint16_t dst_port = rte_be_to_cpu_16(pkt_data->pkt_hdr.udp_hdr.dst_port);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d packet %p from %d.%d.%d.%d(%x:%x:%x:%x:%x:%x) to %d.%d.%d.%d(%x:%x:%x:%x:%x:%x), sport %d, dport %d, epoch %d\n",
core_id,
(void*)bufs[i],
(src_ip >> 24) & 0xff,
(src_ip >> 16) & 0xff,
(src_ip >> 8) & 0xff,
(src_ip >> 0) & 0xff,
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[0],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[1],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[2],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[3],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[4],
pkt_data->pkt_hdr.eth_hdr.s_addr.addr_bytes[5],
(dst_ip >> 24) & 0xff,
(dst_ip >> 16) & 0xff,
(dst_ip >> 8) & 0xff,
(dst_ip >> 0) & 0xff,
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[0],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[1],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[2],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[3],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[4],
pkt_data->pkt_hdr.eth_hdr.d_addr.addr_bytes[5],
src_port,
dst_port,
rte_be_to_cpu_32(pkt_data->epoch));
// swap s_addr and d_addr
struct rte_mbuf * pkt_buf = rte_pktmbuf_alloc(options.s_pkt_mempool);
if (pkt_buf == NULL) {
rte_exit(EXIT_FAILURE, "locore_main: failed to allocate memory for pkt_buf");
}
struct packet_data * tx_data = construct_udp_pkt_hdr(pkt_buf,
&options.s_host_mac,
&pkt_data->pkt_hdr.eth_hdr.s_addr,
dst_ip,
src_ip,
dst_port,
src_port);
if (tx_data == NULL) {
rte_exit(EXIT_FAILURE, "failed to construct tx packet %p", (void*)pkt_buf);
}
// copy, endianess doesn't matter
tx_data->epoch = pkt_data->epoch;
tx_data->magic = pkt_data->magic;
tx_data->clt_ts_rx = pkt_data->clt_ts_rx;
tx_data->clt_ts_tx = pkt_data->clt_ts_tx;
tx_data->srv_ts_rx = pkt_data->srv_ts_rx;
tx_data->srv_ts_tx = pkt_data->srv_ts_tx;
// queue for burst send
tx_bufs[nb_tx++] = pkt_buf;
// free rx packet
rte_pktmbuf_free(bufs[i]);
}
const uint16_t nb_tx_succ = rte_eth_tx_burst(options.s_portid, 0, tx_bufs, nb_tx);
// cleanup unsent packets
// don't need to free others because it's offloaded
if (nb_tx_succ < nb_tx) {
rte_exit(EXIT_FAILURE, "locore_main: failed to send some packets.\n");
}
}
return 0;
}
// Configure and start one Ethernet port: RX_RING_NUM/TX_RING_NUM queues,
// checksum and fast-free offloads, promiscuous mode, and the RX/TX timestamp
// callbacks on queue 0. RX queues draw their mbufs from mbuf_pool.
// Returns 0 on success, -1 for an invalid port or callback registration
// failure, otherwise the non-zero code returned by the failing DPDK call.
static int
port_init(uint16_t portid, struct rte_mempool *mbuf_pool)
{
struct rte_eth_dev_info dev_info;
struct rte_eth_conf port_conf = port_conf_default;
struct rte_eth_txconf txconf;
struct rte_eth_rxconf rxconf;
uint16_t nb_rxd = RX_RING_SIZE;
uint16_t nb_txd = TX_RING_SIZE;
if(!rte_eth_dev_is_valid_port(portid)) {
return -1;
}
int ret = rte_eth_dev_info_get(portid, &dev_info);
if (ret != 0) {
return ret;
}
// Offloads are requested unconditionally.
// NOTE(review): dev_info capability bits are not consulted, so configure presumably
// fails on NICs lacking them - confirm that this is intended.
port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
/* Configure the Ethernet device. */
ret = rte_eth_dev_configure(portid, RX_RING_NUM, TX_RING_NUM, &port_conf);
if (ret != 0)
return ret;
// Let the driver clamp the requested descriptor counts to what it supports.
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, &nb_txd);
if (ret != 0)
return ret;
/* Allocate and set up RX_RING_NUM RX queues for the port. */
rxconf = dev_info.default_rxconf;
for (uint32_t i = 0; i < RX_RING_NUM; i++) {
ret = rte_eth_rx_queue_setup(portid, i, nb_rxd, rte_eth_dev_socket_id(portid), &rxconf, mbuf_pool);
if (ret < 0)
return ret;
}
// TX queues inherit the port-level TX offloads chosen above.
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
/* Allocate and set up TX_RING_NUM TX queues for the port. */
for (uint32_t i = 0; i < TX_RING_NUM; i++) {
ret = rte_eth_tx_queue_setup(portid, i, nb_txd, rte_eth_dev_socket_id(portid), &txconf);
if (ret < 0)
return ret;
}
ret = rte_eth_dev_start(portid);
if (ret < 0)
return ret;
/* Read the port MAC address; the value is only used as a sanity check and is not printed here. */
struct rte_ether_addr addr;
ret = rte_eth_macaddr_get(portid, &addr);
if (ret != 0)
return ret;
/* Enable RX in promiscuous mode for the Ethernet device. */
ret = rte_eth_promiscuous_enable(portid);
if (ret != 0)
return ret;
// Register the timestamping callbacks on queue 0; either registration failing is an error.
if (rte_eth_add_tx_callback(portid, 0, tx_calc_latency, NULL) == NULL || rte_eth_add_rx_callback(portid, 0, rx_add_timestamp, NULL) == NULL) {
return -1;
}
return 0;
}
// Print the command-line help text to stdout.
static void usage()
{
    static const char help_text[] =
        "Usage:\n"
        " -v(vv): verbose mode\n"
        " -h: display the information\n";

    fputs(help_text, stdout);
}
/*
 * Entry point: initialises the DPDK EAL, parses the remaining -h / -v options,
 * creates the RX and reply mbuf pools, configures the first available port and
 * launches locore_main() on the next worker lcore, then waits for it.
 */
int main(int argc, char* argv[])
{
    unsigned int nb_ports;
    struct rte_mempool *mbuf_pool, *mbuf_pool_pkt;

    // init dpdk; the return value is the number of arguments the EAL consumed
    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
    }

    argc -= ret;
    argv += ret;

    // set warning level
    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);

    {
        int c;
        // parse arguments
        while((c = getopt(argc, argv, "hv")) != -1) {
            switch (c) {
                case 'v':
                    ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
                    break;
                case 'h':
                    usage();
                    // rte_exit() takes a printf-style format; NULL is not a valid format string.
                    rte_exit(EXIT_SUCCESS, "\n");
                    break;
                default:
                    usage();
                    // getopt() returns '?' for unknown options; the offending
                    // character is in optopt. A bad argument is a failure.
                    rte_exit(EXIT_FAILURE, "unknown argument: %c\n", optopt);
                    break;
            }
        }
    }

    // XXX: signal handler to exit

    nb_ports = rte_eth_dev_count_avail();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
    }

    // create a mbuf memory pool on the socket
    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
    if (mbuf_pool == nullptr) {
        rte_exit(EXIT_FAILURE, "cannot create mbuf pool\n");
    }

    // create a pkt mbuf memory pool on the socket
    mbuf_pool_pkt = rte_pktmbuf_pool_create("MBUF_POOL_PKT", MBUF_MAX_COUNT * nb_ports, MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
    if (mbuf_pool_pkt == nullptr) {
        rte_exit(EXIT_FAILURE, "cannot create mbuf_pkt pool\n");
    }
    options.s_pkt_mempool = mbuf_pool_pkt;

    uint16_t portid = rte_eth_find_next(0);
    if (portid == RTE_MAX_ETHPORTS) {
        rte_exit(EXIT_FAILURE, "cannot find an available port\n");
    }
    options.s_portid = portid;

    if (port_init(portid, mbuf_pool) != 0) {
        rte_exit(EXIT_FAILURE, "cannot init port %d\n", portid);
    }

    if (rte_eth_macaddr_get(portid, &options.s_host_mac) != 0) {
        rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n", portid);
    }

    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n", portid,
        options.s_host_mac.addr_bytes[0],
        options.s_host_mac.addr_bytes[1],
        options.s_host_mac.addr_bytes[2],
        options.s_host_mac.addr_bytes[3],
        options.s_host_mac.addr_bytes[4],
        options.s_host_mac.addr_bytes[5]);

    uint16_t lcore_id = rte_get_next_lcore(0, true, false);
    if (lcore_id == RTE_MAX_LCORE) {
        rte_exit(EXIT_FAILURE, "cannot detect lcores.\n");
    }

    if (rte_eal_remote_launch(locore_main, NULL, lcore_id) != 0) {
        rte_exit(EXIT_FAILURE, "failed to launch function on locore %d\n", lcore_id);
    }

    if (rte_eal_wait_lcore(lcore_id) != 0) {
        rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n", lcore_id);
    }

    // shouldn't get here
    return 0;
}

95
libgen/generator.cc Normal file
View File

@ -0,0 +1,95 @@
// modified from mutilate
#include "gen.hh"
// Returns a newly allocated GEV(30.7984, 8.20449, 0.078688) generator.
// The object is created with new and handed to the caller as a raw pointer.
Generator *
createFacebookKey()
{
    Generator *key_gen = new GEV(30.7984, 8.20449, 0.078688);
    return key_gen;
}
// Returns a newly allocated Discrete generator wrapping
// GPareto(15.0, 214.476, 0.348238), with fifteen (first, second) pairs
// registered through Discrete::add() in the order listed below.
Generator *
createFacebookValue()
{
    // Argument pairs passed to Discrete::add(), in call order.
    static const double pairs[][2] = {
        { 0.00536, 0.0 },
        { 0.00047, 1.0 },
        { 0.17820, 2.0 },
        { 0.09239, 3.0 },
        { 0.00018, 4.0 },
        { 0.02740, 5.0 },
        { 0.00065, 6.0 },
        { 0.00606, 7.0 },
        { 0.00023, 8.0 },
        { 0.00837, 9.0 },
        { 0.00837, 10.0 },
        { 0.08989, 11.0 },
        { 0.00092, 12.0 },
        { 0.00326, 13.0 },
        { 0.01980, 14.0 },
    };

    Generator *fallback = new GPareto(15.0, 214.476, 0.348238);
    Discrete *dist = new Discrete(fallback);

    for (const auto &entry : pairs) {
        dist->add(entry[0], entry[1]);
    }

    return dist;
}
// Returns a newly allocated GPareto(0, 16.0292, 0.154971) generator.
// The object is created with new and handed to the caller as a raw pointer.
Generator *
createFacebookIA()
{
    Generator *ia_gen = new GPareto(0, 16.0292, 0.154971);
    return ia_gen;
}
// Build a Generator from a textual specification.
//
// Accepted forms:
//   "fb_key" / "fb_value" / "fb_ia"  -> the corresponding preset generator
//   a number (atoi() != 0, or "0")   -> Fixed(atof(str))
//   "<type>[:a1[,a2[,a3]]]"          -> fixed, normal, exponential, pareto,
//                                       gev or uniform; missing arguments
//                                       default to 0.0
//
// The type name is matched case-insensitively anywhere in the string (same as
// before). Calls DIE() when the string cannot be parsed or names no known
// type. The returned object is allocated with new.
Generator *
createGenerator(std::string str)
{
    if (str == "fb_key")
        return createFacebookKey();
    else if (str == "fb_value")
        return createFacebookValue();
    else if (str == "fb_ia")
        return createFacebookIA();

    // strtok_r() writes into its input, so work on a private copy. Using a
    // std::string as the scratch buffer frees it on every exit path.
    std::string scratch(str);
    char *s_copy = &scratch[0];

    if (atoi(s_copy) != 0 || !strcmp(s_copy, "0")) {
        return new Fixed(atof(s_copy));
    }

    char *saveptr = NULL;
    char *t_ptr = strtok_r(s_copy, ":", &saveptr);
    if (t_ptr == NULL)
        DIE("strtok(.., \":\") failed to parse %s", str.c_str());
    char *a_ptr = strtok_r(NULL, ":", &saveptr);

    // A spec without ":" (e.g. "fixed") has no argument list. Continuing a
    // strtok_r() scan with a NULL save pointer is undefined, so only tokenise
    // when there is something to tokenise.
    char *s1 = NULL;
    char *s2 = NULL;
    char *s3 = NULL;
    if (a_ptr != NULL) {
        saveptr = NULL;
        s1 = strtok_r(a_ptr, ",", &saveptr);
        s2 = strtok_r(NULL, ",", &saveptr);
        s3 = strtok_r(NULL, ",", &saveptr);
    }

    double a1 = s1 ? atof(s1) : 0.0;
    double a2 = s2 ? atof(s2) : 0.0;
    double a3 = s3 ? atof(s3) : 0.0;

    if (strcasestr(str.c_str(), "fixed"))
        return new Fixed(a1);
    else if (strcasestr(str.c_str(), "normal"))
        return new Normal(a1, a2);
    else if (strcasestr(str.c_str(), "exponential"))
        return new Exponential(a1);
    else if (strcasestr(str.c_str(), "pareto"))
        return new GPareto(a1, a2, a3);
    else if (strcasestr(str.c_str(), "gev"))
        return new GEV(a1, a2, a3);
    else if (strcasestr(str.c_str(), "uniform"))
        return new Uniform(a1);

    DIE("Unable to create Generator '%s'", str.c_str());
    return NULL;
}

276
libgen/loadgen.cc Normal file
View File

@ -0,0 +1,276 @@
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/thr.h>
#include <pthread.h>
#include <pthread_np.h>
#include <topo.h>
#include <unistd.h>
#include "nms.h"
#include "gen.hh"
#include <atomic>
// Worker thread body. _tinfo is the thread's struct thread_info.
//
// Setup: uses the shared buffers from thread_info, or allocates a private
// from/to pair (from-buffer on the NUMA domain of the thread's core, to-buffer
// on the target domain). For "pull" threads the two buffers are swapped.
// init_status is set to 1 on success and -1 on failure.
//
// Run loop: while the shared state is STATE_RUN, copies transaction_size bytes
// from the from-buffer to the to-buffer whenever the current time reaches
// next_ts, then advances next_ts by the generated inter-arrival time. Exits on
// STATE_END and frees private buffers.
void *
memload_generator::worker_thrd(void *_tinfo)
{
    auto *tinfo = (struct thread_info *)_tinfo;
    void *from_buffer = nullptr;
    void *to_buffer = nullptr;
    void *tmp;

    // A single transaction must fit in the buffer, otherwise the memcpy below
    // would run past the end even at offset 0.
    if (tinfo->opts->transaction_size > tinfo->opts->buffer_size) {
        if (tinfo->opts->verbose) {
            fprintf(stderr,
                "memload_generator <thread %d>: transaction size exceeds buffer size\n",
                tinfo->tid);
        }
        tinfo->init_status.store(-1);
        return nullptr;
    }

    if (tinfo->opts->shared_buffer) {
        from_buffer = tinfo->from_buffer;
        to_buffer = tinfo->to_buffer;
    } else {
        if (tinfo->opts->verbose) {
            fprintf(stdout,
                "memload_generator <thread %d>: allocating fbuf %lu bytes on domain %d...\n",
                tinfo->tid, tinfo->opts->buffer_size,
                topo_core_to_numa(tinfo->coreid));
        }
        from_buffer = nms_alloc_static(topo_core_to_numa(
                           tinfo->coreid),
            tinfo->opts->buffer_size);
        if (tinfo->opts->verbose) {
            fprintf(stdout,
                "memload_generator <thread %d>: allocating tbuf %lu bytes on domain %d...\n",
                tinfo->tid, tinfo->opts->buffer_size, tinfo->target_dom);
        }
        to_buffer = nms_alloc_static(tinfo->target_dom,
            tinfo->opts->buffer_size);
    }

    if (from_buffer == nullptr || to_buffer == nullptr) {
        if (tinfo->opts->verbose) {
            fprintf(stderr,
                "memload_generator <thread %d>: failed to allocate memory\n",
                tinfo->tid);
        }
        // Private buffers belong to this thread: release whichever half was
        // obtained so a partial failure does not leak it.
        if (!tinfo->opts->shared_buffer) {
            if (from_buffer != nullptr) {
                nms_free_static(from_buffer, tinfo->opts->buffer_size);
            }
            if (to_buffer != nullptr) {
                nms_free_static(to_buffer, tinfo->opts->buffer_size);
            }
        }
        tinfo->init_status.store(-1);
        return nullptr;
    }

    if (tinfo->pull) {
        tmp = from_buffer;
        from_buffer = to_buffer;
        to_buffer = tmp;
    }

    if (tinfo->opts->verbose) {
        fprintf(stdout, "memload_generator <thread %d, pull %d>: running...\n", tinfo->tid, tinfo->pull);
    }
    // Signal the constructor that this thread finished its setup.
    tinfo->init_status.store(1);

    uint64_t next_ts = topo_uptime_ns();
    size_t cur_offset = 0;
    uint64_t cur_ts = 0;
    while (true) {
        switch (tinfo->state->load()) {
        case STATE_RUN:
            cur_ts = topo_uptime_ns();
            if (cur_ts >= next_ts) {
                // Wrap to the start when the next transaction would not fit.
                if (cur_offset + tinfo->opts->transaction_size >
                    tinfo->opts->buffer_size) {
                    cur_offset = 0;
                }

                memcpy((char *)to_buffer + cur_offset,
                    (char *)from_buffer + cur_offset,
                    tinfo->opts->transaction_size);
                tinfo->num_trans.fetch_add(1);

                // set_transactions() requests a schedule restart via reset_ts.
                if (tinfo->reset_ts.load(
                    std::memory_order_relaxed)) {
                    tinfo->reset_ts.store(false,
                        std::memory_order_relaxed);
                    next_ts = cur_ts;
                }
                next_ts += tinfo->ia_gen->generate() *
                    (double)S2NS;
                cur_offset += tinfo->opts->transaction_size;
            }
            break;
        case STATE_END:
            goto end;
        case STATE_RDY:
            // Keep the schedule anchored to "now" while idle.
            next_ts = topo_uptime_ns();
            break;
        case STATE_INIT:
        default:
            break;
        }
    }
end:
    if (tinfo->opts->verbose) {
        fprintf(stdout, "memload_generator <thread %d>: exiting...\n",
            tinfo->tid);
    }

    if (!tinfo->opts->shared_buffer) {
        nms_free_static(from_buffer, tinfo->opts->buffer_size);
        nms_free_static(to_buffer, tinfo->opts->buffer_size);
    }

    return nullptr;
}
// Create one worker thread per CPU set in `threads`, each pinned to its core.
//
// threads       - cores to run workers on; bits are cleared as threads are created
// modes         - cores whose worker runs in "pull" mode
// target_domain - the first set bit selects the target NUMA domain
// opt           - options, copied into this object
// success       - out: true only when every worker finished its setup
//
// With opts.shared_buffer the two buffers are allocated once here and shared
// by all workers; otherwise each worker allocates its own pair.
memload_generator::memload_generator(cpuset_t *threads, cpuset_t * modes, cpuset_t *target_domain,
    struct memload_generator_options *opt, bool *success)
{
    *success = false;
    state.store(STATE_INIT);
    std::memcpy(&this->opts, opt, sizeof(memload_generator_options));

    int nextcore = CPU_FFS(threads) - 1;
    int target_domain_id = CPU_FFS(target_domain) - 1;
    int num_cores = CPU_COUNT(threads);
    if (target_domain_id < 0 || num_cores == 0) {
        return;
    }

    // The requested rate is split evenly across the workers.
    double thread_tps = (double)opt->trans_per_second / (double)num_cores;
    void *local_buffer = nullptr;
    void *target_buffer = nullptr;
    int tid = 0;

    if (opts.shared_buffer) {
        local_buffer = nms_alloc_static(topo_core_to_numa(nextcore),
            opt->buffer_size);
        target_buffer = nms_alloc_static(target_domain_id,
            opt->buffer_size);
        if (local_buffer == nullptr || target_buffer == nullptr) {
            // No thread_info references these yet, so free the half that
            // succeeded here or it would be lost.
            if (local_buffer != nullptr) {
                nms_free_static(local_buffer, opt->buffer_size);
            }
            if (target_buffer != nullptr) {
                nms_free_static(target_buffer, opt->buffer_size);
            }
            goto end;
        }
    }

    while (nextcore != -1) {
        auto info = new struct thread_info;
        cpuset_t cpuset;
        pthread_attr_t attr;

        info->ia_gen = createGenerator(opts.ia_dist);
        if (info->ia_gen == nullptr) {
            // info was never handed to a thread or stored in thr_infos.
            delete info;
            state.store(STATE_END);
            goto end;
        }
        info->ia_gen->set_lambda(thread_tps);
        info->init_status.store(0);
        info->state = &this->state;
        info->reset_ts.store(false, std::memory_order_relaxed);
        info->num_trans.store(0);
        info->opts = &this->opts;
        info->tid = tid;
        info->coreid = nextcore;
        info->target_dom = target_domain_id;
        info->from_buffer = local_buffer;
        info->to_buffer = target_buffer;
        info->pull = CPU_ISSET(nextcore, modes);

        CPU_ZERO(&cpuset);
        CPU_SET(nextcore, &cpuset);
        pthread_attr_init(&attr);
        pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &cpuset);
        int rc = pthread_create(&info->pthr, &attr, worker_thrd, info);
        pthread_attr_destroy(&attr);
        if (rc != 0) {
            // Without a running thread nobody would ever set init_status, and
            // the destructor would join an invalid handle.
            if (opts.verbose) {
                fprintf(stderr,
                    "memload_generator: failed to create thread %d on core %d: error %d\n",
                    tid, nextcore, rc);
            }
            delete info;
            state.store(STATE_END);
            goto end;
        }

        if (opts.verbose) {
            fprintf(stdout,
                "memload_generator: created thread %d on core %d target domain %d\n",
                tid, nextcore, target_domain_id);
        }

        thr_infos.push_back(info);

        CPU_CLR(nextcore, threads);
        nextcore = CPU_FFS(threads) - 1;
        tid++;
    }

    // Spin until every worker reports its setup result.
    for (auto tinfo : thr_infos) {
        int status;
        while ((status = tinfo->init_status.load()) != 1) {
            if (status == -1) {
                state.store(STATE_END);
                *success = false;
                goto end;
            }
        }
    }

    state.store(STATE_RDY);
    *success = true;

end:
    if (opts.verbose) {
        // Report the flag itself, not whether the pointer is non-null.
        fprintf(stdout,
            "memload_generator: exiting constructor. Success: %d...\n",
            *success ? 1 : 0);
    }
}
// Move the generator from STATE_RDY to STATE_RUN.
// Returns true when the transition was made, false when the state was not STATE_RDY.
bool
memload_generator::start()
{
    const bool is_ready = (this->state.load() == STATE_RDY);

    if (is_ready) {
        this->state.store(memload_generator::STATE_RUN);
    }
    return is_ready;
}
// Move the generator from STATE_RUN back to STATE_RDY.
// Returns true when the transition was made, false when the state was not STATE_RUN.
bool
memload_generator::stop()
{
    const bool is_running = (this->state.load() == STATE_RUN);

    if (is_running) {
        this->state.store(memload_generator::STATE_RDY);
    }
    return is_running;
}
// Change the aggregate transaction rate. The rate is divided evenly between
// the worker threads and each worker is asked to restart its schedule through
// its reset_ts flag. Returns false (and changes nothing) while the generator
// is in STATE_END or STATE_INIT.
bool
memload_generator::set_transactions(uint64_t tps)
{
    if (this->state.load() == STATE_END ||
        this->state.load() == STATE_INIT) {
        return false;
    }

    for (auto worker : thr_infos) {
        worker->ia_gen->set_lambda(
            (double)tps / (double)thr_infos.size());
        worker->reset_ts.store(true, std::memory_order_relaxed);
    }
    return true;
}
// Sum of the per-thread transaction counters at the time of the call.
uint64_t
memload_generator::get_transactions()
{
    uint64_t sum = 0;

    for (size_t idx = 0; idx < thr_infos.size(); idx++) {
        sum += thr_infos[idx]->num_trans.load();
    }
    return sum;
}
// Stop all workers (STATE_END), join them, free their thread_info records and,
// when buffers are shared, free the shared pair exactly once.
memload_generator::~memload_generator()
{
    // Stay null when no thread was ever created (e.g. the constructor bailed
    // out early); the previous code passed uninitialised pointers to
    // nms_free_static() in that case.
    void *buf1 = nullptr;
    void *buf2 = nullptr;

    this->state.store(STATE_END);
    for (auto i : thr_infos) {
        pthread_join(i->pthr, NULL);
        // With shared buffers every thread_info holds the same two pointers.
        buf1 = i->from_buffer;
        buf2 = i->to_buffer;
        delete i;
    }

    if (opts.shared_buffer) {
        if (buf1 != nullptr) {
            nms_free_static(buf1, opts.buffer_size);
        }
        if (buf2 != nullptr) {
            nms_free_static(buf2, opts.buffer_size);
        }
    }
}

205
libnms/alloc.c Normal file
View File

@ -0,0 +1,205 @@
#include <pthread.h>
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/thr.h>
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>
#include <stdatomic.h>
#include <string.h>
#include <assert.h>
#include <nms.h>
/* Dimensions of the nms_desc region table: [MAX_NUMA_DOMAINS][MAX_REGIONS]. */
#define MAX_NUMA_DOMAINS (64)
#define MAX_REGIONS (64)
/* Size in bytes of each region obtained by nms_desc_add_region() (1 GiB). */
#define REGION_SIZE (1024 * 1024 * 1024)
/* Granularity to which nms_region_malloc() rounds a region's occupied size. */
#define PAGE_SIZE (4096)
/* One contiguous mapping that nms_region_malloc() carves allocations from. */
struct nms_region {
uintptr_t start_addr; /* base address of the mapping */
size_t size; /* total bytes in the mapping */
size_t occupied; /* bytes already handed out, rounded up to PAGE_SIZE */
};
/* Allocator state: a lock plus, per NUMA node, an array of regions. */
struct nms_desc {
// alloc
pthread_mutex_t alloc_lock; /* held by nms_desc_malloc() while it allocates */
struct nms_region regions[MAX_NUMA_DOMAINS][MAX_REGIONS];
int region_sz[MAX_NUMA_DOMAINS]; /* number of regions in use for each node */
};
/* Init state used by nms_init(): 0 = not started, 2 = in progress, 1 = ready. */
static _Atomic(int) initialized = 0;
/* The single allocator instance behind nms_malloc() / nms_free(). */
static struct nms_desc g_desc;
/*
 * Release a mapping of sz bytes starting at buf by unmapping it.
 * The munmap() result is not reported to the caller.
 */
void
nms_free_static(void * buf, size_t sz)
{
    (void)munmap(buf, sz);
}
/*
 * Map sz bytes of anonymous memory backed by NUMA domain node_id.
 *
 * The calling thread's domain policy is temporarily switched to round-robin
 * over {node_id}, the mapping is created and every byte is written so the
 * pages are faulted in under that policy, and the original policy is restored.
 *
 * Returns the mapping, or NULL on failure (invalid node_id, cpuset call
 * failure, or mmap failure). Release the mapping with nms_free_static().
 */
void *
nms_alloc_static(int node_id, size_t sz)
{
    long tid;
    domainset_t orig_dom;
    int orig_policy;
    void * region;
    int ret;

    /* DOMAINSET_SET() indexes a fixed-size bitset; reject out-of-range ids. */
    if (node_id < 0 || node_id >= DOMAINSET_SETSIZE) {
        fprintf(stderr, "libnms: invalid node id %d\n", node_id);
        return NULL;
    }

    thr_self(&tid);

    DOMAINSET_ZERO(&orig_dom);

    // save existing thread's allocation strategy
    ret = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, &orig_policy);
    if (ret != 0) {
        fprintf(stderr, "libnms: cpuset_getdomain failed with %d\n", errno);
        return NULL;
    }

    domainset_t tmp_domain;
    DOMAINSET_ZERO(&tmp_domain);
    DOMAINSET_SET(node_id, &tmp_domain);

    ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(tmp_domain), &tmp_domain, DOMAINSET_POLICY_ROUNDROBIN);
    if (ret != 0) {
        fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
        return NULL;
    }

    region = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_ALIGNED_SUPER | MAP_NOCORE | MAP_PRIVATE | MAP_PREFAULT_READ, -1, 0);
    if (region == MAP_FAILED) {
        fprintf(stderr, "libnms: mmap failed with %d\n", errno);
        /* Do not leave the thread pinned to the temporary policy. */
        (void)cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
        return NULL;
    }

    /*
     * Write every byte so each page is faulted in while the temporary policy
     * is active. The previous code also accumulated reads into an
     * uninitialised variable, which is undefined behaviour; the stores alone
     * are sufficient.
     */
    uint8_t * bytes = region;
    for (size_t i = 0; i < sz; i++) {
        bytes[i] = (uint8_t)i;
    }

    // restore existing thread's allocation strategy
    ret = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, sizeof(orig_dom), &orig_dom, orig_policy);
    if (ret != 0) {
        fprintf(stderr, "libnms: cpuset_setdomain failed with %d\n", errno);
        /* Unmap exactly what was mapped (sz), not REGION_SIZE. */
        munmap(region, sz);
        return NULL;
    }

    return region;
}
/*
 * Reset an allocator descriptor: zero the whole structure, then initialise its
 * mutex with default attributes. The verbose argument is not used.
 * Always returns 0.
 */
static int
nms_desc_init(struct nms_desc * desc, int verbose)
{
    (void)verbose;

    memset(desc, 0, sizeof(*desc));
    pthread_mutex_init(&desc->alloc_lock, NULL);

    return 0;
}
/*
 * Bump-allocate size bytes from a region.
 *
 * Returns the address at the current end of the occupied part when the request
 * fits, and then advances the occupied size, rounded up to a PAGE_SIZE
 * multiple. Returns NULL, leaving the region unchanged, when it does not fit.
 */
static void *
nms_region_malloc(struct nms_region * region, size_t size)
{
    const size_t page_mask = PAGE_SIZE - 1;
    void * chunk;

    if (region->occupied + size > region->size) {
        return NULL;
    }

    chunk = (void *)(region->start_addr + region->occupied);
    region->occupied = (region->occupied + size + page_mask) & ~page_mask;

    return chunk;
}
static int
nms_desc_add_region(struct nms_desc * desc, int nodeid, size_t size)
{
void * ret;
int idx;
ret = nms_alloc_static(nodeid, REGION_SIZE);
if (ret == NULL) {
fprintf(stderr, "libnms: failed to allocate region on node %d\n", nodeid);
return ENOMEM;
}
desc->region_sz[nodeid]++;
idx = desc->region_sz[nodeid] - 1;
desc->regions[nodeid][idx].start_addr = (uintptr_t)ret;
desc->regions[nodeid][idx].occupied = 0;
desc->regions[nodeid][idx].size = REGION_SIZE;
return 0;
}
static void *
nms_desc_malloc(struct nms_desc * desc, unsigned int nodeid, size_t size)
{
void * ret = NULL;
int idx;
int new_region = 0;
if (size > REGION_SIZE) {
return NULL;
}
pthread_mutex_lock(&desc->alloc_lock);
retry:
if (desc->region_sz[nodeid] > 0) {
idx = desc->region_sz[nodeid] - 1;
ret = nms_region_malloc(&desc->regions[nodeid][idx], size);
}
if (ret == NULL) {
// we need a new region
if (nms_desc_add_region(desc, nodeid, REGION_SIZE) != 0) {
pthread_mutex_unlock(&desc->alloc_lock);
return NULL;
}
fprintf(stdout, "libnms: malloc request of size %zu -> allocated new region on node %d\n", size, nodeid);
goto retry;
}
pthread_mutex_unlock(&desc->alloc_lock);
return ret;
}
/*
 * Placeholder: freeing individual allocations is not implemented, so this
 * function ignores all of its arguments and does nothing.
 */
static void
nms_desc_free(struct nms_desc * desc, unsigned int node, void * addr)
{
    (void)desc;
    (void)node;
    (void)addr;
}
/*
 * One-time initialisation of the global allocator.
 *
 * The first caller (the one that moves `initialized` from 0 to 2) initialises
 * g_desc and then publishes 1. Any other caller spins until the value is 1 and
 * prints a notice. Always returns 0.
 */
int
nms_init(int verbose)
{
    int expected = 0;

    if (!atomic_compare_exchange_strong(&initialized, &expected, 2)) {
        /* Someone else is (or was) initialising: wait until it is done. */
        while(atomic_load(&initialized) != 1) {
        }
        fprintf(stdout,"libnms: already initialized.\n");
        return 0;
    }

    nms_desc_init(&g_desc, verbose);
    atomic_store(&initialized, 1);
    return 0;
}
/*
 * Allocate sz bytes on NUMA node nodeid from the global allocator.
 * Asserts that nms_init() has completed. Returns NULL on failure.
 */
void *
nms_malloc(int nodeid, size_t sz)
{
    void * mem;

    assert(atomic_load(&initialized) == 1);
    mem = nms_desc_malloc(&g_desc, nodeid, sz);
    return mem;
}
/*
 * Return an allocation to the global allocator. Asserts that nms_init() has
 * completed, then forwards to nms_desc_free(), which currently does nothing.
 */
void
nms_free(int nodeid, void * addr)
{
    assert(atomic_load(&initialized) == 1);

    nms_desc_free(&g_desc, nodeid, addr);
}

46
libntr/ntr.c Normal file
View File

@ -0,0 +1,46 @@
#include "ntr.h"
/*
 * Current log level for each dependency id.
 * NOTE(review): the initialiser names only element 0, so only that entry
 * starts at NTR_LEVEL_DEFAULT; every other entry starts at 0.
 */
static int ntr_log_levels[NTR_DEP_MAX] = { NTR_LEVEL_DEFAULT };
/* Output stream used by ntr(); NULL until ntr_init() or ntr_set_output() sets it. */
static FILE *ntr_out;
/* Initialise the logger: direct all output to stdout. */
void
ntr_init()
{
    FILE *default_sink = stdout;

    ntr_out = default_sink;
}
/*
 * Emit a printf-style message for dependency `dep` when `level` does not
 * exceed that dependency's configured level.
 *
 * Messages with a dep outside [0, NTR_DEP_MAX) are dropped. If no output
 * stream has been configured yet, the message goes to stdout, which is the
 * same default ntr_init() installs.
 */
void
ntr(int dep, int level, const char *fmt, ...)
{
    /* A negative dep would index before the start of ntr_log_levels. */
    if (dep < 0 || dep >= NTR_DEP_MAX || level > ntr_log_levels[dep]) {
        return;
    }

    /* ntr_out is NULL until ntr_init()/ntr_set_output() runs; never pass NULL to vfprintf. */
    FILE *sink = (ntr_out != NULL) ? ntr_out : stdout;

    va_list vl;
    va_start(vl, fmt);
    vfprintf(sink, fmt, vl);
    va_end(vl);
}
/*
 * Set the log level of dependency `dep`.
 * Requests with a dep outside [0, NTR_DEP_MAX) are ignored.
 */
void
ntr_set_level(int dep, int level)
{
    /* Reject negative ids as well: they would write before the array. */
    if (dep >= 0 && dep < NTR_DEP_MAX) {
        ntr_log_levels[dep] = level;
    }
}
/* Redirect log output to f. A NULL stream is ignored and the current one is kept. */
void
ntr_set_output(FILE *f)
{
    if (f == NULL) {
        return;
    }
    ntr_out = f;
}
/*
 * Return the log level of dependency `dep`, or 0 when dep is outside
 * [0, NTR_DEP_MAX).
 */
int
ntr_get_level(int dep)
{
    /* Reject negative ids as well: they would read before the array. */
    if (dep >= 0 && dep < NTR_DEP_MAX) {
        return ntr_log_levels[dep];
    }
    return 0;
}

989
net/cat.cc Normal file
View File

@ -0,0 +1,989 @@
#include <atomic>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <random>
#include <vector>
#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <unistd.h>
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"
// File-level tuning constants.
// NOTE(review): their uses lie outside this excerpt; the names suggest the rx/tx burst
// length, the slave-count limit and the slave wait timeout in milliseconds - confirm.
constexpr static unsigned int BURST_SIZE = 32;
constexpr static unsigned int MAX_SLAVES = 32;
constexpr static unsigned int SLAVES_MAX_WAIT_MS = 1000;
// One measurement record, identified by epoch.
// rx_add_timestamp() fills clt_hw_rx / clt_sw_rx for the record whose epoch
// matches a received probe response.
struct datapt {
uint32_t epoch;
uint32_t valid;
// Client-side and server-side timestamps, each as a hardware/software pair for tx and rx.
// NOTE(review): the rx path stores nanosecond values; presumably all fields use ns - confirm.
uint64_t clt_hw_tx;
uint64_t clt_sw_tx;
uint64_t clt_hw_rx;
uint64_t clt_sw_rx;
uint64_t srv_hw_tx;
uint64_t srv_sw_tx;
uint64_t srv_hw_rx;
uint64_t srv_sw_rx;
};
// Values of options_t::s_state, the probe state machine.
constexpr static uint32_t STATE_WAIT = 0; // waiting for sending
constexpr static uint32_t STATE_SENT = 1; // we sent a packet
constexpr static uint32_t STATE_COMPLETE = 2; // we received everything
constexpr static uint32_t STATE_PKTLOSS = 3; // last packet sent was lost
// Process-wide configuration ("parameters") and runtime state ("states",
// members prefixed s_), with in-class default values.
struct options_t {
// parameters
// NOTE(review): presumably seconds - confirm against the code that consumes them.
unsigned int run_time { 5 };
unsigned int warmup_time { 3 };
char output[256] = "output.txt";
char ia_gen_str[256] = "fixed";
unsigned int target_qps { 0 };
unsigned int master_mode { 0 };
struct net_spec server_spec { };
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
std::vector<struct net_spec *> slaves;
uint32_t pkt_loss_failure_threshold { 0 };
uint32_t pkt_loss_time_ms { UINT32_MAX };
int portid { 0 };
// states
struct net_spec s_host_spec { };
// Connection from this host (s_host_spec) to the server (server_spec) on POU_PORT.
struct conn_spec s_host_conn {
.src = &s_host_spec, .dst = &server_spec, .dst_port = POU_PORT
};
unsigned int s_rxqid { 0 };
unsigned int s_txqid { 0 };
unsigned int s_socketid { 0 };
// for qps calculation
std::atomic<uint32_t> s_recved_pkts { 0 };
std::atomic<uint32_t> s_pkt_loss { 0 };
std::atomic<uint64_t> s_start_time { 0 };
std::atomic<uint64_t> s_end_time { 0 };
std::atomic<uint32_t> s_slave_qps { 0 };
std::atomic<uint32_t> s_slave_recved { 0 };
std::atomic<uint32_t> s_slave_loss { 0 };
// One of the STATE_* constants; rx_add_timestamp() only tags packets in STATE_SENT.
uint32_t s_state { STATE_WAIT };
// When true, rx_add_timestamp() reads the NIC hardware rx timestamp.
bool s_hwtimestamp { true };
Generator *s_iagen { nullptr };
std::vector<struct datapt *> s_data;
// Record for the probe currently in flight; nullptr when there is none.
struct datapt *s_last_datapt { nullptr };
uint32_t s_epoch { 0 };
std::atomic<bool> s_stop { false };
std::atomic<uint32_t> s_record { 0 };
};
static struct options_t options;
// RX callback: records the receive timestamps of the probe response that
// matches the outstanding probe (options.s_last_datapt).
//
// Only active while options.s_state == STATE_SENT. For a PKT_TYPE_PROBE_RESP
// packet whose epoch equals the outstanding record's epoch, clt_sw_rx is set
// to the software time taken at callback entry and clt_hw_rx to the NIC
// timestamp (or 0 when hardware timestamping is disabled). If hardware
// timestamping is enabled but no timestamp can be read, the process exits.
//
// Returns nb_pkts unchanged (no packet is dropped here).
static uint16_t
rx_add_timestamp(uint16_t port, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
    void *_ __rte_unused)
{
    uint64_t now = topo_uptime_ns();
    struct pkt_hdr *pkt_data;
    struct timespec ts { };
    int ret;

    if (options.s_state != STATE_SENT) {
        return nb_pkts;
    }

    for (int i = 0; i < nb_pkts; i++) {
        pkt_data = check_valid_packet(pkts[i],
            &options.s_host_spec.mac_addr);

        if (pkt_data == nullptr) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
                "rx_add_timestamp: ignoring invalid packet 0x%p.\n",
                (void *)pkts[i]);
            continue;
        }

        if (rte_be_to_cpu_16(pkt_data->type) != PKT_TYPE_PROBE_RESP) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
                "rx_add_timestamp: packet %p not tagged - type %d.\n",
                (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
            continue;
        }

        uint32_t epoch = rte_be_to_cpu_32(
            ((struct pkt_payload_epoch *)pkt_data->payload)
                ->epoch);

        // The mismatch warning below reads s_last_datapt->epoch, so the
        // "no outstanding probe" case must be handled before it. Previously
        // a nullptr here was dereferenced while formatting the warning.
        if (options.s_last_datapt == nullptr) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
                "rx_add_timestamp: packet %p epoch %d but no outstanding probe.\n",
                (void *)pkts[i], epoch);
            continue;
        }

        if (options.s_last_datapt->epoch != epoch) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
                "rx_add_timestamp: packet %p epoch %d != last epoch %d.\n",
                (void *)pkts[i], epoch,
                options.s_last_datapt->epoch);
            continue;
        }

        if (!options.s_hwtimestamp) {
            options.s_last_datapt->clt_sw_rx = now;
            options.s_last_datapt->clt_hw_rx = 0;
            ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
                "rx_add_timestamp: tagged packet %p with sw: %lu hw: (disabled).\n",
                (void *)pkts[i], now);
            continue;
        }

        ret = rte_eth_timesync_read_rx_timestamp(
            port, &ts, pkts[i]->timesync & 0x3);
        if (ret != 0) {
            rte_exit(EXIT_FAILURE,
                "rx_add_timestamp: packet %p not tagged - hw ts not "
                "available - %d.\n",
                (void *)pkts[i], ret);
        }

        // has hw rx timestamp
        options.s_last_datapt->clt_hw_rx =
            ts.tv_sec * S2NS + ts.tv_nsec;
        options.s_last_datapt->clt_sw_rx = now;
        ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
            "rx_add_timestamp: tagged packet %p with sw: %lu hw: %lu.\n",
            (void *)pkts[i], now,
            options.s_last_datapt->clt_hw_rx);
    }

    return nb_pkts;
}
/*
 * TX callback: stamps the software transmit time of an outgoing PROBE
 * packet into the data point of the probe currently in flight.
 *
 * A PROBE whose epoch does not match options.s_last_datapt (or that is sent
 * while no data point exists) indicates an internal inconsistency and
 * terminates the process. All other packets pass through untouched.
 *
 * Always returns nb_pkts.
 */
static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	uint64_t now = topo_uptime_ns();
	struct pkt_hdr *pkt_data;

	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i],
		    &options.s_host_spec.mac_addr);

		if (pkt_data == nullptr) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: ignoring invalid packet 0x%p.\n",
			    (void *)pkts[i]);
			continue;
		}

		if (rte_be_to_cpu_16(pkt_data->type) != PKT_TYPE_PROBE) {
			// convert to host order before printing, as the RX
			// callback does
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: packet %p not tagged - type %d.\n",
			    (void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
			continue;
		}

		uint32_t epoch = rte_be_to_cpu_32(
		    ((struct pkt_payload_epoch *)pkt_data->payload)->epoch);

		// Handle the null case separately: the original evaluated
		// s_last_datapt->epoch as an rte_exit() argument even when
		// the pointer was null, crashing before the message printed.
		if (options.s_last_datapt == nullptr) {
			rte_exit(EXIT_FAILURE,
			    "tx_add_timestamp: packet epoch %d but no datapt is outstanding\n",
			    epoch);
		}

		if (epoch != options.s_last_datapt->epoch) {
			rte_exit(EXIT_FAILURE,
			    "tx_add_timestamp: packet epoch %d != last epoch %d\n",
			    epoch, options.s_last_datapt->epoch);
		}

		options.s_last_datapt->clt_sw_tx = now;
		ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
		    "tx_add_timestamp: tagged packet %p with sw: %lu.\n",
		    (void *)pkts[i], now);
	}

	return nb_pkts;
}
// Builds one control packet of the given type for every registered slave
// and transmits all of them in a single burst on the host's tx queue.
// There is no return value: any allocation or transmit failure terminates
// the process through rte_exit().
static void
send_all_slaves(uint16_t type)
{
	struct rte_mbuf *tx_bufs[MAX_SLAVES];
	const auto nb_slaves = options.slaves.size();

	// every packet shares the same source and ports; only dst varies
	struct conn_spec cspec;
	cspec.src = &options.s_host_spec;
	cspec.dst_port = DEFAULT_RAT_PORT;
	cspec.src_port = DEFAULT_RAT_PORT;

	for (unsigned int idx = 0; idx < nb_slaves; idx++) {
		struct pkt_hdr *hdr;

		cspec.dst = options.slaves.at(idx);
		const int rc = alloc_pkt_hdr(mempool_get(options.s_socketid),
		    type, &cspec, 0, &tx_bufs[idx], &hdr);
		if (rc != 0) {
			rte_exit(EXIT_FAILURE, "failed to alloc packet\n");
		}
	}

	const auto nb_sent = rte_eth_tx_burst(options.portid, options.s_txqid,
	    tx_bufs, nb_slaves);
	if (nb_sent != nb_slaves) {
		rte_exit(EXIT_FAILURE, "failed to send some packets\n");
	}
}
// sizeof mbuf must >= MAX_SLAVES
// this function fills up to #slave
static void
wait_for_slaves(uint16_t etype, struct rte_mbuf **out)
{
struct rte_mbuf *tx_bufs[MAX_SLAVES];
bool stop = false;
const uint64_t start = topo_uptime_ns();
std::vector<struct rte_ether_addr *> recved;
uint32_t tot = 0;
while (!stop) {
uint64_t now = topo_uptime_ns();
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
options.s_rxqid, tx_bufs, MAX_SLAVES);
if (nb_rx > 0) {
for (unsigned int i = 0; i < nb_rx; i++) {
struct pkt_hdr *each = check_valid_packet(
tx_bufs[i], &options.s_host_spec.mac_addr);
uint16_t type;
if (each == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"wait_for_slaves: ignoring invalid packet %p.\n",
(void *)tx_bufs[i]);
goto end_loop;
}
type = rte_be_to_cpu_16(each->type);
if (type == etype) {
bool invalid = true;
// check if it is from one of our
// clients
for (auto eaddr : options.slaves) {
if (rte_is_same_ether_addr(
&eaddr->mac_addr,
&each->eth_hdr
.src_addr)) {
invalid = false;
break;
}
}
if (invalid) {
// received invalid packet from
// unregistered slave
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"wait_for_slaves: invalid packet %p from unregistered slave\n.",
tx_bufs[i]);
goto end_loop;
}
invalid = false;
// check if we have already received the
// same packet from the mac addr
for (auto eaddr : recved) {
if (rte_is_same_ether_addr(
eaddr,
&each->eth_hdr
.src_addr)) {
invalid = true;
break;
}
}
if (invalid) {
// received invalid packet from
// the same slave
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"wait_for_slaves: invalid packet %p - duplicated\n.",
tx_bufs[i]);
goto end_loop;
}
recved.push_back(
&each->eth_hdr.src_addr);
if (recved.size() ==
options.slaves.size()) {
stop = true;
}
if (out != nullptr) {
out[tot] = tx_bufs[i];
tot++;
// don't free this packet
continue;
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"wait_for_slaves: ignoring invalid packet %p type %d.\n",
(void *)tx_bufs[i], type);
}
end_loop:
rte_pktmbuf_free(tx_bufs[i]);
}
}
// struct rte_eth_stats stats;
// if (rte_eth_stats_get(options.portid, &stats) != 0 ) {
// rte_exit(EXIT_FAILURE, "failed!");
// }
//printf("wait_slaves <AFTER>: ipackets %lu, opackets %lu, ierrors %lu, oerrors %lu\n", stats.ipackets, stats.opackets, stats.ierrors, stats.oerrors);
if (now - start > SLAVES_MAX_WAIT_MS * MS2NS) {
rte_exit(EXIT_FAILURE,
"cat: waiting for too long %d. I QUIT!!", etype);
}
}
}
// Main probe loop of the worker lcore; runs until options.s_stop is set.
//
// Each iteration:
//   1. drains the rx queue; packets are only inspected while a probe is
//      outstanding (STATE_SENT) and are always freed afterwards;
//   2. while in STATE_SENT, tries to obtain the hardware tx timestamp, then
//      either completes the sample (tx timestamp, PROBE_RESP and STAT all
//      seen) or declares the probe lost after pkt_loss_time_ms;
//   3. in any other state, stores a completed sample and, once the next
//      send time has been reached, allocates and transmits a new PROBE.
//
// The process is terminated via rte_exit() on allocation/transmit failure
// or when more than pkt_loss_failure_threshold probes are lost in a row.
static void
pkt_loop()
{
struct rte_mbuf *tx_buf;
struct rte_mbuf *rx_bufs[BURST_SIZE];
struct pkt_hdr *pkt_data;
rdport_generator port_gen(MIN_RANDOM_PORT);
// completion flags for the probe in flight; all start true because no
// probe is outstanding yet
bool read_tx = true;
bool recv_stat = true;
bool recv_resp = true;
if (rte_eth_dev_socket_id(options.portid) > 0 &&
rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n",
options.portid);
}
// next_ts: earliest time the next probe may be sent
// last_send_ts: time the current probe was sent (for loss detection)
uint64_t next_ts = topo_uptime_ns();
uint64_t last_send_ts = next_ts;
// consecutive-loss tracking; is_last_pkt_lost is only cleared when a
// matching STAT packet arrives
bool is_last_pkt_lost = false;
uint32_t num_cts_pkt_lost = 0;
while (!options.s_stop.load()) {
uint64_t now = topo_uptime_ns();
// always pop incoming packets
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
options.s_rxqid, rx_bufs, BURST_SIZE);
if (nb_rx > 0) {
for (int i = 0; i < nb_rx; i++) {
if (options.s_state != STATE_SENT) {
// only need to process packets after we
// sent one
rte_pktmbuf_free(rx_bufs[i]);
continue;
}
struct pkt_hdr *each = check_valid_packet(
rx_bufs[i], &options.s_host_spec.mac_addr);
if (each == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: ignoring invalid packet %p.\n",
(void *)rx_bufs[i]);
rte_pktmbuf_free(rx_bufs[i]);
continue;
}
uint16_t type = rte_be_to_cpu_16(each->type);
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
"locore_main: received packet %p ", each);
struct pkt_payload_epoch *pld_epoch;
struct pkt_payload_stat *pld_stat;
uint32_t epoch;
switch (type) {
case PKT_TYPE_PROBE_RESP:
pld_epoch = (struct pkt_payload_epoch *)
each->payload;
epoch = rte_be_to_cpu_32(
pld_epoch->epoch);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "lcore_main: PROBE_RESP received packet %p epoch %d\n", each, epoch);
// NOTE(review): the condition tests for a null
// s_last_datapt, yet the log argument below
// dereferences it; the same pattern is repeated
// in the STAT case.
if (options.s_last_datapt == nullptr ||
epoch !=
options.s_last_datapt->epoch) {
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"locore_main: packet %p epoch %d doesn't match datapt %d.\n",
(void *)rx_bufs[i], epoch,
options.s_last_datapt
->epoch);
break;
}
recv_resp = true;
break;
case PKT_TYPE_STAT:
pld_stat = (struct pkt_payload_stat *)
each->payload;
epoch = rte_be_to_cpu_32(
pld_stat->epoch);
if (options.s_last_datapt == nullptr ||
epoch !=
options.s_last_datapt->epoch) {
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"locore_main: packet %p epoch %d doesn't match datapt %d.\n",
(void *)rx_bufs[i], epoch,
options.s_last_datapt
->epoch);
break;
}
// copy the server-side timestamps into the sample
options.s_last_datapt->srv_hw_tx =
rte_be_to_cpu_64(pld_stat->hw_tx);
options.s_last_datapt->srv_hw_rx =
rte_be_to_cpu_64(pld_stat->hw_rx);
options.s_last_datapt->srv_sw_tx =
rte_be_to_cpu_64(pld_stat->sw_tx);
options.s_last_datapt->srv_sw_rx =
rte_be_to_cpu_64(pld_stat->sw_rx);
recv_stat = true;
is_last_pkt_lost = false;
break;
default:
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: ignoring packet %p with unknown type %d.\n",
(void *)rx_bufs[i], type);
}
rte_pktmbuf_free(rx_bufs[i]);
}
}
if (options.s_state == STATE_SENT) {
// check if hw tx ts is read
if (!read_tx) {
int ret;
struct timespec ts;
if (options.s_hwtimestamp) {
// a non-zero return leaves read_tx false, so the read is
// retried on the next iteration
if ((ret = rte_eth_timesync_read_tx_timestamp(
options.portid, &ts)) == 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: read hw tx timestamp %lu.\n",
(ts.tv_nsec + ts.tv_sec * S2NS));
options.s_last_datapt->clt_hw_tx =
ts.tv_nsec + ts.tv_sec * S2NS;
read_tx = true;
}
} else {
options.s_last_datapt->clt_hw_tx = 0;
read_tx = true;
}
}
if (read_tx && recv_resp && recv_stat) {
options.s_state = STATE_COMPLETE;
} else {
// check packet loss
if (now - last_send_ts >
options.pkt_loss_time_ms * MS2NS) {
if (is_last_pkt_lost) {
num_cts_pkt_lost++;
} else {
is_last_pkt_lost = true;
num_cts_pkt_lost = 1;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: packet loss: waiting too long for epoch %d. %d in a row.\n",
options.s_last_datapt->epoch,
num_cts_pkt_lost);
// the lost sample is discarded, not recorded
delete options.s_last_datapt;
options.s_last_datapt = nullptr;
options.s_state = STATE_PKTLOSS;
options.s_pkt_loss.fetch_add(1);
if (num_cts_pkt_lost >
options
.pkt_loss_failure_threshold) {
rte_exit(EXIT_FAILURE,
"too many continuous packet loss detected\n");
}
}
}
}
if (options.s_state == STATE_COMPLETE ||
options.s_state == STATE_PKTLOSS ||
options.s_state == STATE_WAIT) {
if (options.s_state == STATE_COMPLETE) {
// ownership of the sample moves to s_data
options.s_data.push_back(options.s_last_datapt);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: datapt for epoch %d dump:\n"
" Valid: %d\n"
" client TX HW: %lu\n"
" client TX SW: %lu\n"
" client RX HW: %lu\n"
" client RX SW: %lu\n"
" server TX HW: %lu\n"
" server TX SW: %lu\n"
" server RX HW: %lu\n"
" server RX SW: %lu\n\n",
options.s_last_datapt->epoch,
options.s_last_datapt->valid,
options.s_last_datapt->clt_hw_tx,
options.s_last_datapt->clt_sw_tx,
options.s_last_datapt->clt_hw_rx,
options.s_last_datapt->clt_sw_rx,
options.s_last_datapt->srv_hw_tx,
options.s_last_datapt->srv_sw_tx,
options.s_last_datapt->srv_hw_rx,
options.s_last_datapt->srv_sw_rx);
options.s_recved_pkts.fetch_add(1);
options.s_last_datapt = nullptr;
}
options.s_state = STATE_WAIT;
if (now >= next_ts) {
struct pkt_payload_epoch *pld_epoch;
uint32_t epoch;
// schedule the following probe; generate() is scaled by
// S2NS, so it is presumably in seconds - TODO confirm
next_ts += (int)(options.s_iagen->generate() *
S2NS);
options.s_host_conn.src_port = port_gen.next();
if (alloc_pkt_hdr(mempool_get(options.s_socketid),
PKT_TYPE_PROBE, &options.s_host_conn, 0,
&tx_buf, &pkt_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to alloc probe packet.\n");
}
epoch = options.s_epoch;
options.s_epoch++;
pld_epoch = (struct pkt_payload_epoch *)
pkt_data->payload;
pld_epoch->epoch = rte_cpu_to_be_32(epoch);
// the sample must exist before the burst: tx_add_timestamp
// looks it up when the packet is transmitted
options.s_last_datapt = new struct datapt;
options.s_last_datapt->epoch = epoch;
options.s_last_datapt->valid =
options.s_record.load();
last_send_ts = now;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: sending packet 0x%p with epoch %d\n",
(void *)tx_buf, epoch);
const uint16_t nb_tx =
rte_eth_tx_burst(options.portid,
options.s_txqid, &tx_buf, 1);
if (nb_tx != 1) {
rte_exit(EXIT_FAILURE,
"failed to send packet 0x%p, epoch %d\n",
(void *)tx_buf, epoch);
}
// NOTE(review): mbufs accepted by rte_eth_tx_burst() are normally
// released by the driver. Unless alloc_pkt_hdr() takes an extra
// reference this free looks like a double free - confirm.
// send_all_slaves() does not free after its burst.
rte_pktmbuf_free(tx_buf);
read_tx = false;
recv_resp = false;
recv_stat = false;
options.s_state = STATE_SENT;
}
}
}
}
static int
locore_main(void *tif __rte_unused)
{
struct rte_mbuf *mbufs[MAX_SLAVES];
uint32_t core_id = rte_lcore_id();
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: core %d running...\n",
core_id);
if (options.master_mode == 1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: sending SYNC ...\n");
send_all_slaves(PKT_TYPE_SYNC);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: waiting for SYNC_ACK ...\n");
wait_for_slaves(PKT_TYPE_SYNC_ACK, nullptr);
}
options.s_start_time.store(topo_uptime_ns());
pkt_loop();
options.s_end_time.store(topo_uptime_ns());
if (options.master_mode == 1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: sending FIN ...\n");
send_all_slaves(PKT_TYPE_FIN);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: waiting for FIN_ACK ...\n");
wait_for_slaves(PKT_TYPE_FIN_ACK, mbufs);
// aggregate slave QPS
for (unsigned int i = 0; i < options.slaves.size(); i++) {
// these packets already underwent validity check in
// wait_for_slaves
auto pkt_hdr = rte_pktmbuf_mtod(mbufs[i],
struct pkt_hdr *);
auto pld_qps = (struct pkt_payload_qps *)
pkt_hdr->payload;
uint32_t qps = rte_be_to_cpu_32(pld_qps->qps);
uint32_t recved = rte_be_to_cpu_32(
pld_qps->recved_pkts);
uint32_t loss = rte_be_to_cpu_32(pld_qps->lost_pkts);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main: received qps %d from client %d\n",
qps, i);
options.s_slave_qps.fetch_add(qps);
options.s_slave_loss.fetch_add(loss);
options.s_slave_recved.fetch_add(recved);
rte_pktmbuf_free(mbufs[i]);
}
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main: exited\n");
return 0;
}
static void
dump_options()
{
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configuration:\n"
" verbosity = +%d\n"
" run time = %d\n"
" warmup time = %d\n"
" output file = %s\n"
" number of threads = %d\n"
" interarrival dist = %s\n"
" target qps = %d\n"
" host IP = 0x%x\n"
" pkt loss time = %u\n"
" pkt loss failure threshold = %u\n"
" portid = %d\n",
ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
options.warmup_time, options.output, CPU_COUNT(&options.cpu_set),
options.ia_gen_str, options.target_qps, options.s_host_spec.ip,
options.pkt_loss_time_ms, options.pkt_loss_failure_threshold,
options.portid);
for (auto slave : options.slaves) {
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
" slave = 0x%x@%x:%x:%x:%x:%x:%x\n", slave->ip,
slave->mac_addr.addr_bytes[0],
slave->mac_addr.addr_bytes[1],
slave->mac_addr.addr_bytes[2],
slave->mac_addr.addr_bytes[3],
slave->mac_addr.addr_bytes[4],
slave->mac_addr.addr_bytes[5]);
}
}
// Prints the command-line help to stdout. The list mirrors the getopt
// option string used in main(), including -p, which the original help
// text omitted although main() accepts it.
static void
usage()
{
	fprintf(stdout,
	    "Usage:\n"
	    " -v(vv): verbose mode\n"
	    " -s: server net spec\n"
	    " -S: slave(rat)'s net spec (also turns on master mode)\n"
	    " -t: run time\n"
	    " -T: warmup time\n"
	    " -h: display the information\n"
	    " -o: output filename\n"
	    " -A: affinity mask\n"
	    " -i: inter-arrival time distribution\n"
	    " -q: target qps\n"
	    " -H: host net spec\n"
	    " -L: pkt loss failure threshold\n"
	    " -l: pkt loss time threshold\n"
	    " -p: port id\n");
}
// Program entry point of the latency client.
//
// Sequence: initialise EAL and strip its arguments, parse the tool's own
// options, initialise libtopo/libnms, configure the port and mempool,
// create the inter-arrival generator, open the output file, launch
// locore_main() on the selected core, drive the warm-up / run timer, then
// compute the QPS and write the summary line plus one CSV line per valid
// sample. Any fatal condition terminates through rte_exit().
int
main(int argc, char *argv[])
{
std::ofstream log_file;
bool has_host_spec = false;
ntr_init();
// init dpdk
int ret = rte_eal_init(argc, argv);
if (ret < 0) {
rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
}
// skip the arguments consumed by the EAL
argc -= ret;
argv += ret;
// set warning level
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
{
int c;
// parse arguments
struct net_spec *ns;
while ((c = getopt(argc, argv, "vs:S:t:T:ho:A:i:q:H:L:l:p:")) !=
-1) {
switch (c) {
case 'v':
// each -v raises the log level by one
ntr_set_level(NTR_DEP_USER1,
ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 's':
if (str_to_netspec(optarg,
&options.server_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid server net spec.\n");
}
break;
case 'S':
// every -S registers one slave and switches on master mode
ns = new struct net_spec;
if (str_to_netspec(optarg, ns) != 0) {
rte_exit(EXIT_FAILURE,
"invalid client net spec\n");
}
options.slaves.push_back(ns);
options.master_mode = 1;
// the fixed-size mbuf arrays used for slaves hold MAX_SLAVES entries
if (options.slaves.size() > MAX_SLAVES) {
rte_exit(EXIT_FAILURE,
"too many rats.\n");
}
break;
case 't':
options.run_time = strtol(optarg, nullptr, 10);
break;
case 'T':
options.warmup_time = strtol(optarg, nullptr,
10);
break;
case 'h':
usage();
// no break: rte_exit() is presumably noreturn, otherwise this
// would fall through into 'o' - NOTE(review)
rte_exit(EXIT_SUCCESS, "\n");
case 'o':
// copying at most size - 1 bytes keeps the zero-initialised
// final byte as terminator
strncpy(options.output, optarg,
sizeof(options.output) - 1);
break;
case 'A':
cpulist_to_cpuset(optarg, &options.cpu_set);
break;
case 'i':
strncpy(options.ia_gen_str, optarg,
sizeof(options.ia_gen_str) - 1);
break;
case 'q':
options.target_qps = strtoul(optarg, nullptr,
10);
break;
case 'H':
has_host_spec = true;
if (str_to_netspec(optarg,
&options.s_host_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid host net spec.\n");
}
break;
case 'L':
options.pkt_loss_failure_threshold =
strtoul(optarg, nullptr, 10);
break;
case 'l':
options.pkt_loss_time_ms = strtoul(optarg,
nullptr, 10);
// 0 is mapped to UINT32_MAX, the same value as the default
if (options.pkt_loss_time_ms == 0) {
options.pkt_loss_time_ms = UINT32_MAX;
}
break;
case 'p':
options.portid = strtol(optarg, nullptr, 10);
break;
default:
usage();
rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
c);
}
}
}
if (!has_host_spec) {
rte_exit(EXIT_FAILURE, "must specify host IP\n");
}
// init libtopo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
}
// init nms
if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
rte_exit(EXIT_FAILURE, "failed to init libnms!\n");
}
if (CPU_COUNT(&options.cpu_set) != 1) {
rte_exit(EXIT_FAILURE, "must specify exactly one core\n");
}
// index of the single core in the set (CPU_FFS is 1-based)
int core_id = CPU_FFS(&options.cpu_set) - 1;
dump_options();
// configure memory and port
struct port_conf pconf;
struct device_conf dconf;
struct mem_conf mconf;
portconf_get(options.portid, &pconf);
if (!pconf.timesync) {
options.s_hwtimestamp = false;
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"main: timesync disabled. hw timestamp unavailable.\n ");
}
// NOTE(review): this branch cannot be taken - the CPU_COUNT != 1 check
// above already terminated the process for any set larger than one core.
if (CPU_COUNT(&options.cpu_set) > 1) {
int ffs = CPU_FFS(&options.cpu_set);
CPU_ZERO(&options.cpu_set);
CPU_SET(ffs - 1, &options.cpu_set);
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "cat only supports one thread, using only core %d.\n", ffs - 1);
}
dconf.mtu = MAX_STANDARD_MTU;
CPU_COPY(&options.cpu_set, &dconf.core_affinity);
dconf.portid = options.portid;
dconf.rss_hf = pconf.rss_hf;
dconf.rx_offloads = pconf.rxoffload;
dconf.tx_offloads = pconf.txoffload;
dconf.timesync = pconf.timesync;
// register the timestamping callbacks defined in this file
dconf.rx_fn = rx_add_timestamp;
dconf.rx_user = nullptr;
dconf.rx_ring_sz = 2048;
dconf.tx_fn = tx_add_timestamp;
dconf.tx_user = nullptr;
dconf.tx_ring_sz = 2048;
mconf.cache_size = 64;
mconf.priv_size = 0;
mconf.num_elements = 4096;
mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_STANDARD_MTU;
mconf.max_pools = -1;
dpdk_init(&dconf, &mconf);
// overwrites whatever MAC address was parsed from -H with the port's own
if (rte_eth_macaddr_get(options.portid,
&options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
options.portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
// create default generator
options.s_iagen = createGenerator(options.ia_gen_str);
if (options.s_iagen == nullptr) {
rte_exit(EXIT_FAILURE, "invalid generator string %s\n",
options.ia_gen_str);
}
options.s_iagen->set_lambda((double)options.target_qps);
// open log file for writing
log_file.open(options.output, std::ofstream::out);
if (!log_file) {
rte_exit(EXIT_FAILURE, "failed to open log file %s\n",
options.output);
}
sleep(INIT_DELAY);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: launching thread on core %d\n", core_id);
if (rte_eal_remote_launch(locore_main, nullptr, core_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to launch function on locore\n");
}
// XXX: poor man's timer
// ticks once per usleep(S2US): recording starts after warmup_time ticks,
// the worker is stopped after run_time + warmup_time ticks
uint32_t second = 0;
while (true) {
if (second >= options.warmup_time) {
options.s_record.store(1);
}
if (second >= options.run_time + options.warmup_time) {
options.s_stop.store(true);
break;
}
usleep(S2US);
second++;
}
if (rte_eal_wait_lcore(core_id) < 0)
rte_exit(EXIT_FAILURE, "failed to wait for job completion\n");
// calculate QPS
// completed probes divided by the worker's elapsed time in seconds
uint32_t qps = (double)options.s_recved_pkts.load() /
(((double)(options.s_end_time.load() -
options.s_start_time.load()) /
(double)S2NS));
qps += options.s_slave_qps.load();
// dump stats
// first line: qps, received, lost, slave received, slave lost
log_file << qps << ',' << options.s_recved_pkts.load() << ','
<< options.s_pkt_loss.load() << ','
<< options.s_slave_recved.load() << ','
<< options.s_slave_loss.load() << std::endl;
// one line per sample taken after warm-up; every sample is freed here
for (auto it : options.s_data) {
if (it->valid) {
log_file << it->clt_sw_rx << ',' << it->clt_sw_tx << ','
<< it->clt_hw_rx << ',' << it->clt_hw_tx << ','
<< it->srv_sw_rx << ',' << it->srv_sw_tx << ','
<< it->srv_hw_rx << ',' << it->srv_hw_tx
<< std::endl;
}
delete it;
}
log_file.close();
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"qps = %d, recved = %d, loss = %d, slave recved = %d, slave loss = %d\n",
qps, options.s_recved_pkts.load(), options.s_pkt_loss.load(),
options.s_slave_recved.load(), options.s_slave_loss.load());
// clean up
dpdk_cleanup(&dconf);
return 0;
}

701
net/khat.cc Normal file
View File

@ -0,0 +1,701 @@
#include <atomic>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>
#include <unistd.h>
#include <sys/cpuset.h>
#include <sys/endian.h>
#include <sys/sched.h>
#include <sys/types.h>
#include <topo.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include "ntr.h"
//#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"
#include "rte_byteorder.h"
// maximum number of packets fetched per rte_eth_rx_burst() call
constexpr static unsigned int BURST_SIZE = 32;
// stride, in bytes, between consecutive reads in worker_memory_load()
constexpr static unsigned int CACHELINE_SIZE = 64;
// upper bound on the bytes of a LOAD payload copied into a thread's
// load_buffer
constexpr static uint16_t THREAD_LOAD_BUFFER_SZ = 16384;
// Bookkeeping for the single probe request the server is handling.
// Written by the rx/tx callbacks and by locore_main(), and read back when
// the PKT_TYPE_STAT reply is assembled.
struct probe_state_t {
// network identity of the prober, extracted from the PROBE packet
struct net_spec dst;
// connection used for the PROBE_RESP and STAT replies; dst points at the
// member above
struct conn_spec cspec {
.dst = &dst
};
// software rx timestamp of the PROBE (set in rx_add_timestamp)
uint64_t last_sw_rx;
// software tx timestamp of the PROBE_RESP (set in tx_add_timestamp)
uint64_t last_sw_tx;
// hardware rx timestamp of the PROBE, only set when hw timestamping is on
uint64_t last_hw_rx;
// epoch copied from the PROBE payload
uint32_t epoch;
};
// Keep track of the probe state.
// When a probe packet first arrives, this state is set to be in flux and the
// rte_mbuf's userdata is set to PROBE_MAGIC, which prevents other probe
// packets from being processed. When the server sends the probe stats back to
// the user, the in-flux state is released. This guarantees that the server
// only processes one probe packet at a time.
// XXX: this could also be attached to the mbuf itself and processed by the
// lcore thread.
// I kept this global because globally there can be only one pending probe
// request, and rx_add_timestamp can save its data here too.
// Per-worker context handed to locore_main() as its argument.
struct thread_info {
// index of this worker; worker_memory_load() uses it to index
// options.s_thr_info
int tid;
// rx / tx queue ids this worker polls and sends on
int rxqid;
int txqid;
int lcore_id;
// passed to mempool_get() to select the pool for reply packets
int node_id;
// buffer read by worker_memory_load() in CACHELINE_SIZE strides
void *cache_lines;
// scratch buffer receiving LOAD payload copies and memory-load results
void *load_buffer;
};
// Process-wide configuration and shared run-time state of the server.
struct options_t {
// config
int num_threads { 1 };
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 2nd core
bool jumbo_frame_enabled {
false
}; // setting this to true changes mbuf size and mtu
int port_mtu { MAX_STANDARD_MTU };
// Number of CACHELINE_SIZE-byte slots per thread that
// worker_memory_load() cycles through: 1600 * 64 B = 100 KiB.
// NOTE(review): the previous comment said "100MB data per thread", which
// does not match these numbers - confirm the intended size.
int thread_cacheline_cnt = { 1600 };
uint16_t portid { 0 };
// states
struct net_spec s_host_spec { };
// one entry per worker thread, indexed by thread_info::tid
std::vector<struct thread_info *> s_thr_info;
// offset of the per-mbuf bool dynamic field used by
// mbuf_is_probe_valid() / mbuf_set_probe_valid()
int probe_state_offset { 0 };
bool s_hwtimestamp { true };
struct probe_state_t s_probe_info;
// true while a probe is being handled; claimed with a compare-exchange in
// rx_add_timestamp and released after the STAT reply is sent
std::atomic<bool> is_probing { false };
};
// The single global instance used throughout this file.
struct options_t options;
// Reads the per-mbuf "probe valid" flag kept in the dynamic field located
// at options.probe_state_offset.
static bool
mbuf_is_probe_valid(struct rte_mbuf *pkt)
{
	bool *flag = RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *);

	return *flag;
}
// Writes the per-mbuf "probe valid" flag kept in the dynamic field located
// at options.probe_state_offset.
static void
mbuf_set_probe_valid(struct rte_mbuf *pkt, bool b)
{
	bool *flag = RTE_MBUF_DYNFIELD(pkt, options.probe_state_offset, bool *);

	*flag = b;
}
static uint16_t
rx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
struct rte_mbuf **pkts, uint16_t nb_pkts, uint16_t max_pkts __rte_unused,
void *_ __rte_unused)
{
int rc = 0;
uint64_t now = topo_uptime_ns();
struct timespec ts { };
struct pkt_hdr *pkt_data;
for (int i = 0; i < nb_pkts; i++) {
pkt_data = check_valid_packet(pkts[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: ignoring invalid packet %p.\n",
(void *)pkts[i]);
continue;
}
if (rte_be_to_cpu_16(pkt_data->type) == PKT_TYPE_PROBE) {
bool cmp = false;
mbuf_set_probe_valid(pkts[i], false);
if (options.is_probing.compare_exchange_strong(cmp,
true)) {
options.s_probe_info.last_sw_rx = now;
if (options.s_hwtimestamp) {
if ((rc = rte_eth_timesync_read_rx_timestamp(
port, &ts,
pkts[i]->timesync & 0x3)) ==
0) {
options.s_probe_info
.last_hw_rx = ts.tv_nsec +
ts.tv_sec * S2NS;
ntr(NTR_DEP_USER1,
NTR_LEVEL_DEBUG,
"rx_add_timestamp: tagged packet %p with sw rx: %lu hw rx:%lu.\n",
(void *)pkts[i],
options.s_probe_info
.last_sw_rx,
options.s_probe_info
.last_hw_rx);
mbuf_set_probe_valid(pkts[i],
true);
} else {
options.is_probing.store(false);
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"rx_add_timestamp: packet %p not tagged - failed to read hw rx timestamp: %d.\n",
(void *)pkts[i], rc);
}
} else {
mbuf_set_probe_valid(pkts[i], true);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: tagged packet %p with sw rx only: %lu.\n",
(void *)pkts[i], now);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: packet %p not tagged - server is probing.\n",
(void *)pkts[i]);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"rx_add_timestamp: packet %p not tagged - not PROBE packet: type %d.\n",
(void *)pkts[i], rte_be_to_cpu_16(pkt_data->type));
}
}
return nb_pkts;
}
/*
 * TX callback of the server: records the software transmit timestamp of
 * the outgoing PROBE_RESP in options.s_probe_info.
 *
 * A PROBE_RESP may only be sent while a probe is being handled and must be
 * marked valid; both are asserted. All other packets pass through.
 *
 * Always returns nb_pkts.
 */
static uint16_t
tx_add_timestamp(uint16_t port __rte_unused, uint16_t qidx __rte_unused,
    struct rte_mbuf **pkts, uint16_t nb_pkts, void *_ __rte_unused)
{
	uint64_t now = topo_uptime_ns();
	struct pkt_hdr *pkt_data;

	for (int i = 0; i < nb_pkts; i++) {
		pkt_data = check_valid_packet(pkts[i],
		    &options.s_host_spec.mac_addr);

		if (pkt_data == nullptr) {
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: ignoring invalid packet %p.\n",
			    (void *)pkts[i]);
			continue;
		}

		const uint16_t type = rte_be_to_cpu_16(pkt_data->type);

		if (type == PKT_TYPE_PROBE_RESP) {
			// this packet is the response to PROBE packets

			// at this time the packet is not sent to the NIC yet so
			// the state must be waiting stats
			assert(options.is_probing.load() &&
			    mbuf_is_probe_valid(pkts[i]));

			options.s_probe_info.last_sw_tx = now;

			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: tagged packet %p with sw tx %lu\n",
			    (void *)pkts[i], options.s_probe_info.last_sw_tx);
		} else {
			// log the host-order type; the original printed the
			// raw big-endian field, unlike the RX callback
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "tx_add_timestamp: packet %p not tagged - type %d\n",
			    (void *)pkts[i], type);
		}
	}

	return nb_pkts;
}
// Busy-waits on topo_uptime_ns() until at least `us` microseconds have
// elapsed, simulating CPU-bound work for a LOAD request.
static void
worker_cpu_load(unsigned long us)
{
	const uint64_t begin = topo_uptime_ns();

	for (;;) {
		const uint64_t elapsed = topo_uptime_ns() - begin;
		if (elapsed >= us * 1000) {
			return;
		}
	}
}
/*
 * Simulates memory-bound work for a LOAD request by reading `load`
 * consecutive cache-line slots and storing each value into the calling
 * thread's load_buffer.
 *
 * tid:   index of the calling worker in options.s_thr_info.
 * which: global slot index; reduced modulo the total number of slots
 *        (thread_cacheline_cnt * number of threads) to select the target
 *        thread's buffer and the starting slot inside it.
 * load:  number of slots to read; reads wrap around inside the target
 *        thread's buffer.
 *
 * The accesses go through volatile pointers so that every read and write
 * is actually performed. In the original the stores all hit the same
 * location, so an optimiser was free to drop all but the last iteration,
 * which would defeat the purpose of generating memory traffic.
 */
static void
worker_memory_load(int tid, uint32_t which, uint32_t load)
{
	uint32_t start_cacheline = which %
	    (options.thread_cacheline_cnt * options.s_thr_info.size());
	uint32_t thrd = start_cacheline / options.thread_cacheline_cnt;
	uint32_t start = start_cacheline % options.thread_cacheline_cnt;
	struct thread_info *cur = options.s_thr_info.at(tid);
	struct thread_info *tgt = options.s_thr_info.at(thrd);

	volatile uint32_t *sink = (volatile uint32_t *)cur->load_buffer;
	const char *base = (const char *)tgt->cache_lines;

	for (uint32_t i = 0; i < load; i++) {
		const volatile uint32_t *src = (const volatile uint32_t *)(base +
		    ((start + i) % options.thread_cacheline_cnt) *
			CACHELINE_SIZE);
		*sink = *src;
	}
}
// Worker loop of the server; never returns.
//
// ti points at this worker's struct thread_info. Each iteration polls the
// worker's rx queue and handles:
//   - PKT_TYPE_PROBE: if rx_add_timestamp marked the mbuf valid, remembers
//     the prober and epoch and queues a PROBE_RESP echoing the epoch;
//   - PKT_TYPE_LOAD: performs the requested CPU or memory load and queues a
//     LOAD_RESP echoing the request payload.
// Queued replies are sent in one burst. While a probe is pending, the loop
// additionally tries to obtain the hardware tx timestamp of the PROBE_RESP
// and, once available (or immediately without hw timestamping), sends the
// PKT_TYPE_STAT packet and releases options.is_probing.
static int
locore_main(void *ti)
{
auto tinfo = (struct thread_info *)ti;
struct rte_mbuf *bufs[BURST_SIZE];
// At most one reply is queued per received packet, so BURST_SIZE entries
// suffice; the PKT_TYPE_STAT packet is sent separately and never placed
// in this array.
struct rte_mbuf *tx_bufs[BURST_SIZE];
struct pkt_hdr *pkt_data;
// true from queuing a PROBE_RESP until the matching STAT has been sent
bool pending_probe = false;
if (rte_eth_dev_socket_id(options.portid) > 0 &&
rte_eth_dev_socket_id(options.portid) != (int)rte_socket_id()) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n",
tinfo->tid, options.portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"locore_main <thread %d>: running on locore %d with txqid %d and rxqid %d.\n",
tinfo->tid, rte_lcore_id(), tinfo->txqid, tinfo->rxqid);
while (true) {
uint16_t nb_tx = 0;
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
tinfo->rxqid, bufs, BURST_SIZE);
struct rte_mbuf *pkt_buf;
struct pkt_hdr *tx_data;
for (int i = 0; i < nb_rx; i++) {
// XXX: optimization: in rx_add_timestamp every packet
// is already validated once can just mark valid packet
// with a value so we can avoid this redundant check
pkt_data = check_valid_packet(bufs[i],
&options.s_host_spec.mac_addr);
if (pkt_data == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: skipping invalid packet %p.\n",
tinfo->tid, (void *)bufs[i]);
// dump_pkt(bufs[i]);
rte_pktmbuf_free(bufs[i]);
continue;
}
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, pkt_data,
"locore_main <thread %d>: received packet ", tinfo->tid);
switch (rte_be_to_cpu_16(pkt_data->type)) {
case PKT_TYPE_PROBE: {
// only the probe admitted by rx_add_timestamp carries the valid flag;
// all other probes are silently dropped
if (mbuf_is_probe_valid(bufs[i])) {
// send back probe_resp pkt to probe for
// return latency
pending_probe = true;
// book keep probe results
options.s_probe_info.epoch =
rte_be_to_cpu_32(
((struct pkt_payload_epoch *)
pkt_data->payload)
->epoch);
pkt_hdr_to_netspec(pkt_data,
&options.s_probe_info.dst,
&options.s_probe_info.cspec
.dst_port,
nullptr,
&options.s_probe_info.cspec
.src_port);
options.s_probe_info.cspec.src =
&options.s_host_spec;
if (alloc_pkt_hdr(mempool_get(
tinfo->node_id),
PKT_TYPE_PROBE_RESP,
&options.s_probe_info.cspec, 0,
&pkt_buf, &tx_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to allocate pkt\n");
}
// echo the epoch payload back to the prober
rte_memcpy(tx_data->payload,
pkt_data->payload,
sizeof(struct pkt_payload_epoch));
// tx_add_timestamp asserts this flag on PROBE_RESP packets
mbuf_set_probe_valid(pkt_buf, true);
// queue for burst send
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
"locore_main <thread %d>: sending packet ", tinfo->tid);
tx_bufs[nb_tx++] = pkt_buf;
}
break;
}
case PKT_TYPE_LOAD: {
struct conn_spec cspec;
struct net_spec src;
struct net_spec dst;
// touch the unused data to pretend that we read
// those dummy fields
// NOTE(review): assumes data_len >= sizeof(struct pkt_hdr), presumably
// guaranteed by check_valid_packet - confirm, the subtraction is unsigned
memcpy(tinfo->load_buffer, pkt_data->payload,
MIN(bufs[i]->data_len -
sizeof(struct pkt_hdr),
THREAD_LOAD_BUFFER_SZ));
// perform the load
auto pld = (struct pkt_payload_load *)
pkt_data->payload;
uint32_t load_type = rte_be_to_cpu_32(pld->type);
uint32_t load_arg0 = rte_be_to_cpu_32(pld->arg0);
uint32_t load_arg1 = rte_be_to_cpu_32(pld->arg1);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: LOAD type %d, arg0 %d, arg1 %d\n",
tinfo->tid, load_type, load_arg0, load_arg1);
// CPU load: arg0 is the busy-wait time in microseconds
// MEM load: arg0 is the starting slot, arg1 the number of slots
if (load_type == LOAD_TYPE_CPU) {
worker_cpu_load(load_arg0);
} else if (load_type == LOAD_TYPE_MEM) {
worker_memory_load(tinfo->tid, load_arg0, load_arg1);
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"locore_main <thread %d>: unknown LOAD type %d, ignoring...", tinfo->tid, load_type);
// no reply is sent for unknown load types
break;
}
// reply
// swap source and destination of the request for the response
pkt_hdr_to_netspec(pkt_data, &src,
&cspec.dst_port, &dst, &cspec.src_port);
cspec.dst = &src;
cspec.src = &dst;
// printf("LOAD PKT SIZE: %d\n",
// bufs[i]->data_len); we reply to load packet
// regardless of the server state
if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
PKT_TYPE_LOAD_RESP, &cspec, 0, &pkt_buf,
&tx_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to allocate pkt\n");
}
rte_memcpy(tx_data->payload, pkt_data->payload,
sizeof(struct pkt_payload_load));
// queue for burst send
NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, tx_data,
"locore_main <thread %d>: sending packet ", tinfo->tid);
tx_bufs[nb_tx++] = pkt_buf;
break;
}
default:
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: ignoring packet %p with unknown type %d.\n",
tinfo->tid, (void *)bufs[i],
rte_be_to_cpu_16(pkt_data->type));
break;
}
rte_pktmbuf_free(bufs[i]);
}
// send all packets
tx_burst_all(options.portid, tinfo->txqid, tx_bufs, nb_tx);
// we wanna check every loop not only when there are packets
if (pending_probe) {
assert(options.is_probing.load());
struct timespec ts { };
struct pkt_payload_stat *stat;
// stays 0 when hw timestamping is disabled, so the STAT packet is sent
// right away; a non-zero value means "retry on the next iteration"
int status = 0;
if (options.s_hwtimestamp) {
if ((status = rte_eth_timesync_read_tx_timestamp(
options.portid, &ts)) == 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: obtained hw tx timestamp %lu.\n",
tinfo->tid,
(ts.tv_sec * S2NS + ts.tv_nsec));
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"locore_main <thread %d>: failed to obtain hw tx timestamp: %d.\n",
tinfo->tid, status);
}
}
if (status == 0) {
// now we have everything we need
if (alloc_pkt_hdr(mempool_get(tinfo->node_id),
PKT_TYPE_STAT, &options.s_probe_info.cspec, 0,
&pkt_buf, &tx_data) != 0) {
rte_exit(EXIT_FAILURE,
"failed to alloc pkt_buf\n");
}
// populate stats
stat = (struct pkt_payload_stat *)tx_data->payload;
stat->epoch = rte_cpu_to_be_32(
options.s_probe_info.epoch);
if (options.s_hwtimestamp) {
stat->hw_rx = rte_cpu_to_be_64(
options.s_probe_info.last_hw_rx);
stat->hw_tx = rte_cpu_to_be_64(
ts.tv_nsec + ts.tv_sec * S2NS);
} else {
stat->hw_rx = 0;
stat->hw_tx = 0;
}
stat->sw_rx = rte_cpu_to_be_64(
options.s_probe_info.last_sw_rx);
stat->sw_tx = rte_cpu_to_be_64(
options.s_probe_info.last_sw_tx);
// send the packet
tx_burst_all(options.portid, tinfo->txqid, &pkt_buf, 1);
// release flux
pending_probe = false;
options.is_probing.store(false);
}
}
}
}
// Print the khat command line help to stdout and flush it.
//
// Only the flags accepted by the getopt() string in main() ("hvA:H:Jp:")
// are listed. The memory load generator flags (-m, -b, -x, -X, -S) that used
// to be advertised here are not parsed by main() and would be rejected as
// unknown arguments, so they are no longer shown.
static void
usage()
{
	fprintf(stdout,
	    "Usage:\n"
	    " -v(vv): verbose mode\n"
	    " -h: seek help\n"
	    " -A: cpu list for worker threads\n"
	    " -H: host spec\n"
	    " -J: enable jumbo frames\n"
	    " -p: port id\n");
	fflush(stdout);
}
// Log the effective khat configuration at INFO level.
static void
dump_options()
{
	// verbosity is reported as an offset from the default WARNING level
	const auto verbosity = ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING;

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "main: khat configuration:\n"
	    " verbosity: +%d\n"
	    " thread count: %d\n"
	    " ip: 0x%x\n"
	    " jumbo frame: %d\n"
	    " port id: %d\n",
	    verbosity,
	    options.num_threads,
	    options.s_host_spec.ip,
	    options.jumbo_frame_enabled,
	    options.portid);
}
// khat entry point.
//
// Initializes the DPDK EAL, parses the khat specific options that follow the
// EAL arguments, initializes libtopo / libnms, configures the port and the
// per-node mempools through dpdk_init(), builds one thread_info per CPU in
// the -A cpu list and launches locore_main() on each of them. The main
// thread then sleeps forever; the cleanup code at the bottom is unreachable
// and kept only as a reference for an orderly shutdown.
//
// Every fatal condition terminates the process through rte_exit().
int
main(int argc, char *argv[])
{
	bool has_host_spec { false };
	struct mem_conf mconf;
	struct device_conf dconf;

	ntr_init();

	// init dpdk
	int ret = rte_eal_init(argc, argv);
	if (ret < 0) {
		rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
	}

	// skip the arguments consumed by the EAL
	argc -= ret;
	argv += ret;

	// set warning level
	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
	{
		int c;
		// parse arguments
		while ((c = getopt(argc, argv, "hvA:H:Jp:")) != -1) {
			switch (c) {
			case 'v':
				// each -v raises the verbosity by one level
				ntr_set_level(NTR_DEP_USER1,
				    ntr_get_level(NTR_DEP_USER1) + 1);
				break;
			case 'h':
				usage();
				rte_exit(EXIT_SUCCESS, "\n");
			case 'A':
				cpulist_to_cpuset(optarg, &options.cpu_set);
				options.num_threads = CPU_COUNT(
				    &options.cpu_set);
				if (options.num_threads == 0) {
					rte_exit(EXIT_FAILURE,
					    "must run at least one thread\n");
				}
				break;
			case 'H':
				if (str_to_netspec(optarg,
					&options.s_host_spec) != 0) {
					rte_exit(EXIT_FAILURE,
					    "invalid host spec\n");
				}
				has_host_spec = true;
				break;
			case 'J':
				options.jumbo_frame_enabled = true;
				options.port_mtu = MAX_JUMBO_MTU;
				break;
			case 'p':
				// strtol instead of atoi: defined behavior on
				// out-of-range input
				options.portid = strtol(optarg, nullptr, 10);
				break;
			default:
				// an unknown flag is an error, not a success
				usage();
				rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
				    c);
			}
		}
	}

	if (!has_host_spec) {
		rte_exit(EXIT_FAILURE, "Must specify host spec\n");
	}

	// init libtopo
	if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
	    0) {
		rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
	}

	// init libnms
	if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) != 0) {
		rte_exit(EXIT_FAILURE, "libnms init failed!\n");
	}

	dump_options();

	// register dynamic field
	struct rte_mbuf_dynfield rte_mbuf_dynfield_probe_flag = {
		.name = "rte_mbuf_dynfield_probe_valid",
		.size = sizeof(bool),
		.align = __alignof__(uint32_t),
		.flags = 0
	};
	options.probe_state_offset = rte_mbuf_dynfield_register(
	    &rte_mbuf_dynfield_probe_flag);
	if (options.probe_state_offset == -1) {
		rte_exit(EXIT_FAILURE, "failed to register dynamic field: %d\n",
		    rte_errno);
	}

	// configure memory and port
	struct port_conf pconf;
	portconf_get(options.portid, &pconf);
	if (!pconf.timesync) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "main: timesync disabled. hw timestamp unavailable.\n ");
		options.s_hwtimestamp = false;
	}

	dconf.mtu = options.port_mtu;
	CPU_COPY(&options.cpu_set, &dconf.core_affinity);
	dconf.portid = options.portid;
	dconf.rss_hf = pconf.rss_hf;
	dconf.rx_offloads = pconf.rxoffload;
	dconf.tx_offloads = pconf.txoffload;
	dconf.timesync = pconf.timesync;

	dconf.rx_fn = rx_add_timestamp;
	dconf.rx_user = nullptr;
	dconf.rx_ring_sz = 2048;
	dconf.tx_fn = tx_add_timestamp;
	dconf.tx_user = nullptr;
	dconf.tx_ring_sz = 2048;

	mconf.cache_size = 512;
	mconf.priv_size = 0;
	mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
	    rte_lcore_count() / rte_socket_count();
	// the data room is always sized for jumbo frames
	mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
	    MAX_STANDARD_MTU;
	mconf.max_pools = -1;

	dpdk_init(&dconf, &mconf);

	if (rte_eth_macaddr_get(options.portid,
		&options.s_host_spec.mac_addr) != 0) {
		rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
		    options.portid);
	}

	// init threads
	// CPU_FFS yields a 1-based index (0 when the set is empty); the set is
	// consumed bit by bit, which is fine because dconf holds its own copy.
	uint32_t cpu_idx = CPU_FFS(&options.cpu_set);
	uint32_t tid = 0;
	while (cpu_idx != 0) {
		uint32_t lcore_id = cpu_idx - 1;
		uint32_t node_id = rte_lcore_to_socket_id(lcore_id);
		auto *tinfo = (struct thread_info *)nms_malloc(node_id,
		    sizeof(struct thread_info));
		// tinfo is dereferenced right away, so fail loudly instead of
		// crashing on a null pointer if the allocation did not succeed
		if (tinfo == nullptr) {
			rte_exit(EXIT_FAILURE,
			    "failed to allocate thread info for lcore %u\n",
			    lcore_id);
		}
		tinfo->cache_lines = nms_malloc(node_id,
		    CACHELINE_SIZE * options.thread_cacheline_cnt);
		tinfo->load_buffer = nms_malloc(node_id,
		    THREAD_LOAD_BUFFER_SZ);
		tinfo->tid = tid;
		tinfo->lcore_id = lcore_id;
		tinfo->node_id = node_id;
		// thread N polls rx queue N and sends on tx queue N
		tinfo->rxqid = tid;
		tinfo->txqid = tid;
		options.s_thr_info.push_back(tinfo);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: thread %d assigned to cpu %d, node %d\n", tinfo->tid,
		    tinfo->lcore_id, topo_core_to_numa(lcore_id));
		tid++;
		CPU_CLR(cpu_idx - 1, &options.cpu_set);
		cpu_idx = CPU_FFS(&options.cpu_set);
	}

	sleep(INIT_DELAY);

	for (int i = 0; i < options.num_threads; i++) {
		struct thread_info *tinfo = options.s_thr_info.at(i);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: launching thread %d on locore %d\n", tinfo->tid,
		    tinfo->lcore_id);
		if (rte_eal_remote_launch(locore_main,
			(void *)options.s_thr_info.at(i),
			tinfo->lcore_id) != 0) {
			rte_exit(EXIT_FAILURE,
			    "failed to launch function on locore %d\n",
			    tinfo->lcore_id);
		}
	}

	// the server runs until the process is killed
	while (true) {
		usleep(S2US);
	}

	// shouldn't get here
	// clean up
	for (int i = 0; i < options.num_threads; i++) {
		struct thread_info *tinfo = options.s_thr_info.at(i);
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
		    "main: waiting for locore %d...\n", tinfo->lcore_id);
		if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
			rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
			    tinfo->lcore_id);
		}
	}

	dpdk_cleanup(&dconf);

	return 0;
}

204
net/libnetsup/dpdk.cc Normal file
View File

@ -0,0 +1,204 @@
#include "net/netsup.hh"
#include <cstdlib>
#include "rte_build_config.h"
#include "rte_common.h"
#include "rte_config.h"
#include "rte_ether.h"
#include "rte_lcore.h"
#include "rte_mempool.h"
#include "rte_mbuf.h"
#include "rte_errno.h"
#include "rte_ethdev.h"
#include "ntr.h"
// mbuf pools indexed by NUMA node id; entries are filled in by mempool_init()
// and handed out by mempool_get()
static struct rte_mempool *g_mempools[MAX_NUMA_NODES] = {nullptr};
// number of pools created so far; mempool_get() uses it as the upper bound
// for valid node ids
static unsigned int g_mempool_sz = 0;
// Create one packet mbuf pool for every socket reported by
// rte_socket_count() and record it in g_mempools under the node id.
//
// mconf supplies the element count, per-lcore cache size, private area size
// and data room size used for every pool. Any creation failure terminates
// the process through rte_exit().
static void
mempool_init(struct mem_conf *mconf)
{
	char pool_name[64];
	uint32_t nodeid = 0;

	while ((int)nodeid < (int)rte_socket_count()) {
		// pools are named after the node they live on
		snprintf(pool_name, sizeof(pool_name), "net_mempool_%d", nodeid);

		struct rte_mempool *pool = rte_pktmbuf_pool_create(pool_name,
		    mconf->num_elements, mconf->cache_size, mconf->priv_size,
		    mconf->data_room_size, nodeid);
		if (pool == nullptr) {
			rte_exit(EXIT_FAILURE, "cannot create mbuf pool: %d\n", rte_errno);
		}

		g_mempools[nodeid] = pool;
		g_mempool_sz++;

		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "mempool_init: created mempool for node %d\n", nodeid);

		nodeid++;
	}
}
// Return the mbuf pool created for NUMA node `nodeid`, or nullptr when no
// pool exists for that id. Negative ids convert to large unsigned values and
// therefore also yield nullptr.
struct rte_mempool *
mempool_get(int nodeid)
{
	const unsigned int idx = (unsigned int)nodeid;

	return idx < g_mempool_sz ? g_mempools[idx] : nullptr;
}
// Configure and start the Ethernet port described by dconf.
//
// One rx queue and one tx queue are set up for every CPU in
// dconf->core_affinity; queue ids are assigned in the iteration order of the
// cpu set, and the rx queue of a core draws mbufs from the mempool of that
// core's socket. After the port is started, timesync (if requested) and
// promiscuous mode are enabled and the optional rx/tx callbacks are attached
// to every queue.
//
// dconf->rx_ring_sz and dconf->tx_ring_sz may be adjusted in place by
// rte_eth_dev_adjust_nb_rx_tx_desc(). Every failure terminates the process
// through rte_exit(); all fatal messages are newline terminated.
static void
port_init(struct device_conf *dconf)
{
	struct rte_ether_addr addr;
	struct rte_eth_dev_info dev_info {
	};
	struct rte_eth_conf port_conf;
	struct rte_eth_txconf txconf {
	};
	struct rte_eth_rxconf rxconf {
	};
	int ret;
	// one rx/tx queue pair per worker thread
	int num_threads = CPU_COUNT(&dconf->core_affinity);

	if (rte_eth_dev_count_avail() == 0) {
		rte_exit(EXIT_FAILURE, "number of ports must be > 0\n");
	}

	if (!rte_eth_dev_is_valid_port(dconf->portid)) {
		rte_exit(EXIT_FAILURE, "cannot find port %d\n", dconf->portid);
	}

	if ((ret = rte_eth_macaddr_get(dconf->portid, &addr)) != 0) {
		rte_exit(EXIT_FAILURE, "cannot get mac address of port: %d\n", ret);
	}

	ret = rte_eth_dev_info_get(dconf->portid, &dev_info);
	if (ret != 0) {
		rte_exit(EXIT_FAILURE, "failed to get dev info: %d\n", ret);
	}

	memset(&port_conf, 0, sizeof(struct rte_eth_conf));
	port_conf.rxmode.mtu = dconf->mtu;
	port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
	// no explicit RSS key is supplied
	port_conf.rx_adv_conf.rss_conf.rss_key = nullptr;
	port_conf.rx_adv_conf.rss_conf.rss_hf = dconf->rss_hf;
	port_conf.rxmode.offloads = dconf->rx_offloads;
	port_conf.txmode.offloads = dconf->tx_offloads;

	/* Configure the Ethernet device. */
	ret = rte_eth_dev_configure(dconf->portid, num_threads, num_threads, &port_conf);
	if (ret != 0) {
		rte_exit(EXIT_FAILURE, "failed to configure port: %d\n", ret);
	}

	ret = rte_eth_dev_adjust_nb_rx_tx_desc(dconf->portid, &dconf->rx_ring_sz, &dconf->tx_ring_sz);
	if (ret != 0) {
		rte_exit(EXIT_FAILURE, "failed to set rx tx queue size: %d\n", ret);
	}

	/* Allocate and set up 1 RX queue per thread per Ethernet port. */
	rxconf = dev_info.default_rxconf;
	rxconf.offloads = port_conf.rxmode.offloads;
	rxconf.rx_nseg = 0;
	rxconf.rx_seg = nullptr;
	txconf = dev_info.default_txconf;
	txconf.offloads = port_conf.txmode.offloads;

	int core;
	int qid = 0;
	CPU_FOREACH_ISSET(core, &dconf->core_affinity) {
		int socket = rte_lcore_to_socket_id(core);

		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "port_init: setting up rx & tx queue for core %d (socket %d)...\n", core, socket);

		ret = rte_eth_rx_queue_setup(dconf->portid, qid, dconf->rx_ring_sz, socket, &rxconf, mempool_get(socket));
		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "failed to setup rx queue for core %d: %d\n", core, ret);
		}

		ret = rte_eth_tx_queue_setup(dconf->portid, qid, dconf->tx_ring_sz, socket, &txconf);
		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "failed to setup tx queue for core %d: %d\n", core, ret);
		}

		qid++;
	}

	// set mtu
	ret = rte_eth_dev_set_mtu(dconf->portid, dconf->mtu);
	if (ret != 0) {
		rte_exit(EXIT_FAILURE, "failed to set mtu: %d\n", ret);
	}

	ret = rte_eth_dev_start(dconf->portid);
	if (ret < 0) {
		rte_exit(EXIT_FAILURE, "failed to start port: %d\n", ret);
	}

	if (dconf->timesync) {
		ret = rte_eth_timesync_enable(dconf->portid);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE, "failed to enable timesync: %d\n", ret);
		}
	}

	/* Enable RX in promiscuous mode for the Ethernet device. */
	ret = rte_eth_promiscuous_enable(dconf->portid);
	if (ret != 0) {
		rte_exit(EXIT_FAILURE, "failed to enable promiscuous mode: %d\n", ret);
	}

	// attach the optional user callbacks to every queue
	for (int i = 0; i < num_threads; i++) {
		if (dconf->tx_fn != nullptr) {
			if (rte_eth_add_tx_callback(dconf->portid, i, dconf->tx_fn, dconf->tx_user) == nullptr) {
				rte_exit(EXIT_FAILURE, "failed to attach callback to tx queue %d\n", i);
			}
		}

		if (dconf->rx_fn != nullptr) {
			if (rte_eth_add_rx_callback(dconf->portid, i, dconf->rx_fn, dconf->rx_user) == nullptr) {
				rte_exit(EXIT_FAILURE, "failed to attach callback to rx queue %d\n", i);
			}
		}
	}

	// sync_port_clock(portid);
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "port_init: configured port %d on socket %d with mac addr %x:%x:%x:%x:%x:%x\n",
	    dconf->portid, rte_eth_dev_socket_id(dconf->portid),
	    addr.addr_bytes[0],
	    addr.addr_bytes[1],
	    addr.addr_bytes[2],
	    addr.addr_bytes[3],
	    addr.addr_bytes[4],
	    addr.addr_bytes[5]);
}
// Initialize the per-node mempools and the Ethernet port.
//
// Before anything is created, the socket layout is validated: there must be
// no more sockets than MAX_NUMA_NODES and the socket at index i must have
// id i, because the mempool table is indexed directly by socket id.
// Violations terminate the process through rte_exit().
void
dpdk_init(struct device_conf *dconf, struct mem_conf *mconf)
{
	if (rte_socket_count() > (int)MAX_NUMA_NODES) {
		rte_exit(EXIT_FAILURE, "too many numa nodes\n");
	}

	// ensure 1-1 mapping
	int idx = 0;
	while (idx < (int)rte_socket_count()) {
		if (rte_socket_id_by_idx(idx) != idx) {
			rte_exit(EXIT_FAILURE, "socket %d has id %d instead.\n", idx, rte_socket_id_by_idx(idx));
		}
		idx++;
	}

	mempool_init(mconf);
	port_init(dconf);
}
void
dpdk_cleanup(struct device_conf * dconf)
{
rte_eth_dev_stop(dconf->portid);
rte_eth_dev_close(dconf->portid);
for (int i = 0; i < (int)rte_socket_count(); i++) {
rte_mempool_free(g_mempools[i]);
}
}

66
net/libnetsup/portconf.cc Normal file
View File

@ -0,0 +1,66 @@
#include "rte_ethdev.h"
#include "net/netsup.hh"
#include <cstdlib>
// Per-driver port settings, matched against the driver name reported by
// rte_eth_dev_info_get() in portconf_get(). Each entry lists the rx/tx
// offloads, the RSS hash functions and whether timesync should be enabled
// for ports bound to that driver.
static struct port_conf port_confs[] = {
{
.driver_name = "net_cxgbe",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_UDP | RTE_ETH_RSS_FRAG_IPV4,
.timesync = false
},
{
.driver_name = "net_i40e",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
.timesync = false
},
{
// the rx timestamp offload is requested here although timesync is off
.driver_name = "net_ice",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
.timesync = false
},
{
// the only listed driver for which timesync is enabled
.driver_name = "net_ixgbe",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM,
.txoffload = RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP,
.timesync = true
}
};
// Fallback settings handed out by portconf_get() when the port's driver name
// matches none of the entries above.
static struct port_conf default_conf = {
.driver_name = "default",
.rxoffload = RTE_ETH_RX_OFFLOAD_RSS_HASH | RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_TIMESTAMP,
.txoffload = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE | RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM,
.rss_hf = RTE_ETH_RSS_FRAG_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_UDP | RTE_ETH_RSS_NONFRAG_IPV4_OTHER | RTE_ETH_RSS_L2_PAYLOAD,
.timesync = true
};
// number of entries in port_confs
static const int port_size = sizeof(port_confs) / sizeof(port_confs[0]);
// Copy the port settings matching the driver of port `portid` into `out`.
//
// Returns 0 when the driver name is found in port_confs. When it is not, a
// notice is printed to stdout, `out` receives default_conf and -1 is
// returned. Failure to query the device info terminates the process through
// rte_exit().
int
portconf_get(int portid, struct port_conf * out)
{
	struct rte_eth_dev_info dev_info {};

	if (rte_eth_dev_info_get(portid, &dev_info) != 0) {
		rte_exit(EXIT_FAILURE, "failed to obtain device info for port %d\n", portid);
	}

	// look up the first table entry with the same driver name
	const struct port_conf *match = nullptr;
	for (int i = 0; i < port_size && match == nullptr; i++) {
		if (strcmp(port_confs[i].driver_name, dev_info.driver_name) == 0) {
			match = &port_confs[i];
		}
	}

	if (match == nullptr) {
		fprintf(stdout, "portconf_get: unable to find matching conf for port %d:%s, returning default conf.\n", portid, dev_info.driver_name);
		memcpy(out, &default_conf, sizeof(struct port_conf));
		return -1;
	}

	memcpy(out, match, sizeof(struct port_conf));
	return 0;
}

909
net/rat.cc Normal file
View File

@ -0,0 +1,909 @@
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <list>
#include <map>
#include <mutex>
#include <random>
#include <vector>
#include <sys/endian.h>
#include <topo.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <unistd.h>
#include "ntr.h"
#include "gen.hh"
#include "net/netsup.hh"
#include "net/pkt.hh"
#include "nms.h"
// capacity of the rx/tx mbuf arrays and the upper bound on packets handled
// per rx burst and queued per send round
constexpr static unsigned int BURST_SIZE = 32;
// Pack a thread id and a sequence number into one 32-bit epoch value:
// the id occupies the top 8 bits, the sequence number the low 24 bits.
//
// Both parts are masked to their field width. Without the mask a sequence
// number of 2^24 or more would spill into the id byte, and the value
// recovered by epoch_get_id() would no longer be the id passed in here.
// With the mask the sequence number simply wraps around.
static unsigned int
epoch_mk(unsigned int id, unsigned int epoch)
{
	return ((id & 0xFFu) << 24) | (epoch & 0x00FFFFFFu);
}
// Extract the thread id stored above the low 24 bits of a packed epoch.
static unsigned int
epoch_get_id(unsigned int epoch)
{
	const unsigned int seq_bits = 24;
	const unsigned int id = epoch >> seq_bits;

	return id;
}
// Extract the sequence number stored in the low 24 bits of a packed epoch.
static unsigned int
epoch_get_epoch(unsigned int epoch)
{
	const unsigned int seq_mask = (1u << 24) - 1u;

	return epoch & seq_mask;
}
// Bookkeeping record for one load packet, used both for packets that were
// sent and for responses that were received.
struct epoch_info {
// packed epoch value as produced by epoch_mk()
unsigned int epoch;
// topo_uptime_ns() timestamp taken when the packet was sent or received
uint64_t ts;
};
// Per-worker state of rat. One instance is created for every worker thread
// and passed to locore_main().
struct thread_info {
// worker index; also encoded into the epoch of every packet it sends
unsigned int id { 0 };
unsigned int lcore_id { 0 };
// rx/tx queue ids this worker polls and sends on
unsigned int rxqid { 0 };
unsigned int txqid { 0 };
// socket whose mempool is used for packet allocation (no default value)
int socket_id;
// this field is read by the stat collecting thread
std::atomic<int> recved_pkts { 0 };
std::atomic<int> lost_pkts { 0 };
// generators for the inter-arrival time and the two workload arguments
Generator *ia_gen { nullptr };
Generator *load_gen0 { nullptr };
Generator *load_gen1 { nullptr };
std::mutex
mtx; // this lock protects data shared between worker threads, i.e.:
// responses addressed to this worker; any worker may append to the list,
// the owner drains it in pkt_loop()
std::list<struct epoch_info *> recved_epochs;
thread_info() = default;
};
// values of options_t::s_state
constexpr static int STATE_SYNC = 0; // waiting for SYNC
constexpr static int STATE_SYNC_ACK = 1; // SYNC received, SYNC_ACK being sent
constexpr static int STATE_RUNNING = 2; // Running
constexpr static int STATE_FIN = 3; // FIN received
// number of workload argument generators (-w arg0 / arg1)
constexpr static int WORKLOAD_MAX_ARGS = 2;
// Global configuration and runtime state of rat. Fields above the "states"
// marker are set from the command line; the s_ prefixed ones are derived or
// updated at run time.
struct options_t {
unsigned int run_time { 5 };
// parameters
int slave_mode { 0 };
// in ms, see usage(); the default effectively disables the rage quit check
uint32_t rage_quit_time { UINT32_MAX };
// generator spec strings handed to createGenerator()
char ia_gen[256] { "fixed:0" };
char load_gen[WORKLOAD_MAX_ARGS][256] = {{"fixed:0"}, {"fixed:0"}};
uint32_t workload_type {LOAD_TYPE_CPU};
uint32_t target_qps { 0 };
// maximum number of packets in flight per worker
uint32_t depth { 1 };
struct net_spec server_spec { };
// NOTE(review): mask 0x2 presumably sets bit 1, i.e. CPU index 1 when
// counting from zero -- confirm against the "core 2" wording below
cpuset_t cpu_set = CPUSET_T_INITIALIZER(0x2); // 1 thread @ core 2
// a sent packet is counted as lost once it is older than this many ms
uint32_t pkt_loss_delay_ms { UINT32_MAX };
bool jumbo_frame_enabled { false };
int pkt_pad_sz { 0 };
int port_mtu { MAX_STANDARD_MTU };
int portid { 0 };
// states
unsigned int s_num_threads { 1 }; // 1 thread
struct net_spec s_host_spec { };
// filled in from the SYNC packet in proto_loop()
struct net_spec s_master_spec { };
// connection used for SYNC_ACK / FIN_ACK replies to the master
struct conn_spec s_master_cspec {
.src = &s_host_spec, .src_port = DEFAULT_RAT_PORT,
.dst = &s_master_spec, .dst_port = DEFAULT_RAT_PORT,
};
std::vector<struct thread_info *> s_thr_info;
std::atomic<int> s_state { STATE_RUNNING }; // default non master mode
// states for qps
// timestamp stored by each worker right before it enters pkt_loop()
std::atomic<uint64_t> s_ts_begin { 0 };
};
// the single global instance used throughout this file
static struct options_t options;
// Aggregate the per-thread counters into run-wide statistics.
//
// now:        current topo_uptime_ns() timestamp
// qps:        if not nullptr, receives received packets per second since
//             options.s_ts_begin; 0 when no time has elapsed
// recved_pkt: if not nullptr, receives the total number of received packets
// total_loss: if not nullptr, receives the total number of lost packets
static inline void
calc_stats(uint64_t now, uint32_t *qps, uint32_t *recved_pkt,
    uint32_t *total_loss)
{
	uint32_t recv = 0;
	uint32_t loss = 0;

	for (auto i : options.s_thr_info) {
		recv += i->recved_pkts.load();
		loss += i->lost_pkts.load();
	}

	if (recved_pkt != nullptr) {
		*recved_pkt = recv;
	}

	if (total_loss != nullptr) {
		*total_loss = loss;
	}

	if (qps != nullptr) {
		const uint64_t elapsed_ns = now - options.s_ts_begin.load();
		if (elapsed_ns == 0) {
			// dividing by a zero interval yields infinity, and
			// converting that to uint32_t is undefined
			*qps = 0;
		} else {
			*qps = (uint32_t)((double)(recv) /
			    ((double)(elapsed_ns) / (double)S2NS));
		}
	}
}
static void
proto_loop(struct thread_info *tinfo)
{
struct rte_mbuf *tx_buf;
struct rte_mbuf *rx_bufs[BURST_SIZE];
struct pkt_hdr *pkt_data;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"proto_loop <thread %d>: waiting for SYNC from cat\n", tinfo->id);
while (options.s_state.load() == STATE_SYNC) {
const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
tinfo->rxqid, rx_bufs, BURST_SIZE);
if (nb_rx > 0) {
for (int i = 0; i < nb_rx; i++) {
struct pkt_hdr *each = check_valid_packet(
rx_bufs[i], &options.s_host_spec.mac_addr);
if (each != nullptr) {
uint16_t type = rte_be_to_cpu_16(
each->type);
if (type == PKT_TYPE_SYNC) {
int expected = STATE_SYNC;
ntr(NTR_DEP_USER1,
NTR_LEVEL_INFO,
"proto_loop <thread %d>: received SYNC from cat\n",
tinfo->id);
if (!options.s_state
.compare_exchange_strong(
expected,
STATE_SYNC_ACK)) {
// someone barged in,
// listen to that guy
ntr(NTR_DEP_USER1,
NTR_LEVEL_WARNING,
"proto_loop <thread %d>: failed to cmpxchg sync_recv.\n",
tinfo->id);
} else {
pkt_hdr_to_netspec(each,
&options
.s_master_spec,
nullptr, nullptr,
nullptr);
if (alloc_pkt_hdr(
mempool_get(
tinfo
->socket_id),
PKT_TYPE_SYNC_ACK,
&options
.s_master_cspec,
0, &tx_buf,
&pkt_data) !=
0) {
rte_exit(
EXIT_FAILURE,
"failed to alloc pkt hdr\n");
}
tx_burst_all(
options.portid,
tinfo->txqid,
&tx_buf, 1);
expected =
STATE_SYNC_ACK;
// we've done our job,
// set off the threads
if (!options.s_state
.compare_exchange_strong(
expected,
STATE_RUNNING)) {
rte_exit(
EXIT_FAILURE,
"state unexpectedly changed\n");
}
ntr(NTR_DEP_USER1,
NTR_LEVEL_INFO,
"proto_loop <thread %d>: sent SYNC_ACK to cat\n",
tinfo->id);
}
} else {
ntr(NTR_DEP_USER1,
NTR_LEVEL_DEBUG,
"proto_loop <thread %d>: ignoring invalid packet %p type %d.\n",
tinfo->id,
(void *)rx_bufs[i], type);
}
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"proto_loop <thread %d>: ignoring invalid packet %p.\n",
tinfo->id, (void *)rx_bufs[i]);
//dump_pkt(rx_bufs[i]);
}
rte_pktmbuf_free(rx_bufs[i]);
}
}
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
"proto_loop <thread %d>: exiting loop...\n", tinfo->id);
}
// Main load generation loop of a rat worker; runs while the global state is
// STATE_RUNNING.
//
// Each iteration:
//   1. drains the rx queue: LOAD_RESP packets are handed to the worker whose
//      id is encoded in the epoch, a FIN from the master moves the state to
//      STATE_FIN and is answered with a FIN_ACK carrying the statistics;
//   2. moves the responses queued for this worker into a private vector and
//      matches them against the epochs this worker sent;
//   3. expires sent epochs older than options.pkt_loss_delay_ms as lost;
//   4. sends new LOAD packets while the schedule (next_ts), the in-flight
//      limit (options.depth) and BURST_SIZE allow it;
//   5. terminates the process if no response arrived for
//      options.rage_quit_time ms.
//
// All millisecond thresholds are widened to 64 bit before being converted to
// nanoseconds, and the inter-arrival step is converted through int64_t, so
// large configured values cannot overflow a 32-bit intermediate.
static void
pkt_loop(struct thread_info *tinfo)
{
	struct rte_mbuf *tx_bufs[BURST_SIZE];
	struct rte_mbuf *rx_bufs[BURST_SIZE];
	// responses dequeued from tinfo->recved_epochs, processed lock-free
	std::vector<struct epoch_info *> recved_epochs;
	// in-flight packets of this worker, keyed by packed epoch
	std::map<unsigned int, struct epoch_info *> sent_epochs;
	uint64_t cur_epoch = 0;
	uint64_t next_ts;
	uint64_t last_recv_ts = 0;
	struct conn_spec srv_cspec;
	rdport_generator src_port_gen(MIN_RANDOM_PORT);
	rdport_generator dst_port_gen(MIN_RANDOM_PORT);

	srv_cspec.src = &options.s_host_spec;
	srv_cspec.dst = &options.server_spec;

	next_ts = topo_uptime_ns();

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "pkt_loop <thread %d>: entering\n",
	    tinfo->id);

	while (options.s_state.load() == STATE_RUNNING) {
		uint64_t now = topo_uptime_ns();
		// always pop incoming packets
		const uint16_t nb_rx = rte_eth_rx_burst(options.portid,
		    tinfo->rxqid, rx_bufs, BURST_SIZE);

		if (nb_rx > 0) {
			for (int i = 0; i < nb_rx; i++) {
				struct pkt_hdr *each = check_valid_packet(
				    rx_bufs[i], &options.s_host_spec.mac_addr);

				if (each == nullptr) {
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop <thread %d>: ignoring invalid packet %p.\n",
					    tinfo->id, (void *)rx_bufs[i]);
					rte_pktmbuf_free(rx_bufs[i]);
					continue;
				}

				uint16_t type = rte_be_to_cpu_16(each->type);
				NTR_PKT(NTR_DEP_USER1, NTR_LEVEL_DEBUG, each,
				    "locore_main <thread %d>: ", tinfo->id);
				struct pkt_payload_epoch *pld_epoch;
				struct epoch_info *einfo;
				uint32_t epoch;
				uint32_t id;
				struct thread_info *other_t;
				int int_expected = STATE_RUNNING;
				switch (type) {
				case PKT_TYPE_LOAD_RESP:
					// NOTE(review): the payload is read as
					// pkt_payload_epoch although LOAD packets
					// are built as pkt_payload_load; presumably
					// the epoch field sits at the same offset
					// in both -- confirm against pkt.hh
					pld_epoch = (struct pkt_payload_epoch *)
							each->payload;
					epoch = rte_be_to_cpu_32(
					    pld_epoch->epoch);
					id = epoch_get_id(epoch);

					// printf("Load resp size : %d\n",
					// rx_bufs[i]->data_len);

					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop <thread %d>: packet %p epoch 0x%x id %d.\n",
					    tinfo->id, (void *)rx_bufs[i],
					    epoch, id);

					if (id >= options.s_num_threads) {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "pkt_loop <thread %d>: packet %p invalid id %d.\n",
						    tinfo->id,
						    (void *)rx_bufs[i], id);
						break;
					}

					// RSS may deliver the response to a
					// different worker than the sender, so
					// hand it to the worker named in the epoch
					einfo = new struct epoch_info;
					einfo->epoch = epoch;
					einfo->ts = now;

					other_t = options.s_thr_info.at(id);
					other_t->mtx.lock();
					other_t->recved_epochs.push_back(einfo);
					other_t->mtx.unlock();

					break;
				case PKT_TYPE_FIN:
					if (rte_is_same_ether_addr(
						&each->eth_hdr.src_addr,
						&options.s_master_spec
						     .mac_addr)) {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "pkt_loop <thread %d>: recved FIN from cat.\n",
						    tinfo->id);

						// master told us to stop!
						if (!options.s_state
							 .compare_exchange_strong(
							     int_expected,
							     STATE_FIN)) {
							ntr(NTR_DEP_USER1,
							    NTR_LEVEL_WARNING,
							    "pkt_loop <thread %d>: failed to cmpxchg state.\n",
							    tinfo->id);
						}

						uint32_t qps;
						uint32_t total_recv;
						uint32_t total_loss;

						calc_stats(now, &qps,
						    &total_recv, &total_loss);

						struct pkt_hdr *pkt_hdr;
						if (alloc_pkt_hdr(
							mempool_get(
							    tinfo->socket_id),
							PKT_TYPE_FIN_ACK,
							&options.s_master_cspec,
							0, &tx_bufs[0],
							&pkt_hdr) != 0) {
							rte_exit(EXIT_FAILURE,
							    "failed to allocate pkt hdr\n");
						}

						auto pld_qps =
						    (struct pkt_payload_qps *)
							pkt_hdr->payload;
						pld_qps->qps = rte_cpu_to_be_32(
						    qps);
						pld_qps->recved_pkts =
						    rte_cpu_to_be_32(
							total_recv);
						pld_qps->lost_pkts =
						    rte_cpu_to_be_32(
							total_loss);

						tx_burst_all(options.portid,
						    tinfo->txqid, &tx_bufs[0],
						    1);

						options.s_state.store(
						    STATE_FIN);

						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_DEBUG,
						    "pkt_loop <thread %d>: sent FIN_ACK to cat. QPS = %d.\n",
						    tinfo->id, qps);
					} else {
						ntr(NTR_DEP_USER1,
						    NTR_LEVEL_WARNING,
						    "pkt_loop <thread %d>: invalid FIN packet from a different cat.\n",
						    tinfo->id);
					}
					break;
				default:
					ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
					    "pkt_loop: ignoring packet %p with unknown type %d.\n",
					    (void *)rx_bufs[i], type);
				}

				rte_pktmbuf_free(rx_bufs[i]);
			}
		}

		// dequeue receved epochs
		struct epoch_info *einfo;
		tinfo->mtx.lock();
		while (!tinfo->recved_epochs.empty()) {
			// only dequeue, process later
			einfo = tinfo->recved_epochs.front();
			tinfo->recved_epochs.pop_front();

			// XXX: might call into the allocator
			// otherwise we need to have an array and do batching
			// => complex code and don't think it's worth it
			recved_epochs.push_back(einfo);
		}
		tinfo->mtx.unlock();

		if (!recved_epochs.empty())
			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "pkt_loop <thread %d>: dequeued %lu received epochs\n",
			    tinfo->id, recved_epochs.size());

		// process epochs
		while (!recved_epochs.empty()) {
			einfo = recved_epochs.back();
			recved_epochs.pop_back();

			auto it = sent_epochs.find(einfo->epoch);
			if (it != sent_epochs.end()) {
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: received epoch 0x%x\n",
				    tinfo->id, epoch_get_epoch(einfo->epoch));

				if (einfo->ts > last_recv_ts) {
					last_recv_ts = einfo->ts;
				}
				delete it->second;
				sent_epochs.erase(it);
				tinfo->recved_pkts.fetch_add(1);
			} else {
				// we recved an epoch we never sent
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: received epoch 0x%x but never sent it. Packet loss?\n",
				    tinfo->id, einfo->epoch);
			}
			delete einfo;
		}

		// handle packet loss
		for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
			einfo = it->second;
			// widen before scaling: the default UINT32_MAX ms must
			// not wrap around in a 32-bit multiplication
			if (now - einfo->ts >
			    (uint64_t)options.pkt_loss_delay_ms * MS2NS) {
				// timed out
				ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
				    "pkt_loop <thread %d>: epoch 0x%x is lost after not receiving for too long\n",
				    tinfo->id, einfo->epoch);
				delete it->second;
				it = sent_epochs.erase(it);
				tinfo->lost_pkts.fetch_add(1);
			} else {
				++it;
			}
		}

		// check to send the next packet
		uint32_t total_send = 0;
		while (now >= next_ts && sent_epochs.size() < options.depth &&
		    total_send < BURST_SIZE) {
			struct pkt_payload_load *pld_load;
			struct pkt_hdr *pkt_data;
			// 64-bit conversion: an interval above ~2.1 s does not
			// fit into an int once expressed in nanoseconds
			next_ts += (int64_t)(tinfo->ia_gen->generate() * S2NS);

			// change dst port for every packet for RSS
			srv_cspec.dst_port = dst_port_gen.next();
			srv_cspec.src_port = src_port_gen.next();
			if (alloc_pkt_hdr(mempool_get(tinfo->socket_id),
				PKT_TYPE_LOAD, &srv_cspec, options.pkt_pad_sz,
				&tx_bufs[total_send], &pkt_data) != 0) {
				rte_exit(EXIT_FAILURE,
				    "failed to allocate pkt hdr\n");
			}

			pld_load = (struct pkt_payload_load *)pkt_data->payload;
			pld_load->type = rte_cpu_to_be_32(options.workload_type);
			pld_load->arg0 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen0->generate());
			pld_load->arg1 = rte_cpu_to_be_32((uint32_t)tinfo->load_gen1->generate());
			unsigned int epoch = epoch_mk(tinfo->id, cur_epoch);
			pld_load->epoch = rte_cpu_to_be_32(epoch);
			cur_epoch++;

			// remember the packet until its response arrives or
			// it is declared lost
			einfo = new struct epoch_info;
			einfo->epoch = epoch;
			einfo->ts = now;
			sent_epochs.insert({ epoch, einfo });

			ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
			    "pkt_loop <thread %d>: sending packet %p with epoch 0x%x\n",
			    tinfo->id, (void *)tx_bufs[total_send], epoch);

			total_send++;
		}

		tx_burst_all(options.portid, tinfo->txqid, tx_bufs, total_send);

		// check rage quit only when we have sent a packet
		if (last_recv_ts == 0) {
			last_recv_ts = topo_uptime_ns();
		}
		if (topo_uptime_ns() >
		    (uint64_t)options.rage_quit_time * MS2NS + last_recv_ts) {
			rte_exit(EXIT_FAILURE,
			    "rat: thread %d waiting too long for resp. I F QUIT!\n",
			    tinfo->id);
		}
	}

	// clean up
	for (auto it = sent_epochs.begin(); it != sent_epochs.end();) {
		delete it->second;
		++it;
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG,
	    "pkt_loop <thread %d>: exiting loop...\n", tinfo->id);
}
// Worker thread entry point, launched on an lcore by main().
//
// tif points to the thread_info of this worker. In slave mode the worker
// first takes part in the SYNC handshake (proto_loop), then waits until the
// global state is STATE_RUNNING, records the start timestamp used for the
// QPS calculation and runs pkt_loop() until the state changes.
//
// Always returns 0.
static int
locore_main(void *tif)
{
	auto tinfo = (struct thread_info *)tif;
	uint32_t core_id = rte_lcore_id();

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "locore_main <thread %d>: running on core %d rxqid %d txqid %d...\n", tinfo->id,
	    core_id, tinfo->rxqid, tinfo->txqid);

	// A negative socket id means the socket of the port is unknown.
	// Socket 0 is a valid socket, so the comparison must include it;
	// otherwise a port on node 0 polled from another node goes unnoticed.
	const int port_socket = rte_eth_dev_socket_id(options.portid);
	if (port_socket >= 0 && port_socket != (int)rte_socket_id()) {
		ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
		    "locore_main <thread %d>: WARNING, port %d is on remote NUMA node to "
		    "polling thread.\n\tPerformance will "
		    "not be optimal.\n",
		    tinfo->id, options.portid);
	}

	if (options.slave_mode == 1) {
		// perform rat protocol
		proto_loop(tinfo);
	}

	// wait for the primary thread sending SYNC_ACK
	while (options.s_state.load() != STATE_RUNNING) {
	}

	// store the current timestamp
	options.s_ts_begin.store(topo_uptime_ns());
	pkt_loop(tinfo);

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "locore_main <thread %d>: exited\n",
	    tinfo->id);

	return 0;
}
// Log the effective rat configuration at INFO level.
//
// Unsigned fields are printed with %u; the rage quit time in particular used
// "%ul", which prints the value followed by a literal 'l'.
static void
dump_options()
{
	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
	    "Configuration:\n"
	    " verbosity = +%d\n"
	    " run time = %u\n"
	    " num threads = %u\n"
	    " rage quit time = %u\n"
	    " slave mode = %d\n"
	    " interarrival dist = %s\n"
	    " workload type = %u\n"
	    " workload arg0 = %s\n"
	    " workload arg1 = %s\n"
	    " qps = %u\n"
	    " host IP = 0x%x\n"
	    " depth = %u\n"
	    " packet loss time threshold = %u\n"
	    " jumbo frame = %d\n"
	    " packet pad size = %d\n"
	    " portid = %d\n",
	    ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING, options.run_time,
	    options.s_num_threads, options.rage_quit_time, options.slave_mode,
	    options.ia_gen, options.workload_type, options.load_gen[0], options.load_gen[1], options.target_qps,
	    options.s_host_spec.ip, options.depth, options.pkt_loss_delay_ms,
	    options.jumbo_frame_enabled, options.pkt_pad_sz, options.portid);
}
// Print the rat command line help to stdout, one option per line.
static void
usage()
{
	static const char *const help_lines[] = {
		"Usage:\n",
		" -v(vv): verbose mode\n",
		" -h: display the information\n",
		" -t: run time\n",
		" -s: server net spec\n",
		" -S: slave(rat) mode\n",
		" -A: affinity mask\n",
		" -i: inter-arrival time distribution\n",
		" -w: workload type\n",
		" -w (repeated): workload arg0 distribution\n",
		" -w (repeated): workload arg1 distribution\n",
		" -r: rage quit time (in ms)\n",
		" -q: target QPS\n",
		" -H: host net spec\n",
		" -D: max number of packets in flight\n",
		" -l: packet loss time threshold\n",
		" -J: enable jumbo frame\n",
		" -P: pad load packets to this size\n",
		" -p: portid\n",
	};

	for (const char *line : help_lines) {
		fprintf(stdout, "%s", line);
	}
}
int
main(int argc, char *argv[])
{
struct thread_info *tinfo;
bool has_host_spec = false;
ntr_init();
// init dpdk
int ret = rte_eal_init(argc, argv);
if (ret < 0) {
rte_exit(EXIT_FAILURE, "rte_eal_init failed!\n");
}
argc -= ret;
argv += ret;
// set warning level
ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);
{
int c;
int num_of_ws = 0;
// parse arguments
while ((c = getopt(argc, argv,
"vht:s:SA:i:w:r:q:H:D:l:JP:p:")) != -1) {
switch (c) {
case 'v':
ntr_set_level(NTR_DEP_USER1,
ntr_get_level(NTR_DEP_USER1) + 1);
break;
case 'h':
usage();
rte_exit(EXIT_SUCCESS, "\n");
case 't':
options.run_time = strtol(optarg, nullptr, 10);
break;
case 's':
if (str_to_netspec(optarg,
&options.server_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid server net spec\n");
}
break;
case 'S':
options.slave_mode = 1;
options.s_state =
STATE_SYNC; // set state to wait for SYNC
break;
case 'A':
cpulist_to_cpuset(optarg, &options.cpu_set);
options.s_num_threads = CPU_COUNT(
&options.cpu_set);
if (options.s_num_threads == 0) {
rte_exit(EXIT_FAILURE,
"invalid cpu mask %s\n", optarg);
}
break;
case 'i':
strncpy(options.ia_gen, optarg,
sizeof(options.ia_gen) - 1);
break;
case 'w':
if (num_of_ws == 0) {
options.workload_type = strtol(optarg, NULL, 10);
if (options.workload_type >= LOAD_TYPE_MAX) {
rte_exit(EXIT_FAILURE,
"invalid workload type %s\n", optarg);
}
} else if (num_of_ws <= WORKLOAD_MAX_ARGS) {
strncpy(options.load_gen[num_of_ws - 1], optarg, 255);
}
num_of_ws++;
break;
case 'r':
options.rage_quit_time = strtol(optarg, nullptr,
10);
break;
case 'q':
options.target_qps = strtol(optarg, nullptr,
10);
break;
case 'H':
has_host_spec = true;
if (str_to_netspec(optarg,
&options.s_host_spec) != 0) {
rte_exit(EXIT_FAILURE,
"invalid host net spec.\n");
}
break;
case 'D':
options.depth = strtol(optarg, nullptr, 10);
if (options.depth == 0) {
options.depth = UINT32_MAX;
}
break;
case 'l':
options.pkt_loss_delay_ms = strtol(optarg,
nullptr, 10);
if (options.pkt_loss_delay_ms == 0) {
options.pkt_loss_delay_ms = UINT32_MAX;
}
break;
case 'J':
options.jumbo_frame_enabled = true;
options.port_mtu = MAX_JUMBO_MTU;
break;
case 'P':
options.pkt_pad_sz = strtol(optarg, nullptr,
10);
break;
case 'p':
options.portid = strtol(optarg, nullptr, 10);
break;
default:
usage();
rte_exit(EXIT_FAILURE, "unknown argument: %c\n",
c);
}
}
}
if (options.pkt_pad_sz != 0 &&
options.pkt_pad_sz > mtu_to_pkt_size(options.port_mtu)) {
rte_exit(EXIT_FAILURE, "pkt_pad_sz is too large for mtu %d\n",
options.port_mtu);
}
if (!has_host_spec) {
rte_exit(EXIT_FAILURE, "Must specify host IP.\n");
}
// init libtopo
if (topo_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libtopo init failed!\n");
}
if (nms_init(ntr_get_level(NTR_DEP_USER1) - NTR_LEVEL_WARNING) !=
0) {
rte_exit(EXIT_FAILURE, "libnms init failed!\n");
}
dump_options();
// configure memory and port
struct port_conf pconf;
struct device_conf dconf;
struct mem_conf mconf;
portconf_get(options.portid, &pconf);
if (!pconf.timesync) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
"main: timesync disabled. hw timestamp unavailable.\n ");
}
dconf.mtu = options.port_mtu;
CPU_COPY(&options.cpu_set, &dconf.core_affinity);
dconf.portid = options.portid;
dconf.rss_hf = pconf.rss_hf;
dconf.rx_offloads = pconf.rxoffload;
dconf.tx_offloads = pconf.txoffload;
dconf.timesync = pconf.timesync;
dconf.rx_fn = nullptr;
dconf.rx_user = nullptr;
dconf.rx_ring_sz = 2048;
dconf.tx_fn = nullptr;
dconf.tx_user = nullptr;
dconf.tx_ring_sz = 2048;
mconf.cache_size = 512;
mconf.priv_size = 0;
mconf.num_elements = (dconf.rx_ring_sz + dconf.tx_ring_sz) *
rte_lcore_count() / rte_socket_count();
mconf.data_room_size = RTE_MBUF_DEFAULT_BUF_SIZE + MAX_JUMBO_MTU -
MAX_STANDARD_MTU;
mconf.max_pools = -1;
dpdk_init(&dconf, &mconf);
if (rte_eth_macaddr_get(options.portid,
&options.s_host_spec.mac_addr) != 0) {
rte_exit(EXIT_FAILURE, "cannot get mac address of port %d\n",
options.portid);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"Configured port %d with mac addr %x:%x:%x:%x:%x:%x\n",
options.portid, options.s_host_spec.mac_addr.addr_bytes[0],
options.s_host_spec.mac_addr.addr_bytes[1],
options.s_host_spec.mac_addr.addr_bytes[2],
options.s_host_spec.mac_addr.addr_bytes[3],
options.s_host_spec.mac_addr.addr_bytes[4],
options.s_host_spec.mac_addr.addr_bytes[5]);
unsigned int cpuset_idx = CPU_FFS(&options.cpu_set);
unsigned int tid = 0;
while (cpuset_idx != 0) {
unsigned int lcore_id = cpuset_idx - 1;
tinfo = new thread_info;
tinfo->ia_gen = createGenerator(options.ia_gen);
tinfo->load_gen0 = createGenerator(options.load_gen[0]);
tinfo->load_gen1 = createGenerator(options.load_gen[1]);
if (tinfo->ia_gen == nullptr || tinfo->load_gen0 == nullptr || tinfo->load_gen1 == nullptr) {
rte_exit(EXIT_FAILURE,
"invalid ia_gen or ld_gen string\n");
}
tinfo->ia_gen->set_lambda((double)options.target_qps /
(double)(options.s_num_threads));
tinfo->id = tid;
tinfo->lcore_id = lcore_id;
tinfo->socket_id = rte_lcore_to_socket_id(lcore_id);
tinfo->rxqid = tid;
tinfo->txqid = tid;
options.s_thr_info.push_back(tinfo);
tid++;
CPU_CLR(lcore_id, &options.cpu_set);
cpuset_idx = CPU_FFS(&options.cpu_set);
}
sleep(INIT_DELAY);
for (unsigned int i = 0; i < options.s_num_threads; i++) {
tinfo = options.s_thr_info.at(i);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: launching thread %d on locore %d\n", tinfo->id,
tinfo->lcore_id);
if (rte_eal_remote_launch(locore_main,
(void *)options.s_thr_info.at(i),
tinfo->lcore_id) != 0) {
rte_exit(EXIT_FAILURE,
"failed to launch function on locore %d\n",
tinfo->lcore_id);
}
}
// poor man's timer
uint32_t second = 0;
// this loop exit is signaled by SYNC_FIN in slave mode and by itself in
// non slave mode
while (options.s_state.load() != STATE_FIN) {
if (options.slave_mode != 1) {
if (second >= options.run_time) {
options.s_state.store(STATE_FIN);
break;
}
usleep(1 * S2US);
second++;
}
}
for (unsigned int i = 0; i < options.s_num_threads; i++) {
tinfo = options.s_thr_info.at(i);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO,
"main: waiting for locore %d...\n", tinfo->lcore_id);
if (rte_eal_wait_lcore(tinfo->lcore_id) != 0) {
rte_exit(EXIT_FAILURE, "failed to wait for locore %d\n",
tinfo->lcore_id);
}
}
uint32_t qps;
uint32_t total_recv;
uint32_t total_loss;
calc_stats(topo_uptime_ns(), &qps, &total_recv, &total_loss);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "qps = %d, recv = %d, loss = %d\n",
qps, total_recv, total_loss);
for (auto each : options.s_thr_info) {
delete each->load_gen0;
delete each->load_gen1;
delete each->ia_gen;
delete each;
}
// clean up
dpdk_cleanup(&dconf);
return 0;
}

50
scripts/cc_pin.py Normal file
View File

@ -0,0 +1,50 @@
import os
import sys
import getopt
import subprocess
# Pin the interrupts of one Chelsio t6nex port to a strided set of CPU cores.
#
# Options:
#   -b N  first core to pin to (default 0)
#   -s N  distance between the cores used for consecutive IRQs (default 2)
#   -d N  t6nex device number to match in the sysctl output (default 0)
#   -p N  port number; lines are matched on the substring "<port>a" (default 0)
options = getopt.getopt(sys.argv[1:], 'b:s:d:p:')[0]
base = 0
stride = 2
num = 0
port = 0
for opt, arg in options:
    if opt == '-b':
        base = int(arg)
    elif opt == '-s':
        stride = int(arg)
    elif opt == '-d':
        num = int(arg)
    elif opt == '-p':
        port = int(arg)

# Dump every sysctl and keep only the irq lines that mention the requested
# device and port.
result = subprocess.run("sysctl -a", shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
lines = result.stdout.decode().split('\n')
cclines : list[str] = []
for line in lines:
    if ("irq" in line) and (f"t6nex{num}" in line) and (f"{port}a" in line):
        cclines.append(line)

if len(cclines) == 0:
    # Report both filter terms so the message matches what was searched for.
    print(f"No t6nex{num} {port}a irq lines from sysctl.\n")
    sys.exit(1)

# The first space-separated field is expected to look like "irq<N>:".
irqs = []
for line in cclines:
    eles = line.split(' ')
    irq = eles[0]
    if (irq.startswith("irq") and irq.endswith(":")):
        irq = irq[3:-1]
        irqs.append(int(irq))
    else:
        print(f"Unknown line format: {line}")

print(f"Detected {len(irqs)} irqs:\n{str(irqs)}")

# Bind each IRQ to its own core, advancing by `stride` every time.
for irq in irqs:
    print(f"Setting irq{irq}'s affinity to core {base}...")
    subprocess.run(f"cpuset -l {base} -x {irq}", check=True, shell=True)
    base = base + stride

sys.exit(0)

9
scripts/copy-mount.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Copy both mount helper scripts into the home directory of user "oscar" on
# every lab machine (scp connects on port 77).
for host in icelake1-int milan1-int icelake2-int milan2-int; do
    for script in mount.sh mount_small.sh; do
        scp -P77 "$script" "oscar@${host}.rcs.uwaterloo.ca:~/"
    done
done

230
scripts/dpdk.py Executable file
View File

@ -0,0 +1,230 @@
from cgi import test
from site import abs_paths
import subprocess as sp
import time
import select
import os
import datetime
import pwd
import sys
import getopt
import numpy as np
import re
import libpar as par
import libtc as tc
import libmechspec as mechspec
import netexp
# When True, main() stops after the max-QPS run of each configuration and
# skips the stepped QPS sweep.
only_max_qps = True

# PMC configurations to iterate over.
# Each entry is [[counter names], counting mode (0 = sampling, 1 = counting)];
# an empty entry (length 0) means "run without PMC".
pmc_counters = [
    "",
    # [["mem_load_l3_miss_retired.local_dram"], 1],
    # [["mem_load_l3_miss_retired.remote_dram"], 1],
    # [["mem_load_l3_miss_retired.remote_hitm"], 1],
    # [["mem_load_l3_miss_retired.remote_fwd"], 1]
    # [["mem_trans_retired.load_latency_gt_8"], 0],
    # [["mem_trans_retired.load_latency_gt_16"], 0],
    # [["mem_trans_retired.load_latency_gt_32"], 0],
    # [["mem_trans_retired.load_latency_gt_64"], 0],
    # [["mem_trans_retired.load_latency_gt_128"], 0],
    # [["mem_trans_retired.load_latency_gt_256"], 0],
    # [["mem_trans_retired.load_latency_gt_512"], 0],
    # [["mem_trans_retired.load_latency_gt_8", ""], 0],
]

# Client packet pad sizes to iterate over.
clt_pkt_pads = [
    0,
    # 256,
    # 512,
    # 1024,
    # 2048,
    # 4096,
    # 8192
]

# Client request depth to use for a given packet pad size.
clt_pkt_pads_depth = {
    0: 8,
    256: 6,
    512: 6,
    1024: 4,
    1518: 4,
    2048: 2,
    4096: 2,
    8192: 1,
    9018: 1,
}

# Client workloads: [workload type, workload arg 0, workload arg 1].
clt_wrkld = [
    [0, "fixed:0", "fixed:0"],
    # [0, "uniform:1000", "fixed:0"],
    # [0, "uniform:100", "fixed:0"],
    # [0, "uniform:10", "fixed:0"],
    # [1, "uniform:480", "uniform:1024"],
    # [1, "uniform:480", "uniform:256"],
    # [1, "uniform:480", "uniform:64"]
]

# Directory of this script and its parent directory.
file_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.join(file_dir, "..")

# Server core sets to iterate over.
# [srv_affinity, OPTIONAL( memgen_affinity, iteration, buffer_size, target_dom )]
server_affinity = [
    ["1,3,5,7,9,11,13,15,17,19,21,23"],
    ["25,27,29,31,33,35,37,39,41,43,45,47"],
    # ["1,3,5,7,9,11,13,15,17,19,21,23", "26,28,30,32,34,36,38,40,42,44,46", -1, 512*1024*1024, 0],
    # ["25,27,29,31,33,35,37,39,41,43,45,47", "2,4,6,8,10,12,14,16,18,20,22", -1, 512*1024*1024, 1],
    # "65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127",
    # "1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63",
    # "1,3,5,7,9,11,13,15",
    # "17,19,21,23,25,27,29,31",
    # "33,35,37,39,41,43,45,47",
    # "49,51,53,55,57,59,61,63"
]
def flush_netresult(conf : netexp.NetExpConf, result : netexp.NetExpResult):
    """Persist one run's raw data into the current test directory and log a summary.

    Writes "<qps>.txt" with the raw sample. When PMC is enabled it also writes
    "<qps>.pmc" (raw counter text in counting mode, binary log plus
    "<qps>.pmc_parsed" in sampling mode). Then logs loss percentages and
    latency tables for server/client hardware and software timestamps.

    Args:
        conf: experiment configuration; `enable_pmc` and `pmc_mode` are read.
        result: run result; `parser`, `sample` and `pmc_parser` are read.
    """
    def loss_pct(loss, recv):
        # A run that recorded no packets at all has 0% loss; dividing here
        # would raise ZeroDivisionError and abort the whole sweep.
        total = float(recv + loss)
        if total == 0:
            return 0.0
        return float(loss) / total * 100.0

    parser = result.parser
    out_prefix = tc.get_odir() + "/" + str(parser.qps)

    with open(out_prefix + ".txt", "w") as f:
        f.write(result.sample)

    if conf.enable_pmc:
        pmc_out = out_prefix + ".pmc"
        if conf.pmc_mode != 0:
            # counting mode: pmc_parser is a parsed object carrying the raw text
            with open(pmc_out, "w") as f:
                f.write(result.pmc_parser.raw)
        else:
            # sampling mode: pmc_parser is [binary log, processed text]
            with open(pmc_out, "wb") as f:
                f.write(result.pmc_parser[0])
            with open(pmc_out + "_parsed", "w") as g:
                g.write(result.pmc_parser[1])

    tc.log_print("=== Summary - qps: " + str(parser.qps) +
                 " master loss: " + str(loss_pct(parser.master_loss, parser.master_recv)) +
                 "% slave loss: " + str(loss_pct(parser.slave_loss, parser.slave_recv)) + "%" )

    tc.log_print("=== Server HW:")
    tc.log_print(par.mutilate_data.build_mut_output(parser.srv_hwlat, [parser.qps]) + "\n")
    tc.log_print("=== Server SW:")
    tc.log_print(par.mutilate_data.build_mut_output(parser.srv_swlat, [parser.qps]) + "\n")
    tc.log_print("=== Client HW:")
    tc.log_print(par.mutilate_data.build_mut_output(parser.clt_hwlat, [parser.qps]) + "\n")
    tc.log_print("=== Client SW:")
    tc.log_print(par.mutilate_data.build_mut_output(parser.clt_swlat, [parser.qps]) + "\n")

    if conf.enable_pmc:
        if conf.pmc_mode != 0:
            tc.log_print("=== PMC:")
            tc.log_print("counter: " + result.pmc_parser.counter + " count: " + str(result.pmc_parser.count) + " cores: " + str(result.pmc_parser.cores))
def main():
    """Drive the DPDK latency experiment over every configured combination.

    Command-line flags:
        -s  stop all remote processes and exit
        -c  client-only mode (return after the first max-QPS run)
        -S  run netexp.setup() for the benchmark binaries and exit
        -D  run netexp.setup() for DPDK and exit

    For each (server affinity, packet pad, workload, PMC config) tuple the
    max-QPS run is executed and flushed; unless `only_max_qps` is set, a
    stepped sweep of ten QPS points below the maximum follows.
    """
    tc.set_ssh_param("-o StrictHostKeyChecking=no -p77")
    tc.set_ssh_user("oscar")
    output_dirname = "run"

    conf = netexp.NetExpConf()
    conf.srv_mechspec = mechspec.LAB.SKYLAKE1_10G
    conf.clt_mechspecs = [mechspec.LAB.SKYLAKE3_10G, mechspec.LAB.SKYLAKE5_10G]
    conf.mst_mechspec = mechspec.LAB.SKYLAKE2_10G
    conf.finalize_mechspecs()
    conf.root_dir = "/numam.d/build/bin"

    # server fixed configs
    conf.srv_port = 0

    # client fixed configs
    conf.clt_ia = "exponential"
    conf.clt_affinity = "1,3,5,7,9,11,13,15,17,19,21,23"
    conf.clt_port = 0
    conf.clt_pkt_loss_lat = 5000
    conf.clt_rage_quit_lat = 5000

    # master fixed configs
    conf.mst_port = 0
    conf.mst_warmup = 5
    conf.mst_duration = 20
    conf.mst_qps = 100
    conf.mst_ia = "exponential"
    conf.mst_pkt_loss_lat = 5000
    conf.mst_pkt_loss_max = 100
    conf.mst_affinity = "2"

    # pmc stuff
    conf.pmc_sampling_rate = 4096
    conf.pmc_counting_interval = 0.1

    options = getopt.getopt(sys.argv[1:], 'scSD')[0]
    for opt, arg in options:
        if opt == '-s':
            netexp.stop_all(conf)
            return
        elif opt == '-c':
            conf.enable_client_only = True
        elif opt == '-S':
            netexp.setup(conf, bench = True, dpdk = False)
            return
        elif opt == '-D':
            netexp.setup(conf, bench=False, dpdk=True)
            return

    tc.init("~/results.d/numam_neo/" + output_dirname + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

    # keep a copy of this script next to the results it produced
    cpcmd = "cp " + __file__ + " " + tc.get_odir() + "/"
    tc.log_print(cpcmd)
    sp.check_call(cpcmd, shell=True)

    for eaff in server_affinity:
        conf.srv_affinity = eaff[0]
        conf.enable_memgen = False
        if len(eaff) > 1:
            # optional memory load generator parameters follow the affinity
            conf.enable_memgen = True
            conf.memgen_affinity = eaff[1]
            conf.memgen_iteration = eaff[2]
            conf.memgen_size = eaff[3]
            conf.memgen_tgtdom = eaff[4]

        for epad in clt_pkt_pads:
            # Use the pad size being iterated; it was previously fixed at 0,
            # so every iteration ran unpadded while the test name said otherwise.
            conf.clt_pkt_pad = epad
            conf.clt_pkt_depth = clt_pkt_pads_depth[conf.clt_pkt_pad]

            for eload in clt_wrkld:
                conf.clt_wrkld = eload[0]
                conf.clt_wrkarg0 = eload[1]
                conf.clt_wrkarg1 = eload[2]

                for epmc in pmc_counters:
                    conf.enable_pmc = False
                    if len(epmc) > 0:
                        conf.enable_pmc = True
                        conf.pmc_counters = epmc[0]
                        conf.pmc_mode = epmc[1]

                    test_name = "affinity" + eaff[0] + "_pad" + str(epad) + "_load" + str(eload[0]) + "," + str(eload[1]) + "," + str(eload[2])
                    if (conf.enable_memgen):
                        test_name += "_memload" + str(eaff[1]) + "," + str(eaff[2]) + "," + str(eaff[3]) + "," + str(eaff[4])
                    if (conf.enable_pmc):
                        test_name += "_pmc" + str(epmc[1]) + "_" + conf.get_pmc_str()
                    tc.begin(test_name)

                    # clt_qps == 0 asks for the maximum achievable QPS
                    conf.clt_qps = 0
                    tc.log_print("============ " + test_name + " QPS: MAX ============")
                    result : netexp.NetExpResult = netexp.run(conf)
                    flush_netresult(conf, result)
                    max_qps = result.parser.qps

                    if conf.enable_client_only:
                        return

                    if only_max_qps:
                        continue

                    # sweep ten evenly spaced points up to just below max QPS
                    finish = int(max_qps - max(conf.mst_qps, 0.01 * max_qps))
                    # a zero step would never advance cur_qps and loop forever
                    step = max(1, int(finish / 10))
                    cur_qps = step
                    while cur_qps <= finish:
                        tc.log_print("============ " + test_name + " QPS: " + str(cur_qps) + " ============")
                        conf.clt_qps = cur_qps
                        result : netexp.NetExpResult = netexp.run(conf)
                        # flush_netresult takes (conf, result); the conf
                        # argument was missing here and raised TypeError
                        flush_netresult(conf, result)
                        cur_qps += step
                    tc.log_print("")
                    tc.end()

    netexp.stop_all(conf)

main()

132
scripts/graph.py Executable file
View File

@ -0,0 +1,132 @@
#!/usr/bin/env python3.6
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker
import numpy as np
import sys
import re
import os
import json
import libpar as par
import getopt
import math
import concurrent.futures as CF
def process_dir(rootdir):
    """Parse every ".txt" sample file directly inside `rootdir`.

    Files with at most one line are reported and skipped. Files that fail to
    parse are reported as unrecognized and skipped.

    Args:
        rootdir: directory to scan (not recursive).

    Returns:
        List of parser objects, one per successfully parsed file.
    """
    ret = []
    print("Processing directory " + rootdir + " ...")
    for subdir in os.listdir(rootdir):
        each_dir = os.path.join(rootdir, subdir)
        if not (os.path.isfile(each_dir) and each_dir.endswith(".txt")):
            continue
        try:
            # read once; the line count decides whether the file is empty
            with open(each_dir, 'r') as f:
                lines = f.readlines()
            if len(lines) <= 1:
                print("Skipping empty file - " + each_dir)
                continue
            parser = par.khat_parser()
            parser.parse("".join(lines))
            print("Processed raw data - " + each_dir)
            ret.append(parser)
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C and
            # SystemExit still terminate the script
            print("Unrecognized format - " + subdir)
    print("")
    return ret
# Marker styles handed out to successive series; the 11 base markers are
# listed twice.
marker_map = ["o", "P", "s", "v", "*", "+", "^", "1", "2", "d", "X"] * 2

# Line colors handed out to successive series.
color_map = [
    "xkcd:black", "xkcd:red", "xkcd:blue", "xkcd:green", "xkcd:cyan", "xkcd:purple",
    "xkcd:orange", "xkcd:salmon", "xkcd:lightgreen", "xkcd:indigo", "xkcd:brown", "xkcd:bubblegum",
    "xkcd:lavender", "xkcd:maroon", "xkcd:fern", "xkcd:sky", "xkcd:orchid", "xkcd:sienna",
]

# Output file name for each parser latency index (see main()).
parser_idx_labels = ["srv_hw", "srv_sw", "clt_hw", "clt_sw"]
def add_curve(eax, label : str, qps_arr : [], lat_arr : [], marker : str, color : str):
    """Plot latency against QPS on `eax`, with the points ordered by QPS.

    Args:
        eax: axes-like object exposing plot().
        label: legend label of the curve.
        qps_arr: x values.
        lat_arr: y values, parallel to `qps_arr`.
        marker: marker style.
        color: line color.
    """
    frame = pd.DataFrame({'qps': qps_arr, 'lat': lat_arr}).sort_values('qps')
    eax.plot('qps', 'lat', data=frame, label=label, marker=marker, color=color, markersize=8)
# adds curves (avg and 99th percentile) for a specific parser idx
def add_curves(rax, label : str, parsers : [], parser_idx : int, marker : str, color : str):
    """Add one average curve to rax[0] and one 99th-percentile curve to rax[1].

    Each parser contributes a single point: its qps and the mean / 99th
    percentile of the latency array selected by `parser_idx`.
    """
    qps_points = []
    mean_points = []
    p99_points = []
    for each in parsers:
        latencies = list(each.get_stat_arr(parser_idx))
        qps_points.append(each.qps)
        mean_points.append(np.mean(latencies))
        p99_points.append(np.percentile(latencies, 99))
    add_curve(rax[0], label, qps_points, mean_points, marker, color)
    add_curve(rax[1], label, qps_points, p99_points, marker, color)
# generate the graphs for a parser index
def generate_graph(aff_to_parser : {}, parser_idx : int, fn : str):
    """Render the average and 99th-percentile latency graphs into one figure.

    Args:
        aff_to_parser: mapping of series name to a list of parsers.
        parser_idx: which latency array of each parser to plot.
        fn: path handed to plt.savefig().
    """
    fig, rax = plt.subplots(2, 1)
    for ax, title in zip(rax, ("Average", "99th percentile")):
        ax.set_yscale("log")
        ax.set_title(title)
        ax.set_xlabel("QPS")
        ax.set_ylabel("Latency (ns)")
        ax.xaxis.get_major_formatter().set_scientific(False)
        ax.yaxis.set_minor_formatter(ticker.ScalarFormatter())

    print("Generating graph => " + fn + "...")
    for series_idx, aff in enumerate(aff_to_parser):
        # Each series gets its own marker and color. Wrap around so that more
        # series than table entries reuse styles instead of raising IndexError.
        marker_type = marker_map[series_idx % len(marker_map)]
        color_type = color_map[series_idx % len(color_map)]
        print(" Processing affinity " + aff + "...")
        add_curves(rax, aff, aff_to_parser[aff], parser_idx, marker_type, color_type)

    rax[0].legend()
    rax[1].legend()
    fig.set_size_inches(23.4, 16.5)
    plt.savefig(fn, dpi=150)
    plt.close()
def main():
    """Parse `-d <dir>`, load every sub-directory as one series, and emit one graph per latency index."""
    datdir = None
    for opt, arg in getopt.getopt(sys.argv[1:], 'd:')[0]:
        if opt == '-d':
            datdir = arg
    if datdir is None:
        raise Exception("Must specify -d parameter")

    # every non-file entry of the data directory becomes one named series
    dat = {}
    for subdir in os.listdir(datdir):
        each_dir = os.path.join(datdir, subdir)
        if os.path.isfile(each_dir):
            continue
        dat[subdir] = process_dir(each_dir)

    for idx, idx_label in enumerate(parser_idx_labels):
        generate_graph(dat, idx, datdir + "/" + idx_label)

if __name__ == "__main__":
    main()

105
scripts/histo.py Normal file
View File

@ -0,0 +1,105 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import sys
import re
import os
import json
import getopt
import math
import concurrent.futures as CF
import libpar as par
# number of histogram buckets used by saveplot()
num_bins = 250
extra_pct = []

def saveplot(fp : str, data : [], title : str):
    """Draw a histogram of `data` and save it as "<fp>_<title>_.png".

    The current pyplot figure is resized, written at 160 dpi and then cleared.
    """
    out_path = fp + "_" + title + "_" + ".png"
    plt.hist(data, num_bins)
    plt.title(title)
    plt.xlabel("Delay")
    plt.ylabel("Frequency")
    figure = plt.gcf()
    figure.set_size_inches(11.69, 8.27)
    figure.savefig(out_path, dpi=160)
    plt.clf()
    print("Generated - " + out_path)
# Process pool sized to the machine's CPU count. os.cpu_count() may return
# None when the count cannot be determined, so fall back to one worker instead
# of crashing in int(None).
executor = CF.ProcessPoolExecutor(max_workers=int(os.cpu_count() or 1))
def clean_data(dat: []):
    """Return the values of `dat` that are at or below its 99th percentile, in original order."""
    values = np.array(dat)
    threshold = np.percentile(values, 99)
    return [v for v in values if v <= threshold]
def process_file(each_dir):
    """Produce four delay histograms and a stats file for one sample file.

    The server/client hardware and software delays are derived from every
    data point, trimmed with clean_data(), plotted with saveplot() and
    summarized into "<file>_stats.txt". Any exception is printed, not raised.
    """
    try:
        print("Processing " + each_dir + " ...")
        with open(each_dir, 'r') as f:
            parser = par.khat_parser()
            parser.parse(f.read())
        points = parser.datapt

        # raw per-request delays
        raw_sh = [pt.s_htx - pt.s_hrx for pt in points]
        raw_ss = [pt.s_stx - pt.s_srx for pt in points]
        raw_ch = [pt.c_hrx - pt.c_htx for pt in points]
        raw_cs = [pt.c_srx - pt.c_stx for pt in points]

        # (plot title, stats heading, trimmed data), in output order
        series = [
            ("server_hw_delay", "===================== SERVER HW ====================\n", clean_data(raw_sh)),
            ("server_sw_delay", "\n===================== SERVER SW ====================\n", clean_data(raw_ss)),
            ("client_hw_delay", "\n===================== CLIENT HW ====================\n", clean_data(raw_ch)),
            ("client_sw_delay", "\n===================== CLIENT SW ====================\n", clean_data(raw_cs)),
        ]

        for title, _, data in series:
            saveplot(each_dir, data, title)

        # output median, etc.
        with open(each_dir + "_" + "stats.txt", 'w') as f:
            for _, heading, data in series:
                f.write(heading)
                f.write(par.mutilate_data.build_mut_output(data, [len(data)]))
    except Exception:
        print("Unexpected error:", sys.exc_info())
def process_dir(rootdir):
    """Recursively run process_file() on every ".txt" / ".sample" file under `rootdir`."""
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            # anything that is not a regular file is descended into
            process_dir(path)
            continue
        if path.endswith(".txt") or path.endswith(".sample"):
            process_file(path)
def main():
    """Parse `-d <dir>`, process the directory tree, then shut the executor down."""
    datdir = None
    for opt, arg in getopt.getopt(sys.argv[1:], 'd:')[0]:
        if opt == '-d':
            datdir = arg
    if datdir is None:
        raise Exception("Must specify -d parameter")
    process_dir(datdir)
    executor.shutdown()

if __name__ == "__main__":
    main()

View File

@ -0,0 +1,25 @@
class NetSpec:
    """Network identity of one machine: host name, IP address and MAC address."""

    def __init__(self, fqdn, ip, mac) -> None:
        self.fqdn = fqdn
        self.ip = ip
        self.mac = mac
        # combined "<ip>@<mac>" form
        self.netspec = ip + "@" + mac
class LabNetSpecs:
    """Catalog of the lab machines' network specs, one attribute per machine/NIC.

    Entries with an empty MAC address have no MAC recorded here.
    """

    def __init__(self) -> None:
        # (attribute name, fqdn, ip, mac)
        machines = (
            ("SKYLAKE1_10G", "skylake1.rcs.uwaterloo.ca", "192.168.123.11", "3c:15:fb:62:9b:28"),
            ("SKYLAKE2_10G", "skylake2.rcs.uwaterloo.ca", "192.168.123.12", "3c:15:fb:c9:f3:36"),
            ("SKYLAKE3_10G", "skylake3.rcs.uwaterloo.ca", "192.168.123.13", "3c:15:fb:c9:f3:4b"),
            ("SKYLAKE4_10G", "skylake4.rcs.uwaterloo.ca", "192.168.123.14", ""),
            ("SKYLAKE5_10G", "skylake5.rcs.uwaterloo.ca", "192.168.123.15", "3c:15:fb:c9:f3:28"),
            ("SKYLAKE6_10G", "skylake6.rcs.uwaterloo.ca", "192.168.123.16", "3c:15:fb:62:9b:2f"),
            ("SKYLAKE7_10G", "skylake7.rcs.uwaterloo.ca", "192.168.123.17", "3c:15:fb:c9:f3:44"),
            ("SKYLAKE8_10G", "skylake8.rcs.uwaterloo.ca", "192.168.123.18", "3c:15:fb:62:9c:be"),
            ("MILAN1_100G", "milan1-int.rcs.uwaterloo.ca", "192.168.123.19", ""),
            ("MILAN1_10G", "milan1-int.rcs.uwaterloo.ca", "192.168.123.19", "a0:42:3f:4d:cb:bc"),
            ("ICELAKE2_100G", "icelake2-int.rcs.uwaterloo.ca", "192.168.123.20", ""),
            ("ICELAKE2_10G", "icelake2-int.rcs.uwaterloo.ca", "192.168.123.20", ""),
        )
        for attr, fqdn, ip, mac in machines:
            setattr(self, attr, NetSpec(fqdn = fqdn, ip = ip, mac = mac))

# shared instance used by the experiment scripts
LAB = LabNetSpecs()

196
scripts/libs/libpar.py Normal file
View File

@ -0,0 +1,196 @@
import json
import numpy as np
class iperf_json_parser:
    """Parse iperf JSON reports and total their sender-side throughput.

    Attributes:
        jsonobjs: the decoded JSON object of every report, in input order.
        aggregate_egress_bps: sum of end.sum_sent.bits_per_second over all reports.
    """

    def __init__(self, inputs):
        self.aggregate_egress_bps = 0
        self.jsonobjs = []
        for text in inputs:
            report = json.loads(text)
            self.jsonobjs.append(report)
            self.aggregate_egress_bps += report['end']['sum_sent']['bits_per_second']
class memloadgen_parser:
    """Average the per-line integer samples of memloadgen output.

    Args:
        input: raw output text, one integer per line.
        min: index of the first line to include.
        max: index one past the last line to include; clamped to the number
            of lines.

    Attributes:
        bps: mean of the selected samples.

    Raises:
        Exception: when the text has no line at index `min`, or the selected
            range contains no sample.
        ValueError: when a selected non-blank line is not an integer.
    """

    def __init__(self, input, min, max):
        lines = input.split('\n')
        if max > len(lines):
            max = len(lines)
        if len(lines) <= min:
            raise Exception("Not enough lines!")
        if min > max:
            min = max
        arr = []
        for i in range(min, max):
            # Text ending in a newline yields a trailing empty entry after
            # split('\n'); skip blanks instead of failing in int('').
            if lines[i].strip():
                arr.append(int(lines[i]))
        if len(arr) == 0:
            raise Exception("Not enough lines!")
        self.bps = np.mean(arr)
class pmc_parser:
    """Parse counting-mode PMC text output.

    The first line is a spec line starting with '#', followed by one
    "a/b/counter" field per core. The last non-blank line holds the per-core
    counts, which are summed.

    Attributes:
        raw: the unmodified input text.
        cores: number of fields after '#' on the spec line.
        counter: third '/'-separated element of the first spec field.
        count: sum of the integers on the last non-blank data line.

    Raises:
        Exception: when the text has fewer than two lines or the spec line
            is malformed.
    """

    def __init__(self, input):
        self.raw = input
        lines = input.split('\n')
        if len(lines) < 2:
            raise Exception("Invalid pmc file format")

        spec = lines[0].strip()
        # startswith() also rejects an empty first line, which spec[0] would
        # turn into an IndexError
        if not spec.startswith('#'):
            raise Exception("Invalid pmc file spec line: \"" + lines[0] + "\"")
        spec = spec.split()
        if len(spec) < 2:
            raise Exception("Invalid pmc file spec line: \"" + lines[0] + "\"")
        self.cores = len(spec) - 1

        elements = spec[1].split('/')
        if (len(elements) != 3):
            raise Exception("Invalid pmc file spec line: \"" + lines[0] + "\"")
        self.counter = elements[2].strip()

        # Text ending in a newline makes lines[-1] empty, which used to give
        # a count of 0; use the last line that actually has content.
        last_line = ""
        for line in reversed(lines[1:]):
            if line.strip():
                last_line = line
                break

        total = 0
        for e in last_line.split():
            total += int(e)
        self.count = total
class khat_parser:
    """Parser for khat sample output.

    The first line is a 5-field headline "qps,master_recv,master_loss,
    slave_recv,slave_loss"; every following line is an 8-field data point of
    client/server software/hardware rx/tx timestamps.
    """

    class pt:
        """One data point: the eight timestamps of a request."""

        def __init__(self):
            self.s_htx = self.s_hrx = 0
            self.s_stx = self.s_srx = 0
            self.c_htx = self.c_hrx = 0
            self.c_stx = self.c_srx = 0
            self.master_total = 0
            self.master_loss = 0
            self.slave_total = 0
            self.slave_loss = 0
            self.qps = 0

    def __init__(self):
        self.datapt = []
        self.srv_hwlat = []
        self.srv_swlat = []
        self.clt_hwlat = []
        self.clt_swlat = []
        # index -> latency list; holds the same list objects as above
        self.lat_idx_arr = [self.srv_hwlat, self.srv_swlat, self.clt_hwlat, self.clt_swlat]

    def get_stat_arr(self, idx : int):
        """Return the latency list for `idx` (0 srv hw, 1 srv sw, 2 clt hw, 3 clt sw)."""
        return self.lat_idx_arr[idx]

    def parse(self, output : str):
        """Parse `output`, filling the headline fields, `datapt` and the latency lists.

        Raises:
            Exception: on a headline without 5 fields or a data line without 8.
        """
        headline_done = False
        for row in output.splitlines():
            fields = row.split(',')

            if not headline_done:
                # the first line is qps
                if len(fields) != 5:
                    raise Exception("Invalid headline:" + row)
                self.qps = int(fields[0])
                self.master_recv = int(fields[1])
                self.master_loss = int(fields[2])
                self.slave_recv = int(fields[3])
                self.slave_loss = int(fields[4])
                headline_done = True
                continue

            if len(fields) != 8:
                raise Exception("Invalid line:" + row)

            values = [int(field) for field in fields]
            point = self.pt()
            (point.c_srx, point.c_stx, point.c_hrx, point.c_htx,
             point.s_srx, point.s_stx, point.s_hrx, point.s_htx) = values
            self.datapt.append(point)

            self.srv_hwlat.append(point.s_htx - point.s_hrx)
            self.srv_swlat.append(point.s_stx - point.s_srx)
            self.clt_hwlat.append(point.c_hrx - point.c_htx)
            self.clt_swlat.append(point.c_srx - point.c_stx)
class mutilate_data:
    """Container and helpers for mutilate-style latency summaries.

    Attributes:
        dat: mapping of statistic name ("avg", "std", "min", "5th", ...) to value.
        qps: total throughput.
    """

    def __init__(self):
        self.dat = {}
        self.qps = 0

    def to_string(self):
        """Return "Throughput: <qps>" followed by the statistics as JSON."""
        ret = "Throughput: " + str(self.qps) + "\n" + json.dumps(self.dat)
        return ret

    @staticmethod
    def parse_mut_output(output):
        """Parse mutilate text output into a mutilate_data object.

        Reads the "Total QPS" line (7 whitespace-separated tokens) and the
        "read" line (10 tokens, or 9 for the legacy format without "50th").
        Parsing stops at the first such line with an unexpected token count.

        Raises:
            Exception: when either the QPS or the read line was not parsed.
        """
        ret = mutilate_data()
        succ_qps = False
        succ_read = False
        table = [None, "avg", "std", "min", "5th", "10th", "50th", "90th", "95th", "99th"]
        table_legacy = [None, "avg", "std", "min", "5th", "10th", "90th", "95th", "99th"]
        for line in output.splitlines():
            if line.find("Total QPS") != -1:
                spl = line.split()
                if len(spl) == 7:
                    ret.qps = float(spl[3])
                    succ_qps = True
                else:
                    break
            elif line.find("read") != -1:
                spl = line.split()
                if len(spl) == 10:
                    for i in range(1, len(spl)):
                        ret.dat[table[i]] = float(spl[i])
                    succ_read = True
                elif len(spl) == 9:
                    for i in range(1, len(spl)):
                        ret.dat[table_legacy[i]] = float(spl[i])
                    succ_read = True
                else:
                    break
        if not (succ_qps and succ_read):
            raise Exception("Failed to parse data")
        return ret

    @staticmethod
    def parse_mut_sample(fn):
        """Read a two-column "<qps> <latency>" sample file.

        Returns:
            (qps, lat): two parallel lists of floats.

        Raises:
            Exception: on a line that does not have exactly two fields.
        """
        qps = []
        lat = []
        # `with` closes the file even when a malformed line raises
        with open(fn, "r") as f:
            for line in f.readlines():
                entry = line.split()
                if len(entry) != 2:
                    raise Exception("Unrecognized line: " + line)
                qps.append(float(entry[0]))
                lat.append(float(entry[1]))
        return qps, lat

    # generate mutilate output format
    @staticmethod
    def build_mut_output(lat_arr, qps_arr):
        """Format latency statistics in the text layout parse_mut_output() reads.

        Args:
            lat_arr: latency samples to summarize.
            qps_arr: QPS values; their mean is reported as "Total QPS".

        Returns:
            A header row, a "read" row with avg/std/min and percentiles, and a
            "Total QPS" line.
        """
        output = '{0: <10}'.format('#type') + '{0: >10}'.format('avg') + '{0: >10}'.format('std') + \
                 '{0: >10}'.format('min') + '{0: >10}'.format('5th') + '{0: >10}'.format('10th') + \
                 '{0: >10}'.format('50th') + '{0: >10}'.format('90th') + '{0: >10}'.format('95th') + '{0: >10}'.format('99th') + "\n"

        # No trailing line continuation after the last term: a dangling
        # backslash only parses while a blank line happens to follow it.
        output += '{0: <10}'.format('read') + '{0: >10}'.format("{:.1f}".format(np.mean(lat_arr))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.std(lat_arr))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.min(lat_arr))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 5))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 10))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 50))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 90))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 95))) + ' ' + \
                  '{0: >10}'.format("{:.1f}".format(np.percentile(lat_arr, 99))) + ' ' + "\n"

        output += "\n" + "Total QPS = " + "{:.1f}".format(np.mean(qps_arr)) + " (0 / 0s)"
        return output

189
scripts/libs/libtc.py Normal file
View File

@ -0,0 +1,189 @@
import subprocess as sp
import time
import select
import os
import pwd
import sys
import datetime
import random
import re
from threading import Thread
# Open log file, or None until init() has been called.
tc_logfile = None

def log_print(info):
    """Print `info` and, once init() has opened the log file, append it there too."""
    print(info)
    if tc_logfile != None:
        tc_logfile.write(info + "\n")
        tc_logfile.flush()

# Root output directory, name of the running test, and count of tests begun.
tc_output_dir=""
tc_cur_test = ""
tc_test_id = 0

def init(odir = "./results.d/"):
    """Create the output directory (with '~' expanded) and open "log.txt" inside it.

    Args:
        odir: root directory for all results of this run.
    """
    global tc_output_dir
    tc_output_dir = odir
    tc_output_dir = os.path.expanduser(tc_output_dir)
    # os.makedirs instead of shelling out to "mkdir -p": no word splitting on
    # spaces or shell metacharacters, and a failure raises instead of being ignored
    os.makedirs(tc_output_dir, exist_ok=True)
    global tc_logfile
    tc_logfile = open(tc_output_dir + "/log.txt", "w+")

def begin(name):
    """Start test `name`: bump the test counter and create its output directory."""
    global tc_test_id
    global tc_cur_test
    tc_cur_test = name
    tc_test_id += 1
    os.makedirs(get_odir(), exist_ok=True)
    log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " started =====")

def end():
    """Log completion of the current test and clear its name."""
    global tc_cur_test
    log_print("\n===== Test #" + str(tc_test_id) + " - " + tc_cur_test + " completed =====")
    tc_cur_test = ""

def get_odir():
    """Return "<output dir>/<current test name>"."""
    return tc_output_dir + "/" + tc_cur_test
# scheduler selectors
SCHED_QUEUE = 1
SCHED_CPU = 2
SCHED_BEST = 4
# scheduler feature selector
SCHED_FEAT_WS = 1

def make_sched_flag(sched, args, feat = 0, fargs = 0):
    """Pack four 8-bit fields into one integer: sched (bits 0-7), args (8-15), feat (16-23), fargs (24-31)."""
    flag = sched & 0xFF
    flag |= (args & 0xFF) << 8
    flag |= (feat & 0xFF) << 16
    flag |= (fargs & 0xFF) << 24
    return flag
# tunable object selectors
TUNE_RTSHARE = 2
TUNE_TFREQ = 1

def make_tune_flag(obj, val):
    """Pack two 16-bit fields into one integer: obj (bits 0-15) and val (bits 16-31)."""
    low = obj & 0xFFFF
    high = (val & 0xFFFF) << 16
    return low | high
def get_username():
    """Return the login name of the user running this process."""
    uid = os.getuid()
    entry = pwd.getpwuid(uid)
    return entry[0]
# Extra command-line options inserted into every ssh invocation.
ssh_param = ""

def set_ssh_param(para):
    """Set the extra ssh command-line options."""
    global ssh_param
    ssh_param = para

def get_ssh_param():
    """Return the extra ssh command-line options."""
    return ssh_param

# Remote login name, or None to let ssh pick its default.
ssh_user = None

def set_ssh_user(user):
    """Set the remote login name."""
    global ssh_user
    ssh_user = user

def get_ssh_user():
    """Return the remote login name (None when unset)."""
    return ssh_user
def remote_exec(srv : list[str], cmd : str, blocking=True, check=True) -> sp.Popen:
    """Start `cmd` over ssh on every host in `srv`.

    Args:
        srv: host names to run on; one ssh process is started per host.
        cmd: remote command line, wrapped in double quotes.
        blocking: wait for every process to exit before returning.
        check: when blocking, raise if a process exits with a non-zero code.

    Returns:
        The list of started Popen objects, in `srv` order.

    Raises:
        Exception: when `blocking` and `check` are set and a command fails.
    """
    login = (ssh_user + "@") if ssh_user != None else ""
    procs = []
    for host in srv:
        ssh_cmd = "ssh " + ssh_param + " " + login + host + " \"" + cmd +"\""
        procs.append(sp.Popen([ssh_cmd], shell=True, stdout=sp.PIPE, stderr=sp.PIPE))
    if blocking:
        for proc in procs:
            proc.wait()
            if check and proc.returncode != 0:
                raise Exception("Command failed " + cmd)
    return procs
def check_stderr(p, sel, exclude = None):# -> tuple[bool, list[str]]:
    """Drain up to 10 pending stderr lines of `p` and classify them.

    Args:
        p: process object whose `stderr` supports readline().
        sel: poll object with p.stderr registered; poll(1) is used to detect
            pending data.
        exclude: substrings marking a line as harmless; None means no
            exclusions.

    Returns:
        (good, err): `good` is False when any non-empty line contains none of
        the `exclude` substrings; `err` is the list of stripped lines read.
    """
    # callers may pass None explicitly (errthr_create's default), which must
    # not crash the exclusion scan
    if exclude is None:
        exclude = []

    max_stderr_rd = 10
    err = []
    while sel.poll(1) and max_stderr_rd > 0:
        err.append(p.stderr.readline().decode().strip())
        max_stderr_rd = max_stderr_rd - 1

    good = True
    for e in err:
        e = e.strip()
        if len(e) == 0:
            continue
        # One unexcluded line is enough to fail. A later excluded line must
        # not flip the verdict back to good.
        if not any(exc in e for exc in exclude):
            good = False
    return good, err
# stderr threads
# Watcher threads, their stop request flag, and the shared failure flag.
errthr_objs = []
errthr_sigstop = False
errthr_failed = False

def errthr_get_failed():
    """Return True once any watcher thread has seen an unexpected stderr line."""
    return errthr_failed

def thr_check_stderr(p : sp.Popen, name: str, exclude):
    """Thread body: poll `p`'s stderr until asked to stop.

    On the first failing check the shared failure flag is set and the lines
    are logged under `name`; after that the thread only sleeps until stopped.
    """
    global errthr_failed
    poller = select.poll()
    poller.register(p.stderr, select.POLLIN)
    already_failed = False
    while not errthr_sigstop:
        if not already_failed:
            ok, lines = check_stderr(p, poller, exclude=exclude)
            if not ok:
                errthr_failed = True
                already_failed = True
                log_print("Error detected in \"" + name + "\":")
                for each in lines:
                    log_print(" \"" + each + "\"")
                log_print("")
        time.sleep(random.uniform(0.001, 0.1))

def errthr_start():
    """Reset the stop and failure flags and start every created watcher as a daemon thread."""
    global errthr_sigstop
    global errthr_failed
    errthr_sigstop = False
    errthr_failed = False
    for watcher in errthr_objs:
        watcher.daemon = True
        watcher.start()

def errthr_create(cp, name, exclude = None):
    """Create (but do not start) one watcher per process in `cp`, named by the parallel list `name`."""
    for proc, proc_name in zip(cp, name[:len(cp)]):
        errthr_objs.append(Thread(target = thr_check_stderr, args=(proc, proc_name, exclude)))

def errthr_stop():
    """Ask every watcher to stop, wait for them, and forget them."""
    global errthr_sigstop
    errthr_sigstop = True
    for watcher in errthr_objs:
        watcher.join()
    errthr_objs.clear()
def parse_hostfile(fp):
    """Read a host file of "<name> <address>" lines into a dict.

    Lines are stripped and split on single spaces; a line with fewer than two
    fields is ignored, and every accepted mapping is logged.

    Returns:
        dict mapping the first field to the second field of each line.
    """
    with open(fp, "r") as fh:
        rows = [raw.strip() for raw in fh.readlines()]
    mapping = {}
    for row in rows:
        fields = row.split(" ")
        if len(fields) < 2:
            continue
        mapping[fields[0]] = fields[1]
        log_print("Parsed: hostname \"" + fields[0] + "\" -> \"" + fields[1] + "\"")
    return mapping
def process_hostnames(names, hosts):
    """Translate each name through `hosts`, leaving unknown names unchanged."""
    return [hosts[name] if name in hosts else name for name in names]
def get_cpuset_core(threads):
    """Return a "cpuset -l 0-<last> " command prefix covering 2 * threads cores."""
    last_core = threads * 2 - 1
    return "cpuset -l 0-" + str(last_core) + " "

340
scripts/netexp.py Normal file
View File

@ -0,0 +1,340 @@
import time
import subprocess as sp
import os
import libpar as par
import libtc as tc
import libmechspec as mechspec
class NetExpResult:
    """Result of one experiment run; every field starts as None."""

    def __init__(self):
        # raw sample text, its parsed form, and the PMC data (if collected)
        self.sample = None
        self.parser = None
        self.pmc_parser = None
class NetExpConf:
    """Configuration of one network experiment (server, clients, master, PMC)."""

    def __init__(self):
        # general
        self.root_dir = ""
        self.enable_client_only = False

        # memory load generator
        self.enable_memgen = False
        self.memgen_affinity = ""
        self.memgen_iteration = -1
        self.memgen_size = 512 * 1024 * 1024
        self.memgen_tgtdom = 1

        # server
        self.srv_affinity = ""
        self.srv_mechspec = None
        self.srv_port = 0

        # clients
        self.clt_qps = 0
        self.clt_mechspecs = []
        self.clt_affinity = "1"
        self.clt_wrkld = 0
        self.clt_wrkarg0 = "fixed:0"
        self.clt_wrkarg1 = "fixed:0"
        self.clt_pkt_loss_lat = 1000
        self.clt_rage_quit_lat = 1000
        self.clt_port = 0
        self.clt_pkt_pad = 0
        self.clt_pkt_depth = 1
        self.clt_ia = "exponential"

        # master
        self.mst_mechspec = None
        self.mst_affinity = "2"
        self.mst_qps = 100
        self.mst_port = 0
        self.mst_pkt_loss_lat = 1000
        self.mst_pkt_loss_max = 1000
        self.mst_duration = 10
        self.mst_warmup = 5
        self.mst_ia = "exponential"

        # performance counters
        self.enable_pmc = False
        self.pmc_counters = []
        self.pmc_mode = 0 # 0 = sampling
        self.pmc_sampling_rate = 8192
        self.pmc_counting_interval = 0.1

    def __build_fqdn_arr(self, ns):
        """Return the fqdn of every entry in `ns` that is not None."""
        return [spec.fqdn for spec in ns if spec != None]

    def get_pmc_str(self):
        """Return the PMC counter names joined by commas."""
        return ",".join(self.pmc_counters)

    def calc_client_qps(self):
        """Return the per-client QPS share, or 0 when clt_qps is 0.

        The share is (clt_qps - mst_qps) divided evenly among the clients,
        truncated to an integer.
        """
        if self.clt_qps == 0:
            return 0
        return int((self.clt_qps - self.mst_qps) / len(self.clt_mechspecs))

    def finalize_mechspecs(self):
        """Derive the client, server and master fqdn lists from the configured specs."""
        self.clt_fqdns = self.__build_fqdn_arr(self.clt_mechspecs)
        self.srv_fqdns = self.__build_fqdn_arr([self.srv_mechspec])
        self.mst_fqdns = self.__build_fqdn_arr([self.mst_mechspec])
__SAMPLE_FN = "sample.txt.tmp"
__PMC_FN = "pmc.txt.tmp"
def __keep_result(conf : NetExpConf):
    """Copy the run's output files back from the remote hosts and load them.

    The sample file is fetched from the master and parsed with par.khat_parser().
    When PMC is enabled the pmcstat output is fetched from the server; in
    sampling mode (pmc_mode == 0) it is first post-processed remotely with
    ``pmcstat -R ... -m`` and both the raw and processed files are kept as
    ``[raw_bytes, processed_text]``. Local copies are removed afterwards.
    Returns the populated NetExpResult.
    """
    def fetch(host, remote_name, local_path):
        # scp one file from conf.root_dir on `host` to `local_path`
        cmd = "scp -P77 " + tc.get_ssh_user() + "@" + host + ":" + conf.root_dir + "/" + remote_name + " " + local_path
        tc.log_print(cmd)
        sp.check_call(cmd, shell=True)

    def discard(local_path):
        # delete a local temporary copy
        cmd = "rm " + local_path
        tc.log_print(cmd)
        sp.check_call(cmd, shell=True)

    result = NetExpResult()

    # latency sample produced by the master
    sample_local = tc.get_odir() + "/" + __SAMPLE_FN
    fetch(conf.mst_mechspec.fqdn, __SAMPLE_FN, sample_local)
    result.parser = par.khat_parser()
    with open(sample_local, "r") as sample_file:
        result.sample = sample_file.read()
    result.parser.parse(result.sample)
    discard(sample_local)

    if not conf.enable_pmc:
        return result

    # PMC output produced on the server
    pmc_local = tc.get_odir() + "/" + __PMC_FN
    fetch(conf.srv_mechspec.fqdn, __PMC_FN, pmc_local)
    sampling = conf.pmc_mode == 0
    if sampling:
        # turn the sampling log into a processed file on the server, then fetch it too
        proc_cmd = "sudo pmcstat -R " + conf.root_dir + "/" + __PMC_FN + " -m " + conf.root_dir + "/" + __PMC_FN + ".proc"
        tc.log_print(proc_cmd)
        tc.remote_exec(conf.srv_fqdns, proc_cmd)
        fetch(conf.srv_mechspec.fqdn, __PMC_FN + ".proc", pmc_local + ".proc")

    if sampling:
        with open(pmc_local, "rb") as raw_file:
            with open(pmc_local + ".proc", "r") as proc_file:
                result.pmc_parser = [raw_file.read(), proc_file.read()]
        discard(pmc_local + ".proc")
    else:
        with open(pmc_local, "r") as counting_file:
            result.pmc_parser = par.pmc_parser(counting_file.read())
    discard(pmc_local)
    return result
def stop_all(conf : NetExpConf):
    """Kill every experiment process on the clients, the master and (unless client-only) the server.

    All kills are best-effort (check=False). pmcstat is killed on the server
    hosts as well when PMC is enabled.
    """
    kill_cmd = "sudo killall -9 rat; sudo killall -9 cat; sudo killall -9 khat; sudo killall -9 memloadgen"

    # stop clients
    tc.log_print("Stopping clients...")
    tc.remote_exec(conf.clt_fqdns, kill_cmd, check=False)

    # stop master
    tc.log_print("Stopping master...")
    tc.remote_exec(conf.mst_fqdns, kill_cmd, check=False)

    # stop server
    if not conf.enable_client_only:
        tc.log_print("Stopping server...")
        tc.remote_exec(conf.srv_fqdns, kill_cmd, check=False)

    if conf.enable_pmc:
        tc.log_print("Stopping server PMC...")
        tc.remote_exec(conf.srv_fqdns, "sudo killall -9 pmcstat", check=False)
def __run_setup_cmd(conf : NetExpConf, cmd : str, desc : str):
    """Run ``cmd`` on the server, client and master hosts concurrently and report each outcome.

    The command is started on every host without blocking; afterwards each
    process is waited on in launch order and a success/failure line (with
    stderr on failure) is printed. ``desc`` is only used in the messages.
    """
    hosts = list(conf.srv_fqdns) + list(conf.clt_fqdns) + list(conf.mst_fqdns)

    launched = []
    for host in hosts:
        tc.log_print(f"Running \'{desc}\' on {host}...")
        proc = tc.remote_exec([host], cmd, blocking=False, check=False)[0]
        launched.append((host, proc))

    for host, proc in launched:
        _ , stderr = proc.communicate()
        if proc.returncode == 0:
            print(f"{ host } \'{desc}\' succeeded")
        else:
            print(f"{ host } \'{desc}\' failed. stderr:\n{stderr.decode()}\n")
def setup(conf : NetExpConf, bench : bool = False, dpdk : bool = False):
    """Provision the experiment hosts.

    Args:
        conf: configuration whose srv/clt/mst fqdn lists name the hosts to set up.
        bench: when true, wipe the benchmark directory on every host, rsync this
            repository to it and build khat, cat, rat and memloadgen there.
        dpdk: when true, rebuild and install libtopo and the numam-dpdk tree
            on every host.

    Failures of the remote setup commands are reported by __run_setup_cmd but
    do not raise; a failing rsync raises subprocess.CalledProcessError.
    """
    libtopo_path = "/libtopo"
    dpdk_path = "/dpdk"
    bench_path = "/numam.d"

    if dpdk:
        setup_cmd = "\n".join([
            f"sudo rm -rf {libtopo_path}; sudo rm -rf /usr/local/include/libtopo;",
            "sudo rm -rf /usr/local/lib/libtopo;",
            f"sudo mkdir -p {libtopo_path};",
            f"sudo chmod 777 {libtopo_path};",
            f"cd {libtopo_path};",
            "git clone https://git.quacker.org/d/libtopo;",
            "cd libtopo;",
            "mkdir build;",
            "cd build;",
            "cmake ../;",
            "sudo make install",
        ])
        __run_setup_cmd(conf, setup_cmd, "dpdk - libtopo")

        setup_cmd = "\n".join([
            "sudo pkg install -y meson pkgconf py39-pyelftools;",
            # terminated with ';' like every other step so the command does not
            # depend on the newline surviving the remote shell invocation
            f"sudo rm -rf {dpdk_path};",
            f"sudo mkdir -p {dpdk_path};",
            f"sudo chmod 777 {dpdk_path};",
            f"cd {dpdk_path};",
            "git clone https://git.quacker.org/d/numam-dpdk;",
            "cd numam-dpdk;",
            "git checkout migration;",
            "CC=gcc CXX=g++ meson -Denable_kmods=true build;",
            "cd build;",
            "sudo ninja install",
        ])
        __run_setup_cmd(conf, setup_cmd, "dpdk - dpdk")

    if bench:
        setup_cmd = "\n".join([
            f"sudo rm -rf {bench_path};",
            f"sudo mkdir -p {bench_path};",
            f"sudo chmod 777 {bench_path}",
        ])
        __run_setup_cmd(conf, setup_cmd, "bench - remove")

        hosts = list(conf.srv_fqdns) + list(conf.clt_fqdns) + list(conf.mst_fqdns)
        # repository root: the parent of the directory containing this script
        src_dir = f"{os.path.dirname(__file__)}/../"
        for host in hosts:
            print("Syncing files to " + host + "...")
            rsync_cmd = f"rsync -az --no-perms --rsync-path=\"sudo rsync\" --omit-dir-times -e \"ssh -p77\" {src_dir} {tc.get_ssh_user()}@{host}:{bench_path}/"
            sp.check_call(rsync_cmd, shell=True)

        setup_cmd = "\n".join([
            f"cd {bench_path};",
            "sudo rm -rf build;",
            "mkdir build;",
            "cd build;",
            "cmake ../;",
            "make -j8 khat cat rat memloadgen",
        ])
        __run_setup_cmd(conf, setup_cmd, "bench - compile")
def run(conf : NetExpConf):
    """Run one experiment and return its NetExpResult, retrying until a run succeeds.

    Each attempt starts (in order) pmcstat on the server if enabled, the khat
    server unless client-only, memloadgen if enabled, one rat client per
    client host and finally the cat master. The attempt succeeds when the
    master process exits before a monitored stderr failure is reported or
    3 * (warmup + duration) seconds elapse. After every attempt all processes
    are stopped; on failure the whole sequence is repeated.
    """
    stop_all(conf)
    while True:
        server_cmd = "sudo "
        if conf.enable_pmc:
            if conf.pmc_mode != 0:
                # counting mode
                pmc_cmd = "sudo pmcstat -C -w " + str(conf.pmc_counting_interval) + " -s " + conf.get_pmc_str() + " -o " + conf.root_dir + "/" + __PMC_FN
            else:
                # sampling mode
                pmc_cmd = "sudo pmcstat -n " + str(conf.pmc_sampling_rate) + " -S " + conf.get_pmc_str() + " -O " + conf.root_dir + "/" + __PMC_FN
            tc.log_print("Starting server PMC...")
            tc.log_print(pmc_cmd)
            spmc = tc.remote_exec(conf.srv_fqdns, pmc_cmd, blocking=False)

        server_cmd += conf.root_dir + "/khat --log-level lib.eal:err -- -A " + conf.srv_affinity + \
            " -H " + conf.srv_mechspec.netspec + " -p " + str(conf.srv_port)
        # -J is added on both ends once the padding exceeds 1518 bytes
        if int(conf.clt_pkt_pad) > 1518:
            server_cmd += " -J "
        if conf.enable_client_only:
            # the server is managed externally; only log the command it needs
            ssrv = None
            tc.log_print(server_cmd)
        else:
            # start server
            tc.log_print("Starting server...")
            tc.log_print(server_cmd)
            ssrv = tc.remote_exec(conf.srv_fqdns, server_cmd, blocking=False)

        if conf.enable_memgen:
            memgen_cmd = "sudo " + conf.root_dir + "/memloadgen -b " + str(conf.memgen_size) + " -s " + conf.memgen_affinity + \
                " -i " + str(conf.memgen_iteration) + " -d " + str(conf.memgen_tgtdom)
            tc.log_print("Starting memloadgen...")
            tc.log_print(memgen_cmd)
            smem = tc.remote_exec(conf.srv_fqdns, memgen_cmd, blocking=False)

        # start clients
        tc.log_print("Starting clients...")
        sclt = []
        sclt_name = []
        for i in range(len(conf.clt_fqdns)):
            client_cmd = "sudo " + conf.root_dir + "/rat --log-level lib.eal:err -- -S -A " + conf.clt_affinity + \
                " -i " + conf.clt_ia + \
                " -q " + str(conf.calc_client_qps()) + \
                " -H " + conf.clt_mechspecs[i].netspec + \
                " -s " + conf.srv_mechspec.netspec + \
                " -r " + str(conf.clt_rage_quit_lat) + \
                " -l " + str(conf.clt_pkt_loss_lat) + \
                " -w " + str(conf.clt_wrkld) + \
                " -w " + str(conf.clt_wrkarg0) + \
                " -w " + str(conf.clt_wrkarg1) + \
                " -P " + str(conf.clt_pkt_pad) + \
                " -D " + str(conf.clt_pkt_depth) + \
                " -p " + str(conf.clt_port)
            if int(conf.clt_pkt_pad) > 1518:
                client_cmd += " -J "
            tc.log_print(client_cmd)
            sclt.append(tc.remote_exec([conf.clt_fqdns[i]], client_cmd, blocking=False)[0])
            sclt_name.append(conf.clt_fqdns[i])

        # give the clients time to come up before the master starts measuring
        time.sleep(5)

        # start master
        tc.log_print("Starting master...")
        master_cmd = "sudo " + conf.root_dir + "/cat --log-level lib.eal:err -- " + \
            " -s " + conf.srv_mechspec.netspec + \
            " -o " + conf.root_dir + "/" + __SAMPLE_FN + \
            " -t " + str(conf.mst_duration) + \
            " -T " + str(conf.mst_warmup) + \
            " -i " + conf.mst_ia + \
            " -q " + str(conf.mst_qps) + \
            " -l " + str(conf.mst_pkt_loss_lat) + \
            " -L " + str(conf.mst_pkt_loss_max) + \
            " -A " + conf.mst_affinity + \
            " -H " + conf.mst_mechspec.netspec + \
            " -p " + str(conf.mst_port)
        for clt in conf.clt_mechspecs:
            master_cmd += " -S " + clt.netspec
        tc.log_print(master_cmd)
        # named so it does not shadow the module-level subprocess alias `sp`
        master_procs = tc.remote_exec(conf.mst_fqdns, master_cmd, blocking=False)
        p = master_procs[0]

        # launch stderr monitoring thread
        exclude = ["Pseudo-terminal", "ice_", "i40e_"]
        tc.errthr_create([p], conf.mst_fqdns, exclude)
        if not conf.enable_client_only:
            tc.errthr_create(ssrv, conf.srv_fqdns, exclude)
        tc.errthr_create(sclt, sclt_name, exclude)
        if conf.enable_memgen:
            tc.errthr_create(smem, ["memloadgen"], exclude)
        if conf.enable_pmc:
            tc.errthr_create(spmc, ["pmcstat"], exclude)
        tc.errthr_start()

        success = False
        cur = 0
        while True:
            # either failed or timeout
            # we use failure detection to save time for long durations
            if tc.errthr_get_failed() or cur >= (conf.mst_warmup + conf.mst_duration) * 3:
                break
            # the master exiting on its own marks the end of a successful run
            if p.poll() is not None:
                success = True
                break
            time.sleep(1)
            cur = cur + 1

        stop_all(conf)
        tc.errthr_stop()
        tc.log_print("Cooling down...")
        time.sleep(5)
        if success:
            return __keep_result(conf)

112
scripts/storage/parse.py Normal file
View File

@ -0,0 +1,112 @@
#!/usr/bin/env python3.6
import numpy as np
import sys
import re
import os
import json
import getopt
import math
import concurrent.futures as CF
# Output columns of results.csv, one tuple per metric:
# (column title, name of the DatObj attribute holding the value, format spec passed to format()).
columns = [
("Req per second", "rps", ".2f"),
("Bytes per second", "bps", ".2f"),
("Average Latency", "lat_avg", ".2f"),
("50th Latency", "lat_50", ".0f"),
("95th Latency", "lat_95", ".0f"),
("99th Latency", "lat_99", ".0f"),
("Latency stddev", "lat_std", ".2f")
]
# Duration by which the number of recorded requests is divided to obtain requests per second.
TIME = 30
# Request size multiplied with requests per second to obtain bytes per second.
REQ_SZ = 4096
class DatObj:
    """Summary statistics for one benchmark's list of latency values.

    Attributes: raw (the list as passed in), rps (len(raw) / time),
    bps (rps * req_sz), lat_avg, lat_std, and the lat_50 / lat_95 / lat_99
    percentiles.
    """
    def __init__(self, raw : list, time : int, req_sz : int):
        self.raw = raw

        # throughput
        self.rps = len(raw) / time
        self.bps = self.rps * req_sz

        # latency distribution
        self.lat_50 = np.percentile(self.raw, 50)
        self.lat_95 = np.percentile(self.raw, 95)
        self.lat_99 = np.percentile(self.raw, 99)
        self.lat_avg = np.average(self.raw)
        self.lat_std = np.std(self.raw)
def parse_file(lines : list, time : int, req_sz : int) -> DatObj :
    """Build a DatObj from text lines that each hold one integer.

    Lines that are empty or contain only whitespace (for example a trailing
    "\\n" as returned by readlines()) are skipped; any other line must parse
    as an int or ValueError is raised.
    """
    raw = []
    for line in lines:
        entry = line.strip()
        # len(line) > 0 would let a bare "\n" through to int() and raise
        if entry:
            raw.append(int(entry))
    return DatObj(raw, time, req_sz)
def output_col():
    """Return the CSV header line: "Benchmark" followed by three cells per metric.

    For each entry of ``columns`` the cells are the title, the title with
    " (NUMA)" appended, and "% change".
    """
    cells = ["Benchmark"]
    for title, _, _ in columns:
        cells.extend([title, title + " (NUMA)", "% change"])
    return ",".join(cells)
def get_attr_or_none(obj, attr):
    """Return ``getattr(obj, attr)``, or None when ``obj`` itself is None.

    Uses an identity check so objects that override ``__eq__``/``__ne__`` are
    never mistaken for None. Raises AttributeError if a non-None ``obj`` lacks
    the attribute.
    """
    if obj is None:
        return None
    return getattr(obj, attr)
def output_objs(name: str, obj : DatObj, obj_numa : DatObj):
    """Return one CSV row comparing a benchmark with its NUMA counterpart.

    The row starts with ``name``; for every metric in ``columns`` it then holds
    the baseline value, the NUMA value and the relative change in percent.
    A cell is "N/A" when its object is None; the change is also "N/A" when the
    baseline value is 0, because the relative change is undefined there.
    """
    cells = [name]
    for _, attr, fmt in columns:
        val = get_attr_or_none(obj, attr)
        val_numa = get_attr_or_none(obj_numa, attr)
        cells.append(format(val, fmt) if val is not None else "N/A")
        cells.append(format(val_numa, fmt) if val_numa is not None else "N/A")
        if val is None or val_numa is None or val == 0:
            cells.append("N/A")
        else:
            cells.append(format((val_numa - val) / val * 100, ".2f") + "%")
    return ",".join(cells)
def process_file(f : str, obj_map):
    """Parse the file at path ``f`` and store the result in ``obj_map`` under the file's base name."""
    bench_name = os.path.basename(f)
    with open(f, "r") as fp:
        lines = fp.readlines()
    obj_map[bench_name] = parse_file(lines, TIME, REQ_SZ)
    print("Processed file " + f + ". Benchmark name: " + bench_name)
def process_dir(path : str, obj_map):
    """Process every regular file directly inside ``path`` into ``obj_map``.

    Entries whose own name contains ".sh" are skipped. The filter looks at the
    entry name only, so a parent directory that happens to contain ".sh" no
    longer causes every file to be ignored. Subdirectories are not descended into.
    """
    for entry in os.listdir(path):
        if ".sh" in entry:
            continue
        full_path = os.path.abspath(os.path.join(path, entry))
        if os.path.isfile(full_path):
            process_file(full_path, obj_map)
def main():
    """Parse ``-d <dir>``, process every data file in it and write results.csv.

    Benchmarks whose name ends in "_numa" do not get their own row; they are
    looked up as the NUMA counterpart of the benchmark with the same name minus
    that suffix. Raises Exception when -d is missing.
    """
    datdir = None
    options = getopt.getopt(sys.argv[1:], 'd:')[0]
    for opt, arg in options:
        # ('-d') is just the string '-d', so `in` was a substring test
        if opt == '-d':
            datdir = arg
    if datdir is None:
        raise Exception("Must specify -d parameter")

    obj_map = dict()
    process_dir(datdir, obj_map)

    with open("results.csv", "w") as f:
        f.write(output_col())
        f.write("\n")
        for bench in obj_map:
            if bench.endswith("_numa"):
                continue
            f.write(output_objs(bench, obj_map[bench], obj_map.get(bench+"_numa")))
            f.write("\n")

if __name__ == "__main__":
    main()

View File

@ -0,0 +1,19 @@
# Benchmark runs of birb_posix against /dev/nvd0.
# Each workload is run twice; the second run differs only in the -a mask
# (0x555555000000 instead of 0x555555) and writes to "<name>_numa".
# NOTE(review): flag meanings below are taken from birb's usage text
# (-t run time, -w warm-up, -b request size, -Q queue depth, -o output file,
# -P "<address mode>,<read percent>"); presumably birb_posix matches - confirm.
# rand_read: -P R,100
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,100 -Q 3 -o rand_read_numa
# rand_write: -P R,0
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,0 -Q 3 -o rand_write_numa
# mono_read: -P M,100
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,100 -Q 3 -o mono_read_numa
# mono_write: -P M,0
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P M,0 -Q 3 -o mono_write_numa
# mixed: -P R,70 (output files are named mixed_read / mixed_read_numa)
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read
sudo /numam/code/build/birb_posix -m 0x2 -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D /dev/nvd0 -P R,70 -Q 3 -o mixed_read_numa

View File

@ -0,0 +1,19 @@
# Benchmark runs of birb with the bdev driver (-k bdev) on device Nvme0n1,
# using the configuration file /numam/nvme.json (-c).
# Each workload is run twice; the second run shifts both the -m and the -a
# masks left by 24 bits and writes to "<name>_numa".
# -a is the worker thread mask, -P is "<address mode>,<read percent>" where
# M selects monotonically increasing addresses and anything else uniform random.
# rand_read: random addresses, 100% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
# rand_write: random addresses, 0% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
# mono_read: monotonically increasing addresses, 100% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
# mono_write: monotonically increasing addresses, 0% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
# mixed: random addresses, 70% reads (output files are named mixed_read / mixed_read_numa)
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev

View File

@ -0,0 +1,19 @@
# Benchmark runs of birb with the bdev driver (-k bdev) on device Nvme0n1,
# using the configuration file /numam/nvme.json (-c).
# Each workload is run twice; the second run shifts both the -m and the -a
# masks left by 24 bits and writes to "<name>_numa".
# -a is the worker thread mask, -P is "<address mode>,<read percent>" where
# M selects monotonically increasing addresses and anything else uniform random.
# rand_read: random addresses, 100% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,100 -Q 3 -o rand_read_numa -k bdev
# rand_write: random addresses, 0% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,0 -Q 3 -o rand_write_numa -k bdev
# mono_read: monotonically increasing addresses, 100% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,100 -Q 3 -o mono_read_numa -k bdev
# mono_write: monotonically increasing addresses, 0% reads
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P M,0 -Q 3 -o mono_write_numa -k bdev
# mixed: random addresses, 70% reads (output files are named mixed_read / mixed_read_numa)
sudo /numam/code/build/birb -m 0xAAAAAA -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read -k bdev
sudo /numam/code/build/birb -m 0xAAAAAA000000 -c /numam/nvme.json -t 35 -w 5 -I fixed -a 0x555555000000 -b 4096 -q 0 -D Nvme0n1 -P R,70 -Q 3 -o mixed_read_numa -k bdev

797
storage/birb.cc Normal file
View File

@ -0,0 +1,797 @@
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <x86/_stdint.h>
#include <getopt.h>
#include <pthread.h>
#include <pthread_np.h>
#include <threads.h>
#include <unistd.h>
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <chrono>
#include <list>
#include <set>
#include "rte_lcore.h"
#include "spdk/cpuset.h"
#include "spdk/stdinc.h"
#include "spdk/thread.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/string.h"
#include "gen.hh"
#include "ntr.h"
#include "defs.hh"
#include "nm.hh"
#include "storage/io_gen.hh"
#include "storage/drivers/driver.hh"
#include "storage/drivers/bdev.hh"
#include "storage/drivers/nvme.hh"
/* Current reading of std::chrono::high_resolution_clock, in nanoseconds since that clock's epoch. */
static inline uint64_t get_cur_ts_nano()
{
    using namespace std::chrono;
    const auto since_epoch = high_resolution_clock::now().time_since_epoch();
    return duration_cast<nanoseconds>(since_epoch).count();
}
/*
 * Sizes of the fixed-length character buffers in options_t, followed by the
 * option, callback-state and per-thread structures shared by the main thread
 * and the worker threads.
 */
/* Buffer sizes for the pattern / inter-arrival spec strings, device and driver names, and output path. */
static constexpr unsigned long MAX_SPEC_LEN = 32;
static constexpr unsigned long MAX_DEV_NAME_LEN = 32;
static constexpr unsigned long MAX_OUTPUT_FILE_LEN = 256;
/* Program options with their defaults; filled in by parse_arg(). */
struct options_t {
// args
int verbosity = NTR_LEVEL_DEFAULT;
// number of worker threads; derived from cpumask when -a is parsed
int num_threads = 1;
// -a: bitmask of cores to run worker threads on
unsigned long cpumask = 1;
// -P: "<address mode>,<read percent>"; decoded into addr_mode / read_pct by parse_pattern()
char pattern_spec[MAX_SPEC_LEN] = "R,100";
// -I: inter-arrival time distribution spec
char ia_spec[MAX_SPEC_LEN] = "fixed";
// -t: total run time
unsigned int time = 5;
// -w: warm up time
unsigned int warmup = 2;
// -Q: number of io_request objects each worker allocates
unsigned int queue_depth = 1;
// -D: device name handed to the driver
char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";
// -k: driver name ("bdev" or "nvme")
char driver_name[MAX_DEV_NAME_LEN] = "bdev";
// decoded from pattern_spec
unsigned int read_pct = 0;
// decoded from pattern_spec
io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
// -o: latency response output file
char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";
// -b: IO request size
unsigned long req_size = 4096;
// -q: IO requests per second, divided across the worker threads
unsigned long rps = 0;
};
/* Callback state of the main thread: counters bumped by cb_notify_main_init / cb_notify_main_stop. */
struct main_thread_cb_vars {
// number of workers that reported finishing initialization
uint32_t worker_thread_init_cnt;
// number of workers that reported stopping
uint32_t worker_thread_stop_cnt;
};
/* Callback state of one worker thread. */
struct worker_thread_cb_vars {
// set to 1 by cb_notify_worker_start
uint32_t worker_start;
// set to 1 by cb_notify_worker_stop
uint32_t worker_stop;
// the worker's thread_context
struct thread_context * ctx;
// the worker's list of idle io_request objects; completed requests are pushed back here
std::list<struct io_request *> * free_ios;
};
/*
 * Per-thread callback state: the main thread points this at a
 * main_thread_cb_vars, each worker thread at its own worker_thread_cb_vars.
 */
static __thread void * cb_vars;
/* Process-wide options. */
static struct options_t options;
/* Timing of one successfully completed IO; both values come from get_cur_ts_nano(). */
struct io_record {
// time the request was issued
uint64_t start_ts;
// time the completion callback ran
uint64_t end_ts;
};
/* One reusable IO slot owned by a worker thread. */
struct io_request {
// get_cur_ts_nano() value taken when the request was issued
uint64_t start_ts;
// operation of the IO currently using this slot
io_generator_opcode op;
// buffer from nm_malloc(); completed reads are copied into it from dma_buf
char * user_buf;
// buffer from spdk_dma_zmalloc_socket() handed to the driver
char * dma_buf;
};
/* Per-worker state created by the main thread and passed to worker_thread_main(). */
struct thread_context {
// sequential worker index assigned by the main thread
unsigned int tid;
// core the worker is pinned to
unsigned int coreid;
// socket id of that core (rte_lcore_to_socket_id); used for buffer allocation
unsigned int sockid;
pthread_t sys_thread;
// SPDK thread of the main thread; target of the worker's notifications
struct spdk_thread * main_thread;
// driver instance shared by all workers
birb_driver * driver;
// offset and length of the device region this worker issues IO to
unsigned long start_region_offset;
unsigned long start_region_length;
/* modified by worker threads */
struct spdk_thread * sp_thread;
// one entry per successfully completed IO
std::list<io_record *> *io_records;
// statistics of the time between consecutive iterations of the worker loop
uint64_t overhead_avg;
uint32_t overhead_cnt;
uint64_t overhead_max;
uint64_t overhead_min;
};
/*
 * Log the effective options at INFO level.
 * Unsigned fields are printed with %u and the address-mode enum is cast to
 * int so every conversion specifier matches the type of its argument.
 */
static void dump_options()
{
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
        "        dev name: %s\n"
        "        driver name: %s\n"
        "        worker threads: 0x%lx\n"
        "        number of threads: %d\n"
        "        IO request size: %lu\n"
        "        IO requests per second: %lu\n"
        "        IO pattern: %s\n"
        "        IO queue depth: %u\n"
        "        IO addressing mode: %d\n"
        "        read percent: %u\n"
        "        inter-arrival dist: %s\n"
        "        run time: %u\n"
        "        warmup time: %u\n"
        "        output file: %s\n",
        options.dev_name,
        options.driver_name,
        options.cpumask,
        options.num_threads,
        options.req_size,
        options.rps,
        options.pattern_spec,
        options.queue_depth,
        (int)options.addr_mode,
        options.read_pct,
        options.ia_spec,
        options.time,
        options.warmup,
        options.output_file
    );
}
/* Print the list of application-specific command line options to stdout. */
static void usage()
{
    static const char help_text[] =
        "        -V(VV): verbose mode\n"
        "        -D: dev name\n"
        "        -k: driver to use (default bdev)\n"
        "        -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
        "        -b: IO request size\n"
        "        -q: IO requests per second\n"
        "        -P: IO request pattern\n"
        "        -Q: IO request queue depth\n"
        "        -I: inter-arrival time distribution\n"
        "        -t: total run time\n"
        "        -w: warm up time\n"
        "        -o: latency response output file\n";

    fputs(help_text, stdout);
}
/*
 * Copy an option string into a fixed-size buffer, always NUL-terminating it.
 * Returns false (and reports the problem on stderr) when the value does not fit.
 */
static bool
copy_opt_str(char * dst, size_t dst_sz, const char * src, int opt)
{
    int n = snprintf(dst, dst_sz, "%s", src);
    if (n < 0 || (size_t)n >= dst_sz) {
        fprintf(stderr, "argument of -%c is too long (max %zu characters)\n", opt, dst_sz - 1);
        return false;
    }
    return true;
}

/*
 * Handle one application-specific command line option.
 *
 * c:   option character
 * arg: the option's argument (unused for flags such as -V)
 *
 * Returns 0 on success and EINVAL for -h, unknown options, an empty worker
 * mask, or a string argument that does not fit its buffer.
 */
static int parse_arg(int c, char *arg)
{
    switch (c) {
    case 'V':
        ntr_set_level(NTR_DEP_USER1,
            ntr_get_level(NTR_DEP_USER1) + 1);
        break;
    case 'D':
        if (!copy_opt_str(options.dev_name, sizeof(options.dev_name), arg, c)) {
            return EINVAL;
        }
        break;
    case 'k':
        if (!copy_opt_str(options.driver_name, sizeof(options.driver_name), arg, c)) {
            return EINVAL;
        }
        break;
    case 'a':
        options.cpumask = strtoull(arg, nullptr, 16);
        options.num_threads = cmask_get_num_cpus(
            options.cpumask);
        if (options.num_threads == 0) {
            fprintf(stderr,
                "must run at least one thread\n");
            return EINVAL;
        }
        break;
    case 'b':
        options.req_size = strtoull(arg, nullptr, 10);
        break;
    case 'q':
        options.rps = strtoull(arg, nullptr, 10);
        break;
    case 'Q':
        options.queue_depth = strtoull(arg, nullptr, 10);
        break;
    case 'P':
        if (!copy_opt_str(options.pattern_spec, sizeof(options.pattern_spec), arg, c)) {
            return EINVAL;
        }
        break;
    case 'I':
        if (!copy_opt_str(options.ia_spec, sizeof(options.ia_spec), arg, c)) {
            return EINVAL;
        }
        break;
    case 't':
        options.time = strtoull(arg, nullptr, 10);
        break;
    case 'w':
        options.warmup = strtoull(arg, nullptr, 10);
        break;
    case 'o':
        if (!copy_opt_str(options.output_file, sizeof(options.output_file), arg, c)) {
            return EINVAL;
        }
        break;
    case 'h':
    default:
        return EINVAL;
    }
    return 0;
}
/*
 * Instantiate the driver selected by name.
 * "bdev" and "nvme" are recognised; context is passed to the driver's
 * constructor as a C string. Returns nullptr for any other name.
 */
static birb_driver *
birb_create_driver(const char * driver_name, void * context)
{
    if (strcmp(driver_name, "bdev") == 0) {
        return new birb_bdev_driver(reinterpret_cast<const char *>(context));
    }

    if (strcmp(driver_name, "nvme") == 0) {
        return new birb_nvme_driver(reinterpret_cast<const char *>(context));
    }

    return nullptr;
}
/*
 * Create the per-thread context matching the driver's type.
 * Returns nullptr when the driver type is neither BDEV nor NVME.
 */
static birb_driver_thread_context *
birb_create_thread_context(birb_driver * driver)
{
    if (driver->get_type() == birb_driver::BIRB_DRV_BDEV) {
        return new birb_bdev_thread_context(dynamic_cast<birb_bdev_driver *>(driver));
    }

    if (driver->get_type() == birb_driver::BIRB_DRV_NVME) {
        return new birb_nvme_thread_context(dynamic_cast<birb_nvme_driver *>(driver));
    }

    return nullptr;
}
/* Delete a driver object; counterpart of birb_create_driver(). */
static void
birb_destroy_driver(birb_driver * drv)
{
delete drv;
}
/* Delete a per-thread driver context; counterpart of birb_create_thread_context(). */
static void
birb_destroy_thread_context(birb_driver_thread_context * ctx)
{
delete ctx;
}
/*
* Callback function for io completion.
*/
static void
worker_io_complete(bool success, void *cb_arg)
{
auto vars = (struct worker_thread_cb_vars *)cb_vars;
auto req = (struct io_request *)cb_arg;
uint64_t end_ts = get_cur_ts_nano();
if (!success) {
// XXX: print warning for errors for now
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d <worker_io_complete>: io request failed\n", vars->ctx->tid);
} else {
auto rec = new struct io_record;
rec->start_ts = req->start_ts;
rec->end_ts = end_ts;
vars->ctx->io_records->push_back(rec);
if (req->op == IOGEN_READ) {
memcpy(req->user_buf, req->dma_buf, options.req_size);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", vars->ctx->tid, req->op);
}
vars->free_ios->push_back(req);
}
static void
cb_notify_main_init(void * arg)
{
auto * ctx = (struct thread_context *)arg;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_main_init: from thread %d to main.\n", ctx->tid);
auto * vars = (struct main_thread_cb_vars *) cb_vars;
vars->worker_thread_init_cnt++;
}
static void
cb_notify_main_stop(void * arg)
{
auto * ctx = (struct thread_context *)arg;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_main_stop: from thread %d to main.\n", ctx->tid);
auto * vars = (struct main_thread_cb_vars *) cb_vars;
vars->worker_thread_stop_cnt++;
}
static void
cb_notify_worker_start(void * arg)
{
auto * ctx = (struct thread_context *)arg;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_worker_start: from main to thread %d.\n", ctx->tid);
auto * vars = (struct worker_thread_cb_vars *) cb_vars;
vars->worker_start = 1;
}
static void
cb_notify_worker_stop(void * arg)
{
auto * ctx = (struct thread_context *)arg;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "cb_notify_worker_stop: from main to thread %d.\n", ctx->tid);
auto * vars = (struct worker_thread_cb_vars *) cb_vars;
vars->worker_stop = 1;
}
/* Reset both worker counters of the main thread's callback state to zero. */
static void
main_thread_cb_vars_init(struct main_thread_cb_vars * vars)
{
    vars->worker_thread_stop_cnt = 0;
    vars->worker_thread_init_cnt = 0;
}
/*
 * Initialize a worker's callback state: clear the start/stop flags and
 * remember the worker's context and free-request list.
 */
static void
worker_thread_cb_vars_init(struct worker_thread_cb_vars * vars, struct thread_context * ctx,
    std::list<struct io_request *> * free_ios)
{
    vars->ctx = ctx;
    vars->free_ios = free_ios;
    vars->worker_stop = 0;
    vars->worker_start = 0;
}
/*
 * Body of each worker pthread; arg is the worker's thread_context.
 *
 * The worker creates its own SPDK thread and driver context, allocates
 * options.queue_depth IO requests, notifies the main thread, waits for the
 * start message and then issues IO paced by the inter-arrival generator
 * until told to stop and all outstanding requests have completed.
 *
 * On any initialization failure everything acquired so far is released and
 * spdk_app_stop() is called with the error code. Always returns nullptr.
 */
static void *
worker_thread_main(void * arg)
{
    int rc = 0;
    constexpr static unsigned int SPDK_THREAD_NAME_SZ = 16;
    struct worker_thread_cb_vars vars;
    auto *ctx = (struct thread_context *)arg;
    /* must start as nullptr: cleanup inspects it even when setup failed early */
    birb_driver_thread_context * driver_thread_ctx = nullptr;
    std::list<struct io_request *> free_ios;
    char spdk_thread_name[SPDK_THREAD_NAME_SZ];
    struct spdk_cpuset * cpuset = nullptr;
    Generator * ia_gen = nullptr;
    io_generator * io_gen = nullptr;
    struct io_generator_ctx io_ctx;
    uint64_t next_ts;
    uint64_t a_offset;
    uint64_t last_loop_ts = 0;

    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);

    ctx->overhead_avg = 0;
    ctx->overhead_cnt = 0;
    ctx->overhead_max = 0;
    ctx->overhead_min = UINT64_MAX;

    // create spdk thread
    cpuset = spdk_cpuset_alloc();
    if (cpuset == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc cpuset\n");
        rc = ENOMEM;
        goto cleanup;
    }
    spdk_cpuset_zero(cpuset);
    spdk_cpuset_set_cpu(cpuset, ctx->coreid, true);
    snprintf(spdk_thread_name, SPDK_THREAD_NAME_SZ, "birb_worker_%u", ctx->tid);
    ctx->sp_thread = spdk_thread_create(spdk_thread_name, cpuset);
    if (ctx->sp_thread == nullptr) {
        rc = ENOMEM;
        goto cleanup;
    }
    spdk_set_thread(ctx->sp_thread);

    // create thread context
    driver_thread_ctx = birb_create_thread_context(ctx->driver);
    if (driver_thread_ctx == nullptr || driver_thread_ctx->get_status() != birb_driver::BIRB_SUCCESS) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not create thread context!\n", ctx->tid);
        rc = EINVAL;
        goto cleanup;
    }

    // create io request objects
    for (unsigned int i = 0; i < options.queue_depth; i++) {
        auto dma_buf = (char *)spdk_dma_zmalloc_socket(options.req_size, ctx->driver->get_align(), NULL, ctx->sockid);
        auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);

        if (dma_buf == nullptr || user_buf == nullptr) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
            /* release whichever half of the pair was allocated */
            if (dma_buf != nullptr) {
                spdk_dma_free(dma_buf);
            }
            if (user_buf != nullptr) {
                nm_free(ctx->sockid, user_buf);
            }
            rc = ENOMEM;
            goto cleanup;
        }

        auto io_req = new struct io_request;
        io_req->dma_buf = dma_buf;
        io_req->user_buf = user_buf;

        free_ios.push_back(io_req);
    }

    // init thread local states
    worker_thread_cb_vars_init(&vars, ctx, &free_ios);
    cb_vars = &vars;

    ia_gen = createGenerator(options.ia_spec);
    if (ia_gen == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
        rc = EINVAL;
        goto cleanup;
    }
    /* each worker generates an equal share of the total request rate */
    ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));

    io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
    if (io_gen == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate io generator!\n", ctx->tid);
        rc = EINVAL;
        goto cleanup;
    }

    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);

    if ((rc = spdk_thread_send_msg(ctx->main_thread, cb_notify_main_init, ctx)) != 0) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not send message %d\n", ctx->tid, rc);
        goto cleanup;
    }

    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);

    while (vars.worker_start != 1) {
        spdk_thread_poll(spdk_get_thread(), 0, 0);
    }

    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);

    /* random delay 0-100 us */
    usleep(nm_get_uptime_ns() % 100);

    next_ts = get_cur_ts_nano();

    while (true) {
        /* track min/max/average time between consecutive loop iterations */
        uint64_t cur_loop_ts = get_cur_ts_nano();
        if (last_loop_ts > 0) {
            uint64_t overhead = cur_loop_ts - last_loop_ts;
            if (ctx->overhead_max < overhead) {
                ctx->overhead_max = overhead;
            }

            if (ctx->overhead_min > overhead) {
                ctx->overhead_min = overhead;
            }

            ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
            ctx->overhead_cnt++;
            ctx->overhead_avg /= ctx->overhead_cnt;
        }
        last_loop_ts = cur_loop_ts;

        spdk_thread_poll(spdk_get_thread(), 0, 0);
        driver_thread_ctx->poll();

        if (vars.worker_stop != 0) {
            /* leave only once every request has been returned to the free list */
            if (free_ios.size() >= options.queue_depth) {
                break;
            }
        } else {
            if (!free_ios.empty()) {
                auto io_req = free_ios.front();

                uint64_t cur_ts = get_cur_ts_nano();
                if (cur_ts >= next_ts) {
                    io_gen->issue(&io_ctx, io_req->dma_buf);

                    a_offset = io_ctx.offset + ctx->start_region_offset;
                    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);

                    io_req->start_ts = cur_ts;
                    io_req->op = io_ctx.op;

                    if(io_ctx.op == IOGEN_READ) {
                        rc = driver_thread_ctx->read(a_offset, io_ctx.size, io_req->dma_buf, worker_io_complete, io_req);
                    } else {
                        rc = driver_thread_ctx->write(a_offset, io_ctx.size, io_req->dma_buf, worker_io_complete, io_req);
                    }

                    if (rc != 0) {
                        /* the request stays at the head of the free list and is retried next iteration */
                        ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...\n", ctx->tid, rc);
                    } else {
                        free_ios.pop_front();
                        next_ts = next_ts + ia_gen->generate() * S2NS;
                    }
                }
            }
        }
    }

cleanup:
    while (!free_ios.empty()) {
        auto req = free_ios.front();
        free_ios.pop_front();
        spdk_dma_free(req->dma_buf);
        nm_free(ctx->sockid, req->user_buf);
        delete req;
    }

    if (ia_gen != nullptr) {
        delete ia_gen;
    }

    if (io_gen != nullptr) {
        delete io_gen;
    }

    if (cpuset != nullptr) {
        spdk_cpuset_free(cpuset);
    }

    if (driver_thread_ctx != nullptr) {
        birb_destroy_thread_context(driver_thread_ctx);
    }

    if (rc == 0) {
        if ((rc = spdk_thread_send_msg(ctx->main_thread, cb_notify_main_stop, ctx)) != 0) {
            ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not send message %d\n", ctx->tid, rc);
        }
    }

    /* the SPDK thread may not exist if its creation was what failed */
    if (ctx->sp_thread != nullptr) {
        spdk_thread_exit(ctx->sp_thread);
        while (!spdk_thread_is_exited(ctx->sp_thread)) {
            spdk_thread_poll(ctx->sp_thread, 0, 0);
        }
        spdk_set_thread(nullptr);
        spdk_thread_destroy(ctx->sp_thread);
    }

    ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);

    if (rc != 0) {
        spdk_app_stop(rc);
    }

    return nullptr;
}
/*
 * Decode an IO pattern spec of the form "<mode>,<read percent>".
 *
 * pattern:   spec string; it is modified in place by the tokenizer
 * read_pct:  receives the read percentage; left untouched when the spec has
 *            no second field
 * addr_mode: receives IOGEN_ADDR_MONOTONIC_INCREASING when the first field
 *            is "M", IOGEN_ADDR_UNIFORM_RANDOM otherwise (including when the
 *            spec is empty)
 */
static void
parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
{
    char * saveptr = nullptr;
    char * token = strtok_r(pattern, ",", &saveptr);

    if (token != nullptr && strcmp(token, "M") == 0) {
        *addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
    } else {
        *addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
    }

    /* a spec without a comma yields no second token; do not dereference it */
    token = strtok_r(nullptr, ",", &saveptr);
    if (token != nullptr) {
        *read_pct = strtoull(token, nullptr, 10);
    }
}
static void
birb_main(void * arg1 UNUSED)
{
int rc = 0;
std::list<struct thread_context *> worker_threads;
std::ofstream output_file;
struct main_thread_cb_vars vars;
birb_driver * drv = nullptr;
unsigned long record_cutoff_time = 0;
unsigned long current_s = 0;
unsigned int total_reqs = 0;
unsigned int tid = 0;
unsigned long per_thread_cap = 0;
int cur_core;
/* initialize driver */
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
drv = birb_create_driver(options.driver_name, options.dev_name);
if (drv == nullptr || drv->get_status() != birb_driver::BIRB_SUCCESS) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create device driver.\n");
rc = EINVAL;
goto end;
}
per_thread_cap = drv->get_capacity() / options.num_threads;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB\n", drv->get_capacity(), drv->get_capacity() / 1024 / 1024);
/* misc init */
main_thread_cb_vars_init(&vars);
cb_vars = &vars;
parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
dump_options();
output_file.open(options.output_file, std::ofstream::out);
if (!output_file) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
rc = EINVAL;
goto end;
}
cur_core = cmask_get_next_cpu(&options.cpumask);
while(cur_core != NEXT_CPU_NULL) {
auto * ctx = new struct thread_context;
memset(ctx, 0, sizeof(struct thread_context));
if (ctx == NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc thread ctx.\n");
spdk_app_stop(ENOMEM);
return;
}
ctx->tid = tid++;
ctx->driver = drv;
ctx->main_thread = spdk_get_thread();
ctx->sockid = rte_lcore_to_socket_id(cur_core);
ctx->coreid = cur_core;
ctx->io_records = new std::list<struct io_record *>();
ctx->start_region_length = per_thread_cap;
ctx->start_region_offset = per_thread_cap * ctx->tid;
// create sys thread
pthread_attr_t attr;
cpuset_t scpuset;
CPU_ZERO(&scpuset);
CPU_SET(cur_core, &scpuset);
pthread_attr_init(&attr);
pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
rc = EINVAL;
goto end;
}
worker_threads.push_back(ctx);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n", ctx->tid, cur_core, ctx->sockid,
ctx->start_region_offset,
ctx->start_region_length);
cur_core = cmask_get_next_cpu(&options.cpumask);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
while(vars.worker_thread_init_cnt < (uint32_t)options.num_threads) {
spdk_thread_poll(spdk_get_thread(), 0, 0);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
for (struct thread_context * tctx : worker_threads) {
rc = spdk_thread_send_msg(tctx->sp_thread, cb_notify_worker_start, tctx);
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to send message %d\n", rc);
goto end;
}
}
/* main event loop */
while(current_s < options.time) {
if (current_s >= options.warmup && record_cutoff_time == 0) {
record_cutoff_time = get_cur_ts_nano();
}
usleep(1 * S2US);
current_s++;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
for (struct thread_context * tctx : worker_threads) {
rc = spdk_thread_send_msg(tctx->sp_thread, cb_notify_worker_stop, tctx);
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to send message %d\n", rc);
goto end;
}
}
while(vars.worker_thread_stop_cnt < (uint32_t)options.num_threads) {
spdk_thread_poll(spdk_get_thread(), 0, 0);
}
// keep stats
for (struct thread_context * tctx : worker_threads) {
uint64_t last_ts = 0;
uint64_t processed = 0;
for (struct io_record * r : *tctx->io_records) {
if (r->start_ts >= record_cutoff_time) {
if (r->end_ts > last_ts) {
last_ts = r->end_ts;
}
processed++;
output_file << r->end_ts - r->start_ts << std::endl;
total_reqs++;
}
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n",
tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n",
total_reqs, total_reqs * options.req_size / (options.time - options.warmup));
end:
if (drv != nullptr) {
birb_destroy_driver(drv);
}
output_file.close();
for (struct thread_context * tctx : worker_threads) {
for (struct io_record * r : *tctx->io_records) {
delete r;
}
delete tctx->io_records;
delete tctx;
}
exit(0);
spdk_app_stop(rc);
return;
}
/*
 * Program entry point: set up logging, parse SPDK and benchmark options,
 * then hand control to the SPDK application framework with birb_main() as
 * the start routine. Returns the value produced by spdk_app_start().
 */
int
main(int argc, char **argv)
{
    struct spdk_app_opts opts = {};
    int rc = 0;

    ntr_init();
    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);

    /* Start from SPDK's defaults, then name the application. */
    spdk_app_opts_init(&opts, sizeof(opts));
    opts.name = "birb";

    /* SPDK's built-in options are parsed together with ours (handled by parse_arg). */
    rc = spdk_app_parse_args(argc, argv, &opts, "VD:k:a:b:q:Q:P:I:t:w:o:", NULL, parse_arg,
        usage);
    if (rc != SPDK_APP_PARSE_ARGS_SUCCESS) {
        exit(rc);
    }

    nm_init(options.verbosity);

    /*
     * spdk_app_start() runs birb_main() on the app thread; control comes back
     * here once the application stops or if framework start-up fails.
     */
    rc = spdk_app_start(&opts, birb_main, NULL);
    if (rc) {
        SPDK_ERRLOG("ERROR starting application\n");
    }

    /* Tear down the SPDK subsystems before leaving. */
    spdk_app_fini();
    return rc;
}

585
storage/birb_posix.cc Normal file
View File

@ -0,0 +1,585 @@
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/types.h>
#include <fcntl.h>
#include <getopt.h>
#include <pthread.h>
#include <pthread_np.h>
#include <threads.h>
#include <unistd.h>
#include <aio.h>
#include <getopt.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <chrono>
#include <list>
#include <set>
#include "gen.hh"
#include "ntr.h"
#include "defs.hh"
#include "nm.hh"
#include "storage/io_gen.hh"
/*
 * Current reading of std::chrono::high_resolution_clock, expressed as
 * nanoseconds since that clock's epoch.
 */
static inline uint64_t get_cur_ts_nano()
{
    using namespace std::chrono;
    const auto since_epoch = high_resolution_clock::now().time_since_epoch();
    return duration_cast<nanoseconds>(since_epoch).count();
}
/*
 * Capacities (in bytes, terminating NUL included) of the fixed-size string
 * fields in the command-line options structure below.
 */
// size of options_t::pattern_spec and options_t::ia_spec
static constexpr unsigned long MAX_SPEC_LEN = 32;
// size of options_t::dev_name and options_t::driver_name
static constexpr unsigned long MAX_DEV_NAME_LEN = 32;
// size of options_t::output_file
static constexpr unsigned long MAX_OUTPUT_FILE_LEN = 256;
/*
 * Benchmark configuration. Fields are filled from the command line in main();
 * read_pct and addr_mode are derived from pattern_spec by parse_pattern().
 */
struct options_t {
// args
// passed to nm_init()
int verbosity = NTR_LEVEL_DEFAULT;
// number of CPUs set in cpumask (-a); one worker thread is created per CPU
int num_threads = 1;
// CPU bitmask of worker cores (-a, parsed as hex)
unsigned long cpumask = 1;
// IO pattern "<mode>,<read percent>" (-P); mode "M" selects monotonic addressing
char pattern_spec[MAX_SPEC_LEN] = "R,100";
// inter-arrival distribution name handed to createGenerator() (-I)
char ia_spec[MAX_SPEC_LEN] = "fixed";
// total run time (-t), counted in main-loop ticks of usleep(1 * S2US)
unsigned int time = 5;
// warmup time (-w), same unit as time; requests started earlier are not reported
unsigned int warmup = 2;
// number of io_request objects each worker keeps (-Q)
unsigned int queue_depth = 1;
// device to open (-D)
char dev_name[MAX_DEV_NAME_LEN] = "Malloc0";
// driver name (-k)
char driver_name[MAX_DEV_NAME_LEN] = "bdev";
// percentage of requests issued as reads; set by parse_pattern()
unsigned int read_pct = 0;
// address generation mode; set by parse_pattern()
io_generator_address_mode addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
// file receiving one latency value per line (-o)
char output_file[MAX_OUTPUT_FILE_LEN] = "output.txt";
// size of each IO request in bytes (-b)
unsigned long req_size = 4096;
// target IO requests per second across all threads (-q)
unsigned long rps = 0;
};
// incremented once by each worker after its init completes; birb_main() spins until it reaches num_threads
std::atomic<int> worker_thread_init_cnt(0);
// incremented once by each worker when it leaves worker_thread_main()
std::atomic<int> worker_thread_stop_cnt(0);
// set to 1 by birb_main() to release workers spinning at the start barrier
std::atomic<int> worker_start(0);
// set to 1 by birb_main() to ask workers to drain in-flight IO and exit
std::atomic<int> worker_stop(0);
// process-wide benchmark configuration
static struct options_t options;
/* Timing of one successfully completed IO request. */
struct io_record {
// timestamp taken when the request was issued (get_cur_ts_nano())
uint64_t start_ts;
// loop timestamp at which the worker observed the completion
uint64_t end_ts;
};
/* One reusable IO slot; a worker owns queue_depth of these. */
struct io_request {
// timestamp taken when the request was last issued
uint64_t start_ts;
// operation chosen by the io_generator for the current issue
io_generator_opcode op;
// destination of the memcpy performed after a read completes
char * user_buf;
// buffer handed to aio_read()/aio_write() via aio.aio_buf
char * dma_buf;
// POSIX AIO control block for this slot
struct aiocb aio;
};
/* Per-worker state; allocated and zeroed by birb_main(), passed to worker_thread_main(). */
struct thread_context {
// sequential worker index, starting at 0
unsigned int tid;
// CPU the worker thread is pinned to
unsigned int coreid;
// memory node of coreid (nm_get_node_from_core()); used for nm_malloc()/nm_free()
unsigned int sockid;
// handle returned by pthread_create()
pthread_t sys_thread;
// descriptor of the opened device, shared by all workers
int disk_fd;
// byte offset of this worker's slice of the device
unsigned long start_region_offset;
// byte length of this worker's slice of the device
unsigned long start_region_length;
/* modified by worker threads */
// one record appended per successfully completed request
std::list<io_record *> *io_records;
// running average, sample count, maximum and minimum of the time between
// consecutive iterations of the worker loop
uint64_t overhead_avg;
uint32_t overhead_cnt;
uint64_t overhead_max;
uint64_t overhead_min;
};
/*
 * Log the effective benchmark configuration at INFO level.
 *
 * The conversion specifiers match the field types of options_t: unsigned
 * fields use %u and the address-mode enum is passed as an explicit int.
 */
static void dump_options()
{
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: Options:\n"
        " dev name: %s\n"
        " driver name: %s\n"
        " worker threads: 0x%lx\n"
        " number of threads: %d\n"
        " IO request size: %lu\n"
        " IO requests per second: %lu\n"
        " IO pattern: %s\n"
        " IO queue depth: %u\n"
        " IO addressing mode: %d\n"
        " read percent: %u\n"
        " inter-arrival dist: %s\n"
        " run time: %u\n"
        " warmup time: %u\n"
        " output file: %s\n",
        options.dev_name,
        options.driver_name,
        options.cpumask,
        options.num_threads,
        options.req_size,
        options.rps,
        options.pattern_spec,
        options.queue_depth,
        (int)options.addr_mode,
        options.read_pct,
        options.ia_spec,
        options.time,
        options.warmup,
        options.output_file
    );
}
/* Print the command-line option summary to stdout. */
static void usage()
{
    static const char help_text[] =
        " -V(VV): verbose mode\n"
        " -D: dev name\n"
        " -k: driver to use (default bdev)\n"
        " -a: worker threads spec (0x3 = spawn 2 threads on core 1 & 2)\n"
        " -b: IO request size\n"
        " -q: IO requests per second\n"
        " -P: IO request pattern\n"
        " -Q: IO request queue depth\n"
        " -I: inter-arrival time distribution\n"
        " -t: total run time\n"
        " -w: warm up time\n"
        " -o: latency response output file\n";

    /* the text has no conversion specifiers, so fputs emits it unchanged */
    fputs(help_text, stdout);
}
static void *
worker_thread_main(void * arg)
{
int rc = 0;
auto *ctx = (struct thread_context *)arg;
std::list<struct io_request *> free_ios;
std::list<struct io_request *> prog_ios;
Generator * ia_gen = nullptr;
io_generator * io_gen = nullptr;
struct io_generator_ctx io_ctx;
uint64_t next_ts;
uint64_t a_offset;
uint64_t last_loop_ts = 0;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init...\n", ctx->tid);
// create io request objects
for (unsigned int i = 0; i < options.queue_depth; i++) {
auto buf = (char *)nm_malloc(ctx->sockid, options.req_size);
auto user_buf = (char *)nm_malloc(ctx->sockid, options.req_size);
if (buf == nullptr || user_buf == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate buffers!\n", ctx->tid);
rc = ENOMEM;
goto cleanup;
}
auto io_req = new struct io_request;
io_req->dma_buf = buf;
io_req->user_buf = user_buf;
io_req->aio.aio_fildes = ctx->disk_fd;
io_req->aio.aio_nbytes = options.req_size;
io_req->aio.aio_buf = buf;
io_req->aio.aio_sigevent.sigev_notify = SIGEV_NONE;
io_req->aio.aio_reqprio = 0;
free_ios.push_back(io_req);
}
// init thread local states
ia_gen = createGenerator(options.ia_spec);
if (ia_gen == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
rc = EINVAL;
goto cleanup;
}
ia_gen->set_lambda((double)options.rps / (double)(options.num_threads));
io_gen = new io_generator(options.req_size, ctx->start_region_length, options.read_pct, options.addr_mode);
if (io_gen == nullptr) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: could not allocate ia generator!\n", ctx->tid);
rc = EINVAL;
goto cleanup;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: init complete.\n", ctx->tid);
worker_thread_init_cnt.fetch_add(1);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: waiting for start...\n", ctx->tid);
while (worker_start.load() == 0) {}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: started...\n", ctx->tid);
/* random delay 0-100 us */
usleep(nm_get_uptime_ns() % 100);
next_ts = get_cur_ts_nano();
while (true) {
uint64_t cur_ts = get_cur_ts_nano();
if (last_loop_ts > 0) {
uint64_t overhead = cur_ts - last_loop_ts;
if (ctx->overhead_max < overhead) {
ctx->overhead_max = overhead;
}
if (ctx->overhead_min > overhead) {
ctx->overhead_min = overhead;
}
ctx->overhead_avg = ctx->overhead_avg * ctx->overhead_cnt + overhead;
ctx->overhead_cnt++;
ctx->overhead_avg /= ctx->overhead_cnt;
}
last_loop_ts = cur_ts;
// process io completion
auto itr = prog_ios.begin();
while (itr != prog_ios.end()) {
int err;
struct io_request * ioreq = *itr;
if ((err = aio_error(&ioreq->aio)) != EINPROGRESS) {
if (err == 0) {
auto rec = new struct io_record;
rec->start_ts = ioreq->start_ts;
rec->end_ts = cur_ts;
ctx->io_records->push_back(rec);
if (ioreq->op == IOGEN_READ) {
memcpy(ioreq->user_buf, ioreq->dma_buf, options.req_size);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d <worker_io_complete>: completed io request type %d\n", ctx->tid, ioreq->op);
} else {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: aio failed with %d...\n", ctx->tid, err);
}
if (aio_return(&ioreq->aio) == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "thread %d: aio_return failed with %d...\n", ctx->tid, errno);
exit(errno);
}
/* cleanup */
itr = prog_ios.erase(itr);
free_ios.push_back(ioreq);
} else {
++itr;
}
}
if (worker_stop.load() == 1) {
if (free_ios.size() >= options.queue_depth) {
break;
}
} else {
if (!free_ios.empty()) {
auto io_req = free_ios.front();
cur_ts = get_cur_ts_nano();
if (cur_ts >= next_ts) {
io_gen->issue(&io_ctx, io_req->dma_buf);
a_offset = io_ctx.offset + ctx->start_region_offset;
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: issuing IO type %d at offset 0x%lx size 0x%lx...\n", ctx->tid, io_ctx.op, a_offset, io_ctx.size);
io_req->start_ts = cur_ts;
io_req->op = io_ctx.op;
io_req->aio.aio_offset = a_offset;
if(io_ctx.op == IOGEN_READ) {
rc = aio_read(&io_req->aio);
} else {
rc = aio_write(&io_req->aio);
}
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING, "thread %d: failed to issue io %d, retrying...\n", ctx->tid, errno);
} else {
free_ios.pop_front();
prog_ios.push_back(io_req);
next_ts = next_ts + ia_gen->generate() * S2NS;
}
}
}
}
}
cleanup:
while (!free_ios.empty()) {
auto req = free_ios.front();
free_ios.pop_front();
nm_free(ctx->sockid, req->dma_buf);
nm_free(ctx->sockid, req->user_buf);
}
if (ia_gen != nullptr) {
delete ia_gen;
}
if (io_gen != nullptr) {
delete io_gen;
}
worker_thread_stop_cnt.fetch_add(1);
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "thread %d: stopped...\n", ctx->tid);
return nullptr;
}
/*
 * Parse an IO pattern spec of the form "<mode>,<read percent>".
 *
 * pattern   - NUL-terminated spec; modified in place by strtok().
 * read_pct  - receives the read percentage, clamped to 100. Left untouched
 *             when the spec carries no percentage token.
 * addr_mode - receives IOGEN_ADDR_MONOTONIC_INCREASING when the mode token is
 *             exactly "M", IOGEN_ADDR_UNIFORM_RANDOM otherwise (including
 *             when the spec is empty).
 */
static void
parse_pattern(char * pattern, unsigned int * read_pct, io_generator_address_mode * addr_mode)
{
    char * token = strtok(pattern, ",");
    if (token != nullptr && strcmp(token, "M") == 0) {
        *addr_mode = IOGEN_ADDR_MONOTONIC_INCREASING;
    } else {
        *addr_mode = IOGEN_ADDR_UNIFORM_RANDOM;
    }

    if (token == nullptr) {
        /* empty spec: nothing further to parse */
        return;
    }
    token = strtok(nullptr, ",");
    if (token == nullptr) {
        /* no percentage given: keep the caller's current value */
        return;
    }

    /* clamp before narrowing so oversized input cannot wrap to a small value */
    unsigned long long pct = strtoull(token, nullptr, 10);
    if (pct > 100) {
        pct = 100;
    }
    *read_pct = (unsigned int)pct;
}
static void
birb_main()
{
int rc = 0;
std::list<struct thread_context *> worker_threads;
std::ofstream output_file;
unsigned long record_cutoff_time = 0;
unsigned long current_s = 0;
unsigned int total_reqs = 0;
unsigned int tid = 0;
unsigned long per_thread_cap = 0;
int cur_core;
int disk_fd;
off_t disk_size;
u_int disk_sec_size;
/* initialize driver */
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initializing device driver for device %s\n", options.dev_name);
disk_fd = open(options.dev_name, O_RDWR | O_DIRECT);
if (disk_fd == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open device - %d\n", errno);
exit(errno);
}
rc = ioctl(disk_fd, DIOCGMEDIASIZE, &disk_size);
if (rc == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk size - %d\n", errno);
exit(errno);
}
rc = ioctl(disk_fd, DIOCGSECTORSIZE, &disk_sec_size);
if (rc == -1) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to obtain disk sector size - %d\n", errno);
exit(errno);
}
per_thread_cap = disk_size / options.num_threads;
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: initialized device with capacity %zu bytes ~= %zu MB, sector %u bytes\n", disk_size, disk_size / 1024 / 1024, disk_sec_size);
parse_pattern(options.pattern_spec, &options.read_pct, &options.addr_mode);
dump_options();
output_file.open(options.output_file, std::ofstream::out);
if (!output_file) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to open output file %s\n", options.output_file);
rc = EINVAL;
goto end;
}
cur_core = cmask_get_next_cpu(&options.cpumask);
while(cur_core != NEXT_CPU_NULL) {
auto * ctx = new struct thread_context;
memset(ctx, 0, sizeof(struct thread_context));
if (ctx == NULL) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to alloc thread ctx.\n");
exit(ENOMEM);
}
ctx->tid = tid++;
ctx->sockid = nm_get_node_from_core(cur_core);
ctx->coreid = cur_core;
ctx->io_records = new std::list<struct io_record *>();
ctx->start_region_length = per_thread_cap;
ctx->start_region_offset = per_thread_cap * ctx->tid;
ctx->disk_fd = disk_fd;
// create sys thread
pthread_attr_t attr;
cpuset_t scpuset;
CPU_ZERO(&scpuset);
CPU_SET(cur_core, &scpuset);
pthread_attr_init(&attr);
pthread_attr_setaffinity_np(&attr, sizeof(cpuset_t), &scpuset);
rc = pthread_create(&ctx->sys_thread, &attr, worker_thread_main, ctx);
if (rc != 0) {
ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "main: failed to create sys thread: %d\n", rc);
rc = EINVAL;
goto end;
}
worker_threads.push_back(ctx);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: created worker thread %d on core %d socket %d offset 0x%lx length %ld\n", ctx->tid, cur_core, ctx->sockid,
ctx->start_region_offset,
ctx->start_region_length);
cur_core = cmask_get_next_cpu(&options.cpumask);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: waiting for worker thread init...\n");
while(worker_thread_init_cnt.load() < options.num_threads) {
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: starting worker threads...\n");
worker_start.store(1);
/* main event loop */
while(current_s < options.time) {
if (current_s >= options.warmup && record_cutoff_time == 0) {
record_cutoff_time = get_cur_ts_nano();
}
usleep(1 * S2US);
current_s++;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_DEBUG, "main: stopping worker threads...\n");
worker_stop.store(1);
while(worker_thread_stop_cnt.load() < options.num_threads) {
}
// keep stats
for (struct thread_context * tctx : worker_threads) {
uint64_t last_ts = 0;
uint64_t processed = 0;
for (struct io_record * r : *tctx->io_records) {
if (r->start_ts >= record_cutoff_time) {
if (r->end_ts > last_ts) {
last_ts = r->end_ts;
}
processed++;
output_file << r->end_ts - r->start_ts << std::endl;
total_reqs++;
}
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: thread %d processed requests: %lu, last request %lu. Overhead - avg %lu min %lu max %lu\n",
tctx->tid, processed, last_ts, tctx->overhead_avg, tctx->overhead_min, tctx->overhead_max);
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "main: total requests: %u, bytes per second: %lu\n",
total_reqs, total_reqs * options.req_size / (options.time - options.warmup));
end:
if (disk_fd != -1) {
close(disk_fd);
}
output_file.close();
for (struct thread_context * tctx : worker_threads) {
for (struct io_record * r : *tctx->io_records) {
delete r;
}
delete tctx->io_records;
delete tctx;
}
return;
}
/*
 * Program entry point for the POSIX AIO benchmark: parse the command line
 * into the global options, initialise nm, and run birb_main().
 *
 * String options are copied with snprintf() so they are always
 * NUL-terminated (over-long values are truncated). Returns 0 after
 * birb_main() returns, or EINVAL when -a selects no CPU; -h prints the usage
 * text and exits with 0, any unknown option prints it and exits with EINVAL.
 */
int
main(int argc, char **argv)
{
    ntr_init();
    ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_INFO);

    int c;
    /* 'h' must be listed here, otherwise getopt() never returns it */
    while ((c = getopt(argc, argv, "hVD:k:a:b:q:Q:P:I:t:w:o:")) != -1)
    {
        switch (c) {
        case 'V':
            /* each -V raises the log level by one */
            ntr_set_level(NTR_DEP_USER1,
                ntr_get_level(NTR_DEP_USER1) + 1);
            break;
        case 'D':
            snprintf(options.dev_name, sizeof(options.dev_name), "%s", optarg);
            break;
        case 'k':
            snprintf(options.driver_name, sizeof(options.driver_name), "%s", optarg);
            break;
        case 'a':
            options.cpumask = strtoull(optarg, nullptr, 16);
            options.num_threads = cmask_get_num_cpus(
                options.cpumask);
            if (options.num_threads == 0) {
                fprintf(stderr,
                    "must run at least one thread\n");
                return EINVAL;
            }
            break;
        case 'b':
            options.req_size = strtoull(
                optarg, nullptr, 10);
            break;
        case 'q':
            options.rps = strtoull(
                optarg, nullptr, 10);
            break;
        case 'Q':
            options.queue_depth = strtoull(
                optarg, nullptr, 10);
            break;
        case 'P':
            snprintf(options.pattern_spec, sizeof(options.pattern_spec), "%s", optarg);
            break;
        case 'I':
            snprintf(options.ia_spec, sizeof(options.ia_spec), "%s", optarg);
            break;
        case 't':
            options.time = strtoull(
                optarg, nullptr, 10);
            break;
        case 'w':
            options.warmup = strtoull(
                optarg, nullptr, 10);
            break;
        case 'o':
            snprintf(options.output_file, sizeof(options.output_file), "%s", optarg);
            break;
        case 'h':
            usage();
            exit(0);
        default:
            usage();
            exit(EINVAL);
        }
    }

    nm_init(options.verbosity);
    birb_main();
    return 0;
}

95
storage/drivers/bdev.cc Normal file
View File

@ -0,0 +1,95 @@
#include <sys/endian.h>
#include "storage/drivers/bdev.hh"
#include "ntr.h"
#include "spdk/bdev.h"
#include "spdk/thread.h"
size_t
birb_bdev_driver::get_capacity()
{
return block_num * block_sz;
}
/* Status recorded by the constructor (BIRB_SUCCESS once the bdev is open). */
birb_driver::birb_driver_status
birb_bdev_driver::get_status()
{
    return status;
}
/*
 * Event callback handed to spdk_bdev_open_ext(). No event is acted upon;
 * each one is only reported at WARNING level.
 */
void
birb_bdev_driver::bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev * bdev UNUSED,
    void * event_ctx UNUSED)
{
    ntr(NTR_DEP_USER1, NTR_LEVEL_WARNING,
        "bdev_event_cb: unsupported bdev event: type %d\n",
        type);
}
/*
 * Log the name of every bdev returned by the spdk_bdev_first() /
 * spdk_bdev_next() iteration, comma separated, at INFO level.
 */
void
birb_bdev_driver::print_all_bdev()
{
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_bdev_driver: all registered block devices: ");
    for (struct spdk_bdev * it = spdk_bdev_first(); it != NULL; it = spdk_bdev_next(it)) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "%s, ", spdk_bdev_get_name(it));
    }
}
/*
 * Open the bdev named dev_name for read/write and cache its geometry.
 * status stays BIRB_FAIL when the open fails and becomes BIRB_SUCCESS
 * otherwise.
 */
birb_bdev_driver::birb_bdev_driver(const char * dev_name) : bdev_desc(nullptr),
    bdev(nullptr),
    block_sz(0),
    block_num(0),
    status(BIRB_FAIL)
{
    const int open_rc = spdk_bdev_open_ext(dev_name, true, birb_bdev_driver::bdev_event_cb, NULL, &this->bdev_desc);
    if (open_rc != 0) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_bdev_driver: failed to open bdev: %d\n", open_rc);
        return;
    }

    /* The bdev pointer remains valid for as long as the descriptor is open. */
    bdev = spdk_bdev_desc_get_bdev(bdev_desc);
    block_sz = spdk_bdev_get_block_size(bdev);
    block_num = spdk_bdev_get_num_blocks(bdev);
    ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_bdev_driver: bdev block size %zu bytes, blocks count %zu\n", block_sz, block_num);

    status = BIRB_SUCCESS;
}
/* Close the bdev descriptor, but only if the constructor opened it. */
birb_bdev_driver::~birb_bdev_driver()
{
    if (status != BIRB_SUCCESS) {
        return;
    }
    spdk_bdev_close(bdev_desc);
}
/* Identifies this driver as the bdev backend. */
birb_driver::birb_driver_type
birb_bdev_driver::get_type()
{
    const birb_driver::birb_driver_type kind = BIRB_DRV_BDEV;
    return kind;
}
size_t
birb_bdev_driver::get_align()
{
return spdk_bdev_get_buf_align(this->bdev);
}
struct spdk_bdev *
birb_bdev_driver::get_bdev()
{
return this->bdev;
}
struct spdk_bdev_desc *
birb_bdev_driver::get_bdev_desc()
{
return this->bdev_desc;
}

View File

@ -0,0 +1,72 @@
#include <sys/endian.h>
#include "storage/drivers/bdev.hh"
#include "ntr.h"
#include "spdk/bdev.h"
#include "spdk/thread.h"
/*
 * Per-thread bdev context: obtains an I/O channel for the driver's bdev
 * descriptor. status becomes BIRB_SUCCESS only when the channel was obtained;
 * on failure it stays BIRB_FAIL so callers checking get_status() see the
 * error.
 */
birb_bdev_thread_context::birb_bdev_thread_context(birb_bdev_driver * driver) : io_channel(nullptr),
    status(birb_driver::BIRB_FAIL),
    driver(driver)
{
    struct spdk_bdev_desc * desc = driver->get_bdev_desc();

    // obtain io channel
    this->io_channel = spdk_bdev_get_io_channel(desc);
    if (this->io_channel == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_bdev_thread_context: could not create bdev I/O channel!\n");
        /* leave status at BIRB_FAIL */
        return;
    }

    this->status = birb_driver::BIRB_SUCCESS;
}
/* Status recorded by the constructor. */
birb_driver::birb_driver_status
birb_bdev_thread_context::get_status()
{
    return status;
}
/* Release the I/O channel if one was obtained. */
birb_bdev_thread_context::~birb_bdev_thread_context()
{
    if (io_channel == nullptr) {
        return;
    }
    spdk_put_io_channel(io_channel);
}
/*
 * SPDK completion callback for read()/write(): frees the bdev_io, forwards
 * the result to the user callback stored in the cb_context, then frees the
 * cb_context.
 */
void
birb_bdev_thread_context::io_callback(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
    spdk_bdev_free_io(bdev_io);

    auto pending = reinterpret_cast<struct cb_context *>(cb_arg);
    pending->cb(success, pending->ctx);
    delete pending;
}
int
birb_bdev_thread_context::read(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
return spdk_bdev_read(driver->get_bdev_desc(), this->io_channel, buffer, offset, size, io_callback, reinterpret_cast<void*>(ctx));
}
int
birb_bdev_thread_context::write(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
return spdk_bdev_write(driver->get_bdev_desc(), this->io_channel, buffer, offset, size, io_callback, reinterpret_cast<void*>(ctx));
}
/* Nothing to do for the bdev backend; present to satisfy the driver interface. */
void
birb_bdev_thread_context::poll()
{
}

135
storage/drivers/nvme.cc Normal file
View File

@ -0,0 +1,135 @@
#include <sys/endian.h>
#include "ntr.h"
#include "spdk/nvme.h"
#include "spdk/thread.h"
#include "storage/drivers/nvme.hh"
size_t
birb_nvme_driver::get_capacity()
{
return spdk_nvme_ns_get_size(this->ns);
}
/* Status recorded by the constructor. */
birb_driver::birb_driver_status
birb_nvme_driver::get_status()
{
    return status;
}
void
birb_nvme_driver::attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts UNUSED)
{
struct spdk_nvme_ns * ns;
auto ctx = reinterpret_cast<struct attach_context *>(cb_ctx);
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: attached to nvme at %s\n", trid->traddr);
for (int nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0;
nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
if (ns == nullptr || !spdk_nvme_ns_is_active(ns)) {
continue;
}
ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: namespace id: %d size: %zu LBA size: %u\n", spdk_nvme_ns_get_id(ns), spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns));
/* XXX: use the first namespace */
break;
}
*ctx->ns = ns;
*ctx->ctrlr = ctrlr;
ctx->valid = 1;
}
/*
 * spdk_nvme_probe() probe callback: announce each discovered controller and
 * accept only the one whose transport address equals the requested device
 * name.
 */
bool
birb_nvme_driver::probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
    struct spdk_nvme_ctrlr_opts *opts UNUSED)
{
    printf("birb_nvme_driver: found nvme at %s\n", trid->traddr);

    auto wanted = reinterpret_cast<struct attach_context *>(cb_ctx);
    return strcmp(trid->traddr, wanted->dev_name) == 0;
}
/*
 * Probe the PCIe NVMe controller whose transport address equals dev_name and
 * bind to its first active namespace.
 *
 * status is BIRB_SUCCESS only when a controller with a usable, non-zoned
 * namespace was attached; otherwise it stays BIRB_FAIL. A controller that
 * was attached but rejected is detached here and the member pointers are
 * cleared, so the destructor does not detach it a second time.
 */
birb_nvme_driver::birb_nvme_driver(const char * dev_name) : status(BIRB_FAIL),
    ctrlr(nullptr),
    ns(nullptr),
    opts()
{
    int rc;
    /* zero-initialised: only the transport type and subnqn are set below */
    struct spdk_nvme_transport_id trid = {};
    struct attach_context ctx;

    ctx.ctrlr = &this->ctrlr;
    ctx.ns = &this->ns;
    ctx.dev_name = dev_name;
    ctx.valid = 0;

    spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
    snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);

    rc = spdk_nvme_probe(&trid, reinterpret_cast<void *>(&ctx), probe_cb, attach_cb, nullptr);
    if (rc != 0) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: failed to probe nvme device: %d\n", rc);
        goto end;
    }

    if (ctx.valid != 1 || this->ns == nullptr) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: could not find device: %s\n", dev_name);
        goto end;
    }

    if (spdk_nvme_ns_get_csi(this->ns) == SPDK_NVME_CSI_ZNS) {
        ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_driver: zoned nvme namespace is unsupported\n");
        spdk_nvme_detach(this->ctrlr);
        /* forget the controller so the destructor does not detach it again */
        this->ctrlr = nullptr;
        this->ns = nullptr;
        goto end;
    } else {
        spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &this->opts, sizeof(this->opts));
        ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "birb_nvme_driver: io queue depth: %d io queue requests: %d\n", opts.io_queue_size, opts.io_queue_requests);
        this->status = BIRB_SUCCESS;
    }

end:
    return;
}
/* Detach the controller if one is still attached. */
birb_nvme_driver::~birb_nvme_driver()
{
    if (ctrlr == nullptr) {
        return;
    }
    spdk_nvme_detach(ctrlr);
}
/* Identifies this driver as the NVMe backend. */
birb_driver::birb_driver_type
birb_nvme_driver::get_type()
{
    const birb_driver::birb_driver_type kind = BIRB_DRV_NVME;
    return kind;
}
/* Fixed buffer alignment of 0x1000 (4096) bytes. */
size_t
birb_nvme_driver::get_align()
{
    const size_t alignment = 0x1000;
    return alignment;
}
/* The attached controller, or nullptr if none was attached. */
spdk_nvme_ctrlr *
birb_nvme_driver::get_ctrlr()
{
    return ctrlr;
}
/* The namespace selected by attach_cb(). */
spdk_nvme_ns *
birb_nvme_driver::get_ns()
{
    return ns;
}
/* Pointer to the qpair options the constructor filled with the controller defaults. */
spdk_nvme_io_qpair_opts *
birb_nvme_driver::get_io_qpair_opts()
{
    return &opts;
}

View File

@ -0,0 +1,90 @@
#include <sys/endian.h>
#include "storage/drivers/nvme.hh"
#include "ntr.h"
#include "spdk/bdev.h"
#include "spdk/nvme.h"
#include "spdk/nvme_spec.h"
#include "spdk/thread.h"
/*
 * Per-thread NVMe context: allocates an I/O queue pair on the driver's
 * controller using the driver's qpair options. status becomes BIRB_SUCCESS
 * only when the allocation succeeds.
 */
birb_nvme_thread_context::birb_nvme_thread_context(birb_nvme_driver * driver) : status(birb_driver::BIRB_FAIL),
    driver(driver),
    qpair(nullptr)
{
    struct spdk_nvme_ctrlr * controller = driver->get_ctrlr();
    struct spdk_nvme_qpair * allocated = spdk_nvme_ctrlr_alloc_io_qpair(controller, driver->get_io_qpair_opts(), sizeof(struct spdk_nvme_io_qpair_opts));

    if (allocated != nullptr) {
        this->qpair = allocated;
        this->status = birb_driver::BIRB_SUCCESS;
        return;
    }
    ntr(NTR_DEP_USER1, NTR_LEVEL_ERROR, "birb_nvme_thread_context: could not allocate qpairs.\n");
}
/* Status recorded by the constructor. */
birb_driver::birb_driver_status
birb_nvme_thread_context::get_status()
{
    return status;
}
/* Free the I/O queue pair if one was allocated. */
birb_nvme_thread_context::~birb_nvme_thread_context()
{
    if (qpair == nullptr) {
        return;
    }
    spdk_nvme_ctrlr_free_io_qpair(qpair);
}
/*
 * NVMe completion callback for read()/write(): translates the completion
 * entry into a success flag, forwards it to the user callback stored in the
 * cb_context, then frees the cb_context.
 */
void
birb_nvme_thread_context::io_callback(void *arg, const struct spdk_nvme_cpl *completion)
{
    auto pending = reinterpret_cast<struct cb_context *>(arg);
    const bool ok = !spdk_nvme_cpl_is_error(completion);

    pending->cb(ok, pending->ctx);
    delete pending;
}
/*
 * Number of lba_size-byte blocks needed to hold size bytes, rounded up.
 * A size of 0 yields 0 blocks; previously size - 1 wrapped around and
 * produced a bogus count.
 */
uint32_t
birb_nvme_thread_context::size_to_lba(size_t size, int lba_size)
{
    if (size == 0) {
        return 0;
    }
    return (size - 1) / lba_size + 1;
}
/* Index of the lba_size-byte block that contains byte address addr. */
uint64_t
birb_nvme_thread_context::addr_to_lba(size_t addr, int lba_size)
{
    const uint64_t block_index = addr / lba_size;
    return block_index;
}
int
birb_nvme_thread_context::read(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
struct spdk_nvme_ns * ns = this->driver->get_ns();
int lba_size = spdk_nvme_ns_get_sector_size(ns);
return spdk_nvme_ns_cmd_read(ns, this->qpair, buffer, addr_to_lba(offset, lba_size), size_to_lba(size, lba_size), io_callback, reinterpret_cast<void*>(ctx), 0);
}
int
birb_nvme_thread_context::write(size_t offset, size_t size, char * buffer, callback callback, void * context)
{
auto ctx = new struct cb_context;
ctx->cb = callback;
ctx->ctx = context;
struct spdk_nvme_ns * ns = this->driver->get_ns();
int lba_size = spdk_nvme_ns_get_sector_size(ns);
return spdk_nvme_ns_cmd_write(ns, this->qpair, buffer, addr_to_lba(offset, lba_size), size_to_lba(size, lba_size), io_callback, reinterpret_cast<void*>(ctx), 0);
}
void
birb_nvme_thread_context::poll()
{
spdk_nvme_qpair_process_completions(this->qpair, 0);
}

57
storage/io_gen.cc Normal file
View File

@ -0,0 +1,57 @@
#include <sys/endian.h>
#include <random>
#include "nm.hh"
#include "storage/io_gen.hh"
/*
 * Set up an IO generator.
 *
 * req_size  - size reported for every generated request
 * capacity  - size of the address range; offsets are drawn from [0, capacity - 1]
 * read_pct  - percentage of requests generated as reads
 * addr_mode - monotonic-increasing or uniform-random addressing
 *
 * Both random engines are constructed from their random devices and then
 * reseeded from nm_get_uptime_ns().
 */
io_generator::io_generator(
    unsigned long req_size,
    unsigned long capacity,
    unsigned int read_pct,
    io_generator_address_mode addr_mode) :
        cur_offset(0),
        capacity(capacity),
        req_size(req_size),
        read_pct(read_pct),
        addr_mode(addr_mode),
        rng(rd()),
        dist(std::uniform_int_distribution<int>(0, 99)),
        addr_rng(addr_rd()),
        addr_dist(std::uniform_int_distribution<uint64_t>(0, capacity - 1))
{
    /* two separate clock readings, one per engine */
    this->rng.seed(nm_get_uptime_ns());
    this->addr_rng.seed(nm_get_uptime_ns());
}
/*
 * Generate the next IO request into ctx: its size, offset and opcode.
 * For a write, buf is filled with req_size copies of a random byte value.
 * Always returns 0.
 */
int io_generator::issue(struct io_generator_ctx *ctx, char * buf)
{
    ctx->size = req_size;

    // pick the offset of this request
    if (addr_mode != IOGEN_ADDR_MONOTONIC_INCREASING) {
        /* random: draw a byte address and round it down to a request boundary */
        ctx->offset = (addr_dist(addr_rng) / req_size) * req_size;
        if (ctx->offset + req_size > capacity) {
            /* the last, partial slot is pulled back by one request */
            ctx->offset -= req_size;
        }
    } else {
        /* sequential: wrap to the start when the next request would not fit */
        if (cur_offset + req_size > capacity) {
            cur_offset = 0;
        }
        ctx->offset = cur_offset;
        cur_offset += req_size;
    }

    // pick the opcode: draws below read_pct become reads
    const int op_draw = dist(rng);
    if (op_draw >= (int)read_pct) {
        ctx->op = IOGEN_WRITE;
        const int fill = dist(rng);
        memset(buf, fill, req_size);
    } else {
        ctx->op = IOGEN_READ;
    }

    return 0;
}

32
tests/nms_test.c Normal file
View File

@ -0,0 +1,32 @@
#include "nms.h"
#include <assert.h>
#include <stdio.h>
/*
 * Smoke test for the nms allocator: initialise it twice (the second call
 * exercises duplicate init), then request 1G, 511M, 511M and 1G from node 0,
 * asserting that each allocation succeeds and printing its address.
 */
int main(void)
{
    static const struct {
        const char *label;
        size_t bytes;
    } requests[] = {
        { "1G", 1024 * 1024 * 1024 },
        { "511M", 511 * 1024 * 1024 },
        { "511M", 511 * 1024 * 1024 },
        { "1G", 1024 * 1024 * 1024 },
    };
    size_t i;

    nms_init(1);
    // duplicate init
    nms_init(1);

    for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
        void *ret = nms_malloc(0, requests[i].bytes);
        assert(ret != NULL);
        printf("%s: %p\n", requests[i].label, ret);
    }

    return 0;
}

239
util/memloadgen.cc Normal file
View File

@ -0,0 +1,239 @@
#include <sys/endian.h>
#include <sys/select.h>
#include <sys/signal.h>
#include "gen.hh"
#include <array>
#include <atomic>
#include <cstdlib>
#include <cstring>
#include <list>
#include <iostream>
#include <fstream>
#include "ntr.h"
#include "nms.h"
#include <getopt.h>
#include <pthread.h>
#include <unistd.h>
#include <topo.h>
/* Print the memloadgen option summary to stdout and flush it. */
static void
usage()
{
    static const char help_text[] =
        "Usage:\n"
        " -v: verbose mode\n"
        " -b: buffer size\n"
        " -q: bytes per second\n"
        " -d: destination domain index\n"
        " -s: worker threads cpu list\n"
        " -m: pull mode cpu list\n"
        " -S: enable shared buffer\n"
        " -t: time to run\n"
        " -T: transaction size\n"
        " -i: inter arrival time distribution\n"
        " -o: output file path\n"
        " -H: history size for pct adjustment\n"
        " -M: print this string when threads are ready to run\n";

    /* the text has no conversion specifiers, so fputs emits it unchanged */
    fputs(help_text, stdout);
    fflush(stdout);
}
/* Path of the file that receives the samples; replaced by the -o argument in main(). */
static char output_file[256] = "memloadgen_samples.txt";
/*
 * memloadgen entry point.
 *
 * Parses the command line, initializes libtopo and libnms, starts a
 * memload_generator and then, once per second until the requested runtime
 * has elapsed, samples the achieved throughput and appends it to the output
 * file as "s,<second>,<bytes per second>".
 *
 * Until the first rate adjustment, stdin is polled (non-blocking) after every
 * sample. The first line read is parsed as a percentage; the generator's
 * target rate is then set to that percentage of the average of the most
 * recent samples (at most history_sz of them) and a "p,<second>,<pct>" record
 * is written. Only one adjustment is ever applied.
 *
 * Returns 0 on normal completion; exits with status 1 on setup or I/O errors.
 */
int main(int argc, char * argv[])
{
	ntr_init();
	ntr_set_level(NTR_DEP_USER1, NTR_LEVEL_WARNING);

	size_t arr_sz = 64 * 1024 * 1024;
	// -1 wraps to UINT32_MAX, i.e. effectively "run forever" unless -t is given
	uint32_t time = -1;
	uint64_t bps = 0;
	// defaults to the *default* buffer size; a later -b does not change it
	uint64_t transaction_size = arr_sz;
	cpuset_t threads, modes;
	char magic[256] = {0};
	CPU_ZERO(&threads);
	CPU_ZERO(&modes);
	CPU_SET(0, &threads);
	char ia_dist[32] = "fixed";
	int history_sz = 5;
	std::list<uint64_t> history;
	int shared_buffer = 0;
	int rate_ctrl = 0;
	cpuset_t domain_mask;
	CPU_ZERO(&domain_mask);
	CPU_SET(0, &domain_mask);

	{
		int c;
		// parse arguments
		while ((c = getopt(argc, argv, "vhb:d:s:m:So:T:t:q:i:H:M:")) != -1) {
			switch (c) {
			case 'v':
				ntr_set_level(NTR_DEP_USER1, ntr_get_level(NTR_DEP_USER1) + 1);
				break;
			case 'h':
				usage();
				exit(0);
			case 'b':
				arr_sz = strtoull(optarg, nullptr, 10);
				break;
			case 'd':
				cpulist_to_cpuset(optarg, &domain_mask);
				break;
			case 's':
				cpulist_to_cpuset(optarg, &threads);
				break;
			case 'm':
				cpulist_to_cpuset(optarg, &modes);
				break;
			case 'S':
				shared_buffer = 1;
				break;
			case 'o':
				// snprintf always NUL-terminates; strncpy does not when the
				// argument is as long as the buffer
				snprintf(output_file, sizeof(output_file), "%s", optarg);
				break;
			case 't':
				time = strtoul(optarg, nullptr, 10);
				break;
			case 'T':
				transaction_size = strtoul(optarg, nullptr, 10);
				break;
			case 'q':
				bps = (uint64_t)strtoull(optarg, nullptr, 10);
				break;
			case 'i':
				snprintf(ia_dist, sizeof(ia_dist), "%s", optarg);
				break;
			case 'H':
				history_sz = strtol(optarg, nullptr, 10);
				break;
			case 'M':
				snprintf(magic, sizeof(magic), "%s", optarg);
				break;
			default:
				usage();
				exit(0);
			}
		}
	}

	// transaction_size is used as a divisor below; reject 0 up front instead
	// of crashing with an integer division by zero
	if (transaction_size == 0) {
		fprintf(stderr, "transaction size (-T) must be greater than 0\n");
		exit(1);
	}

	// an empty history would make the average below divide by zero
	if (history_sz < 1) {
		fprintf(stderr, "history size (-H) must be at least 1, using 1\n");
		history_sz = 1;
	}

	ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "Configruation:\n"
		" buffer size: %ld\n"
		" num threads: %d\n"
		" target domain: %ld\n"
		" bytes per second: %lu\n"
		" interarrival distribution: %s\n"
		" shared buffer: %d\n"
		" transaction time: %lu\n"
		" runtime: %d\n"
		" history: %d\n"
		" magic: %s\n",
		arr_sz, CPU_COUNT(&threads),
		CPU_FFS(&domain_mask) - 1, bps,
		ia_dist, shared_buffer,
		transaction_size,time, history_sz, magic);

	// init topo
	if (topo_init(ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT)) {
		fprintf(stderr, "libtopo init failed!\n");
		exit(1);
	}

	// init
	if (nms_init(ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT)) {
		fprintf(stderr, "libnms init failed!\n");
		exit(1);
	}

	bool success = false;
	memload_generator::memload_generator_options opts;
	opts.buffer_size = arr_sz;
	opts.trans_per_second = bps / transaction_size;
	opts.shared_buffer = shared_buffer;
	opts.transaction_size = transaction_size;
	opts.verbose = ntr_get_level(NTR_DEP_USER1) != NTR_LEVEL_DEFAULT;
	snprintf(opts.ia_dist, sizeof(opts.ia_dist), "%s", ia_dist);

	std::ofstream ofile;
	ofile.open(output_file, std::ios::out | std::ios::trunc);
	if (!ofile.is_open()) {
		// keep running (samples are still logged through ntr), but say so
		fprintf(stderr, "warning: failed to open output file %s\n", output_file);
	}

	// NOTE(review): `success` is handed to the constructor but never
	// inspected afterwards; start() is the only checked failure point.
	auto mgen = new memload_generator(&threads, &modes, &domain_mask, &opts, &success);

	// announce readiness to whoever is watching stdout
	if (strlen(magic) > 0) {
		fprintf(stdout, "%s\n", magic);
		fflush(stdout);
	}

	if (!mgen->start()) {
		fprintf(stderr, "failed to start memloadgen!\n");
		exit(1);
	}

	// zero timeout: select() below only polls and never blocks
	struct timeval stval;
	stval.tv_sec = 0;
	stval.tv_usec = 0;
	char pct_line[64] = {0};

	uint64_t prev_ts = topo_uptime_ns();
	uint64_t prev_trans = mgen->get_transactions();
	uint32_t cur_time = 0;
	while (cur_time < time) {
		usleep(S2US);

		// throughput over the last interval, in bytes per second
		uint64_t cur_ts = topo_uptime_ns();
		uint64_t trans = mgen->get_transactions();
		uint64_t cur_bps = (uint64_t)((double)((trans - prev_trans) * transaction_size) / ((double)(cur_ts - prev_ts) / (double)S2NS));
		ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "%ldB,%ldM\n", cur_bps, cur_bps / 1024 / 1024);
		ofile << "s," << cur_time << "," << cur_bps << std::endl;
		ofile.flush();

		prev_ts = cur_ts;
		prev_trans = trans;
		cur_time++;

		if (rate_ctrl == 0) {
			// keep history
			history.emplace_back(cur_bps);
			if ((int)history.size() > history_sz) {
				history.pop_front();
			}

			fd_set fdset;
			FD_ZERO(&fdset);
			FD_SET(STDIN_FILENO, &fdset);
			int ret = select(1, &fdset, NULL, NULL, &stval);
			if (ret < 0) {
				if (errno != EINTR) {
					fprintf(stderr, "select() failed with %d\n", errno);
					exit(1);
				}
			} else if (ret > 0) {
				if (FD_ISSET(STDIN_FILENO, &fdset)) {
					ret = read(STDIN_FILENO, pct_line, sizeof(pct_line) - 1);
					if (ret < 0) {
						fprintf(stderr, "read() failed with %d\n", errno);
						exit(1);
					}
					// terminate exactly what was read (ret <= sizeof - 1)
					pct_line[ret] = '\0';

					unsigned int pct = strtoul(pct_line, NULL, 10);

					// average the recorded samples, draining the history
					uint64_t sum = 0;
					size_t sz = history.size();
					while (history.size() > 0) {
						sum += history.front();
						history.pop_front();
					}

					uint64_t newbps = ((sum / sz) * (double)pct / 100.0);
					mgen->set_transactions(newbps / transaction_size);
					ntr(NTR_DEP_USER1, NTR_LEVEL_INFO, "adjusted target bps to %u%% = %ldB ~= %ldM\n", pct, newbps, newbps / 1024 / 1024);
					ofile << "p," << cur_time << "," << pct << std::endl;
					ofile.flush();

					// only the first adjustment is honoured
					rate_ctrl = 1;
				}
			}
		}
	}

	mgen->stop();
	delete mgen;
	ofile.close();
	return 0;
}

237
util/mornafah.c Normal file
View File

@ -0,0 +1,237 @@
#include <stdio.h>
#include <stdlib.h>
#include "nms.h"
#include <getopt.h>
#include <unistd.h>
#include <topo.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <stdatomic.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <errno.h>
#include <stdint.h>
#include <sys/cpuset.h>
#include <sys/sysctl.h>
#include <pthread.h>
#include <pthread_np.h>
// Size in bytes of the buffer requested from nms_malloc() in main().
#define BUFFER_SIZE (128 * 1024 * 1024)
// Number of int elements that fit in BUFFER_SIZE bytes.
#define BUFFER_CNT (BUFFER_SIZE / sizeof(int))
// Handshake flag between the two threads: local_thread sets it to 1 and spins
// until remote_thread has touched the target address and reset it to 0.
static _Atomic int flush = 0;
// Byte offset into remote_buffer of the address being measured; re-randomized
// by local_thread on every iteration when random_access is set.
static _Atomic uint64_t offset = 0;
// Buffer allocated in main() with nms_malloc() for the remote core's NUMA node.
static int * remote_buffer = NULL;
// One measured load latency (TSC cycles) per iteration; allocated in main().
static uint64_t * latencies;
// Iterations left to run (-t); counted down to 0 by local_thread.
static int times = 100;
// Core the measuring thread is pinned to (-l).
static int local_core = 0;
// Core the helper thread is pinned to (-r).
static int remote_core = 1;
// Non-zero: remote_thread reads the address instead of flushing it.
// No command line option in this file sets it, so it stays 0.
static int cache_mode = 0;
// Verbosity, incremented once per -v.
static int verbose = 0;
// Non-zero (-R): pick a random offset for every iteration.
static int random_access = 0;
// TSC frequency read from the machdep.tsc_freq sysctl; stays 0 if that fails.
static uint64_t tsc_freq = 0;
/*
 * Convert a TSC cycle count to nanoseconds.
 *
 * tsc_freq is treated as cycles per second. When it is 0 (the frequency
 * could not be determined) the conversion is impossible: dividing by it
 * would produce inf/NaN and converting that to uint64_t is undefined, so 0
 * is returned instead.
 */
static inline uint64_t cyc2ns(uint64_t cyc)
{
	if (tsc_freq == 0) {
		return 0;
	}

	const double cycles_per_ns = (double)tsc_freq / 1000000000.0;

	return (uint64_t)((double)cyc / cycles_per_ns);
}
/*
 * Sample the CPU time-stamp counter.
 *
 * The counter is read with RDTSCP (the auxiliary value it also produces is
 * discarded) and an LFENCE is issued right after the read, before the value
 * is handed back to the caller.
 *
 * Returns the raw counter value in cycles.
 */
static inline uint64_t read_time(void)
{
	unsigned int aux;
	const uint64_t tsc = __rdtscp(&aux);

	_mm_lfence();
	return tsc;
}
/*
 * Measuring thread.
 *
 * For each of the `times` iterations:
 *   1. optionally pick a new random, int-aligned offset into remote_buffer;
 *   2. raise `flush` and spin until remote_thread has processed the address
 *      and cleared the flag;
 *   3. flush the line from this core as well, then time a single load of the
 *      address with read_time();
 *   4. store the elapsed cycles in latencies[] (filled from the back) and
 *      decrement `times`.
 *
 * arg is unused. The return value is the last value loaded, cast to a
 * pointer (0 if no iteration ran); nothing in this file consumes it.
 */
static void * local_thread(void * arg)
{
	int temp = 0; /* stays 0 when the loop body never runs */
	int *addr;
	uint64_t start, end;

	(void)arg;

	printf("Local thread running...\n");
	while (times > 0) {
		if (random_access) {
			// change offset
			offset = (rand() % BUFFER_CNT) * sizeof(int);
		}

		// hand the address to the remote thread and wait until it is done
		flush = 1;
		while (flush != 0) {
		}

		addr = (int *)((char *)remote_buffer + offset);
		if (verbose > 1) {
			printf("Local thread(%d): flushing %p.\n", local_core, addr);
		}
		_mm_clflush(addr);
		_mm_mfence();

		atomic_signal_fence(memory_order_seq_cst);
		start = read_time();
		// volatile access: the load must really be issued, and must stay
		// between the two timestamps
		temp = *(volatile int *)addr;
		end = read_time();
		atomic_signal_fence(memory_order_seq_cst);

		if (verbose > 1) {
			printf("Local thread(%d): read %p.\n", local_core, addr);
		}

		latencies[times - 1] = end - start;
		times--;
	}

	return (void *)(uintptr_t)temp;
}
/*
 * Helper thread running on the remote core.
 *
 * Spins until local_thread raises `flush`, then either loads the current
 * target address (cache_mode != 0) or flushes it with CLFLUSH
 * (cache_mode == 0), issues an MFENCE, and clears `flush` to release the
 * measuring thread. The loop never terminates; main() returns without
 * joining this thread, so the trailing return is only there to satisfy the
 * pthread start-routine signature.
 *
 * arg is unused.
 */
static void * remote_thread(void * arg)
{
	int temp = 0;
	int * addr;

	(void)arg;

	printf("Remote thread running...\n");
	while (1) {
		// wait for a request from the local thread
		while (flush == 0) {
		}

		addr = (int *)((char *)remote_buffer + offset);
		if (cache_mode) {
			// volatile access: the value is never used, so a plain load
			// could be removed by the compiler and the line would never be
			// pulled into this core's cache
			temp = *(volatile int *)addr;
			_mm_mfence();
		} else {
			_mm_clflush(addr);
			_mm_mfence();
		}

		if (verbose > 1) {
			printf("Remote thread(%d): %p %s.\n", remote_core, addr, cache_mode ? "read into cache" : "flushed");
		}

		// signal completion
		flush = 0;
	}

	return (void *)(uintptr_t)temp;
}
int main(int argc, char * argv[])
{
{
int c;
// parse arguments
while ((c = getopt(argc, argv, "l:r:t:vR")) != -1) {
switch (c) {
case 'l':
local_core = atoi(optarg);
break;
case 'r':
remote_core = atoi(optarg);
break;
case 't':
times = atoi(optarg);
break;
case 'R':
random_access = 1;
break;
case 'v':
verbose++;
break;
default:
exit(1);
}
}
}
srand(time(NULL));
// init topo
if (topo_init(1)) {
fprintf(stderr, "libtopo init failed!\n");
exit(1);
}
// init
if (nms_init(1)) {
fprintf(stderr, "libnms init failed!\n");
exit(1);
}
size_t sz = sizeof(tsc_freq);
int rc;
if ((rc = sysctlbyname("machdep.tsc_freq", &tsc_freq, &sz, NULL, 0)) < 0) {
fprintf(stderr,"failed to query tsc frequency via sysctl (%d)\n", errno);
} else {
fprintf(stdout,"system tsc frequency = %lu\n", tsc_freq);
}
latencies = malloc(sizeof(uint64_t) * times);
const int remote_numa = topo_core_to_numa(remote_core);
const int local_numa = topo_core_to_numa(local_core);
const int total = times;
remote_buffer = nms_malloc(remote_numa, BUFFER_SIZE);
// fill with random values
for (int i = 0; i < BUFFER_SIZE; i++) {
remote_buffer[i] = rand();
}
pthread_attr_t lattr, rattr;
pthread_t lthread, rthread;
cpuset_t lcpuset, rcpuset;
CPU_ZERO(&lcpuset);
CPU_ZERO(&rcpuset);
CPU_SET(local_core, &lcpuset);
CPU_SET(remote_core, &rcpuset);
pthread_attr_init(&rattr);
pthread_attr_setaffinity_np(&rattr, sizeof(cpuset_t), &rcpuset);
pthread_attr_init(&lattr);
pthread_attr_setaffinity_np(&lattr, sizeof(cpuset_t), &lcpuset);
printf("local thread: %d numa: %d, remote: %d numa: %d\n", local_core, local_numa, remote_core, remote_numa);
pthread_create(&lthread, &lattr, local_thread, NULL);
pthread_create(&rthread, &rattr, remote_thread, NULL);
pthread_join(lthread, NULL);
uint64_t min = UINT64_MAX;
uint64_t max = 0;
uint64_t sum = 0;
for (int i = total - 1; i >= 0; i--) {
if (verbose) {
printf("%lu,\n", latencies[i]);
}
if (min > latencies[i]) {
min = latencies[i];
}
if (max < latencies[i]) {
max = latencies[i];
}
sum += latencies[i];
}
double var = 0.0;
double avg = (double)sum / (double)total;
for (int i = total - 1; i >= 0; i--) {
var += pow(latencies[i] - avg, 2);
}
var = sqrt(var / avg);
printf("Avg: %lu cycles (%lu ns)\n"
"Std: %lu cycles (%lu ns)\n"
"Min: %lu cycles (%lu ns)\n"
"Max: %lu cycles (%lu ns)\n",
(uint64_t)avg, cyc2ns((uint64_t)avg),
(uint64_t)var, cyc2ns((uint64_t)var),
min, cyc2ns(min),
max, cyc2ns(max));
free(latencies);
return 0;
}