numam/scripts/netexp.py
2022-11-22 16:27:27 +01:00

341 lines
11 KiB
Python

import time
import subprocess as sp
import os
import libpar as par
import libtc as tc
import libmechspec as mechspec
class NetExpResult:
    """Holds what was collected from one experiment run.

    Attributes (all None until populated by __keep_result()):
        parser: the par.khat_parser() instance that parsed the sample file.
        pmc_parser: PMC data; either a par.pmc_parser(...) object or, in
            sampling mode, a two-element list [raw bytes, processed text].
        sample: raw text of the sample file fetched from the master.
    """

    def __init__(self):
        # Every field starts out empty.
        self.parser = self.pmc_parser = self.sample = None
class NetExpConf:
    """Settings for one experiment: server, clients, master, memloadgen and PMC.

    The fields are plain attributes that callers overwrite after construction.
    Once the mechspecs are assigned, call finalize_mechspecs(); it creates the
    `clt_fqdns`, `srv_fqdns` and `mst_fqdns` host lists that stop_all(), setup()
    and run() read. Those three attributes do not exist before that call.
    """

    def __init__(self):
        # Remote directory that holds the binaries and the output files.
        self.root_dir = ""

        # When True the server process is neither started nor stopped.
        self.enable_client_only = False

        # memloadgen options (-s affinity, -i iteration, -b size, -d tgtdom).
        self.enable_memgen = False
        self.memgen_affinity = ""
        self.memgen_iteration = -1
        self.memgen_size = 512 * 1024 * 1024
        self.memgen_tgtdom = 1

        # Server (khat) options.
        self.srv_affinity = ""
        self.srv_mechspec = None
        self.srv_port = 0

        # Client (rat) options. clt_mechspecs holds one mechspec per client.
        self.clt_qps = 0
        self.clt_mechspecs = []
        self.clt_affinity = "1"
        self.clt_wrkld = 0
        self.clt_wrkarg0 = "fixed:0"
        self.clt_wrkarg1 = "fixed:0"
        self.clt_pkt_loss_lat = 1000
        self.clt_rage_quit_lat = 1000
        self.clt_port = 0
        self.clt_pkt_pad = 0
        self.clt_pkt_depth = 1
        self.clt_ia = "exponential"

        # Master (cat) options. mst_duration and mst_warmup also determine the
        # timeout used by run().
        self.mst_mechspec = None
        self.mst_affinity = "2"
        self.mst_qps = 100
        self.mst_port = 0
        self.mst_pkt_loss_lat = 1000
        self.mst_pkt_loss_max = 1000
        self.mst_duration = 10
        self.mst_warmup = 5
        self.mst_ia = "exponential"

        # pmcstat options on the server.
        self.enable_pmc = False
        self.pmc_counters = []
        self.pmc_mode = 0 # 0 = sampling, anything else = counting (pmcstat -C)
        self.pmc_sampling_rate = 8192
        self.pmc_counting_interval = 0.1

    def __build_fqdn_arr(self, ns):
        """Return the `fqdn` of every entry in `ns`, skipping None entries."""
        return [n.fqdn for n in ns if n is not None]

    def get_pmc_str(self):
        """Return the PMC counter names as one comma-separated string."""
        return ",".join(self.pmc_counters)

    def calc_client_qps(self):
        """Return the per-client QPS, or 0 when `clt_qps` is 0.

        The master's QPS is subtracted from `clt_qps` and the remainder is
        split evenly across the client mechspecs, truncated to an int.

        Raises:
            ZeroDivisionError: `clt_qps` is non-zero and `clt_mechspecs` is empty.
        """
        if self.clt_qps == 0:
            return 0
        return int((self.clt_qps - self.mst_qps) / len(self.clt_mechspecs))

    def finalize_mechspecs(self):
        """Derive the client/server/master host lists from the mechspecs.

        Must be called again after any mechspec is changed.
        """
        self.clt_fqdns = self.__build_fqdn_arr(self.clt_mechspecs)
        self.srv_fqdns = self.__build_fqdn_arr([self.srv_mechspec])
        self.mst_fqdns = self.__build_fqdn_arr([self.mst_mechspec])
# Name of the sample output file: the master writes it under conf.root_dir
# (its -o argument in run()) and __keep_result() copies it to tc.get_odir().
__SAMPLE_FN = "sample.txt.tmp"
# Name of the pmcstat output file written under conf.root_dir on the server.
__PMC_FN = "pmc.txt.tmp"
def __keep_result(conf : NetExpConf):
    """Fetch the output files of a finished run and load them into a NetExpResult.

    The sample file is copied from the master host, read into `result.sample`
    and parsed by a fresh par.khat_parser(). When `conf.enable_pmc` is set, the
    pmcstat output is copied from the server as well:

    * counting mode (`pmc_mode != 0`): the text is handed to par.pmc_parser().
    * sampling mode (`pmc_mode == 0`): the log is first post-processed on the
      server with `pmcstat -R ... -m ...`; `result.pmc_parser` then becomes
      `[raw log bytes, processed text]`.

    Each local copy is deleted once it has been read.

    Args:
        conf: finalized experiment configuration.

    Returns:
        The populated NetExpResult.

    Raises:
        subprocess.CalledProcessError: an scp transfer failed.
        OSError: a local copy could not be opened or removed.
    """
    def fetch(fqdn, file_name):
        # Copy <root_dir>/<file_name> from `fqdn` into the local output directory.
        local_path = tc.get_odir() + "/" + file_name
        # An argv list (no shell) keeps paths with spaces or shell
        # metacharacters from being re-interpreted locally.
        argv = ["scp", "-P77",
                tc.get_ssh_user() + "@" + fqdn + ":" + conf.root_dir + "/" + file_name,
                local_path]
        tc.log_print(" ".join(argv))
        sp.check_call(argv)
        return local_path

    def discard(local_path):
        # Delete a local copy; logged in the same form as the shell command it replaces.
        tc.log_print("rm " + local_path)
        os.remove(local_path)

    result = NetExpResult()

    sample_path = fetch(conf.mst_mechspec.fqdn, __SAMPLE_FN)
    result.parser = par.khat_parser()
    with open(sample_path, "r") as f:
        result.sample = f.read()
    result.parser.parse(result.sample)
    discard(sample_path)

    if not conf.enable_pmc:
        return result

    pmc_path = fetch(conf.srv_mechspec.fqdn, __PMC_FN)
    if conf.pmc_mode != 0:
        # Counting mode: the output is text that the PMC parser consumes directly.
        with open(pmc_path, "r") as f:
            result.pmc_parser = par.pmc_parser(f.read())
    else:
        # Sampling mode: post-process the log on the server, then fetch the result too.
        remote_pmc = conf.root_dir + "/" + __PMC_FN
        proc_cmd = "sudo pmcstat -R " + remote_pmc + " -m " + remote_pmc + ".proc"
        tc.log_print(proc_cmd)
        tc.remote_exec(conf.srv_fqdns, proc_cmd)

        proc_path = fetch(conf.srv_mechspec.fqdn, __PMC_FN + ".proc")
        # The raw log is read as bytes, the processed file as text.
        with open(pmc_path, "rb") as raw, open(proc_path, "r") as processed:
            result.pmc_parser = [raw.read(), processed.read()]
        discard(proc_path)
    discard(pmc_path)

    return result
def stop_all(conf : NetExpConf):
    """Force-kill the experiment processes on all hosts taking part in the run.

    rat, cat, khat and memloadgen are killed on the clients and on the master,
    and also on the server unless `conf.enable_client_only` is set. pmcstat is
    killed on the server when `conf.enable_pmc` is set. Every command runs with
    `check=False`.

    Args:
        conf: finalized experiment configuration.
    """
    kill_cmd = "sudo killall -9 rat; sudo killall -9 cat; sudo killall -9 khat; sudo killall -9 memloadgen"

    # (log banner, host list) pairs, in the order they are stopped.
    targets = [
        ("Stopping clients...", conf.clt_fqdns),
        ("Stopping master...", conf.mst_fqdns),
    ]
    if not conf.enable_client_only:
        targets.append(("Stopping server...", conf.srv_fqdns))

    for banner, hosts in targets:
        tc.log_print(banner)
        tc.remote_exec(hosts, kill_cmd, check=False)

    if conf.enable_pmc:
        tc.log_print("Stopping server PMC...")
        tc.remote_exec(conf.srv_fqdns, "sudo killall -9 pmcstat", check=False)
def __run_setup_cmd(conf : NetExpConf, cmd : str, desc : str):
    """Run `cmd` on the server, client and master hosts and print the outcome per host.

    The command is launched on every host before any result is collected
    (`blocking=False`), so the hosts presumably work in parallel; confirm
    against tc.remote_exec. A non-zero exit status is reported together with
    the captured stderr but does not raise.

    Args:
        conf: finalized experiment configuration.
        cmd: shell command to execute remotely.
        desc: short label used in the progress messages.
    """
    hosts = [*conf.srv_fqdns, *conf.clt_fqdns, *conf.mst_fqdns]

    # Phase 1: launch everywhere without waiting.
    pending = []
    for host in hosts:
        tc.log_print(f"Running '{desc}' on {host}...")
        launched = tc.remote_exec([host], cmd, blocking=False, check=False)
        pending.append((host, launched[0]))

    # Phase 2: wait for each process and report its status.
    for host, proc in pending:
        _, err = proc.communicate()
        if proc.returncode == 0:
            print(f"{host} '{desc}' succeeded")
        else:
            print(f"{host} '{desc}' failed. stderr:\n{err.decode()}\n")
def setup(conf : NetExpConf, bench : False, dpdk : False):
libtopo_path = "/libtopo"
dpdk_path = "/dpdk"
bench_path = "/numam.d"
if dpdk:
setup_cmd = f'''sudo rm -rf {libtopo_path}; sudo rm -rf /usr/local/include/libtopo;
sudo rm -rf /usr/local/lib/libtopo;
sudo mkdir -p {libtopo_path};
sudo chmod 777 {libtopo_path};
cd {libtopo_path};
git clone https://git.quacker.org/d/libtopo;
cd libtopo;
mkdir build;
cd build;
cmake ../;
sudo make install'''
__run_setup_cmd(conf, setup_cmd, "dpdk - libtopo")
setup_cmd = f'''sudo pkg install -y meson pkgconf py39-pyelftools;
sudo rm -rf {dpdk_path}
sudo mkdir -p {dpdk_path};
sudo chmod 777 {dpdk_path};
cd {dpdk_path};
git clone https://git.quacker.org/d/numam-dpdk;
cd numam-dpdk;
git checkout migration;
CC=gcc CXX=g++ meson -Denable_kmods=true build;
cd build;
sudo ninja install'''
__run_setup_cmd(conf, setup_cmd, "dpdk - dpdk")
if bench:
setup_cmd = f'''sudo rm -rf {bench_path};
sudo mkdir -p {bench_path};
sudo chmod 777 {bench_path}'''
__run_setup_cmd(conf, setup_cmd, "bench - remove")
all = []
all.extend(conf.srv_fqdns)
all.extend(conf.clt_fqdns)
all.extend(conf.mst_fqdns)
dir = f"{os.path.dirname(__file__)}/../"
for clt in all:
print("Syncing files to " + clt + "...")
rsync_cmd = f"rsync -az --no-perms --rsync-path=\"sudo rsync\" --omit-dir-times -e \"ssh -p77\" {dir} {tc.get_ssh_user()}@{clt}:{bench_path}/"
sp.check_call(rsync_cmd, shell=True)
setup_cmd = f'''cd {bench_path};
sudo rm -rf build;
mkdir build;
cd build;
cmake ../;
make -j8 khat cat rat memloadgen'''
__run_setup_cmd(conf, setup_cmd, "bench - compile")
def run(conf : NetExpConf):
    """Run the experiment described by `conf`, retrying until an attempt succeeds.

    One attempt:
      1. optionally start pmcstat on the server (`conf.enable_pmc`),
      2. start the khat server unless `conf.enable_client_only` is set,
      3. optionally start memloadgen on the server (`conf.enable_memgen`),
      4. start one rat client per entry of `conf.clt_fqdns`, then wait 5 s,
      5. start the cat master and register every process with the tc.errthr_*
         stderr monitors,
      6. poll once per second until the master process exits (success), the
         monitors report a failure, or (mst_warmup + mst_duration) * 3 seconds
         have elapsed,
      7. stop everything and wait 5 s.

    The master's exit status is not inspected; the master exiting at all
    counts as success.

    Args:
        conf: finalized experiment configuration.

    Returns:
        The NetExpResult built by __keep_result() for the first successful
        attempt. The function keeps retrying (and never returns) while every
        attempt fails.
    """
    stop_all(conf)
    while True:
        if conf.enable_pmc:
            if conf.pmc_mode != 0:
                # counting mode
                pmc_cmd = "sudo pmcstat -C -w " + str(conf.pmc_counting_interval) + " -s " + conf.get_pmc_str() + " -o " + conf.root_dir + "/" + __PMC_FN
            else:
                # sampling mode
                pmc_cmd = "sudo pmcstat -n " + str(conf.pmc_sampling_rate) + " -S " + conf.get_pmc_str() + " -O " + conf.root_dir + "/" + __PMC_FN
            tc.log_print("Starting server PMC...")
            tc.log_print(pmc_cmd)
            spmc = tc.remote_exec(conf.srv_fqdns, pmc_cmd, blocking=False)

        # -J is added on both server and clients once the configured pad exceeds 1518.
        use_jumbo = int(conf.clt_pkt_pad) > 1518

        server_cmd = ("sudo " + conf.root_dir + "/khat --log-level lib.eal:err -- -A " + conf.srv_affinity +
                      " -H " + conf.srv_mechspec.netspec + " -p " + str(conf.srv_port))
        if use_jumbo:
            server_cmd += " -J "

        if conf.enable_client_only:
            # The server is managed elsewhere; only log the command it would need.
            ssrv = None
            tc.log_print(server_cmd)
        else:
            # start server
            tc.log_print("Starting server...")
            tc.log_print(server_cmd)
            ssrv = tc.remote_exec(conf.srv_fqdns, server_cmd, blocking=False)

        if conf.enable_memgen:
            memgen_cmd = ("sudo " + conf.root_dir + "/memloadgen -b " + str(conf.memgen_size) + " -s " + conf.memgen_affinity +
                          " -i " + str(conf.memgen_iteration) + " -d " + str(conf.memgen_tgtdom))
            tc.log_print("Starting memloadgen...")
            tc.log_print(memgen_cmd)
            smem = tc.remote_exec(conf.srv_fqdns, memgen_cmd, blocking=False)

        # start clients
        tc.log_print("Starting clients...")
        sclt = []
        sclt_name = []
        for i, clt_fqdn in enumerate(conf.clt_fqdns):
            client_cmd = ("sudo " + conf.root_dir + "/rat --log-level lib.eal:err -- -S -A " + conf.clt_affinity +
                          " -i " + conf.clt_ia +
                          " -q " + str(conf.calc_client_qps()) +
                          " -H " + conf.clt_mechspecs[i].netspec +
                          " -s " + conf.srv_mechspec.netspec +
                          " -r " + str(conf.clt_rage_quit_lat) +
                          " -l " + str(conf.clt_pkt_loss_lat) +
                          " -w " + str(conf.clt_wrkld) +
                          " -w " + str(conf.clt_wrkarg0) +
                          " -w " + str(conf.clt_wrkarg1) +
                          " -P " + str(conf.clt_pkt_pad) +
                          " -D " + str(conf.clt_pkt_depth) +
                          " -p " + str(conf.clt_port))
            if use_jumbo:
                client_cmd += " -J "
            tc.log_print(client_cmd)
            sclt.append(tc.remote_exec([clt_fqdn], client_cmd, blocking=False)[0])
            sclt_name.append(clt_fqdn)

        # Fixed 5 s pause between launching the clients and the master.
        time.sleep(5)

        # start master
        tc.log_print("Starting master...")
        master_cmd = ("sudo " + conf.root_dir + "/cat --log-level lib.eal:err -- " +
                      " -s " + conf.srv_mechspec.netspec +
                      " -o " + conf.root_dir + "/" + __SAMPLE_FN +
                      " -t " + str(conf.mst_duration) +
                      " -T " + str(conf.mst_warmup) +
                      " -i " + conf.mst_ia +
                      " -q " + str(conf.mst_qps) +
                      " -l " + str(conf.mst_pkt_loss_lat) +
                      " -L " + str(conf.mst_pkt_loss_max) +
                      " -A " + conf.mst_affinity +
                      " -H " + conf.mst_mechspec.netspec +
                      " -p " + str(conf.mst_port))
        for clt in conf.clt_mechspecs:
            master_cmd += " -S " + clt.netspec
        tc.log_print(master_cmd)
        # Named so it does not shadow the module-level `sp` (subprocess) alias.
        mst_procs = tc.remote_exec(conf.mst_fqdns, master_cmd, blocking=False)
        mst_proc = mst_procs[0]

        # launch stderr monitoring thread
        # `exclude` is passed to every monitor; presumably stderr lines
        # containing these substrings are not treated as failures (confirm in libtc).
        exclude = ["Pseudo-terminal", "ice_", "i40e_"]
        tc.errthr_create([mst_proc], conf.mst_fqdns, exclude)
        if not conf.enable_client_only:
            tc.errthr_create(ssrv, conf.srv_fqdns, exclude)
        tc.errthr_create(sclt, sclt_name, exclude)
        if conf.enable_memgen:
            tc.errthr_create(smem, ["memloadgen"], exclude)
        if conf.enable_pmc:
            tc.errthr_create(spmc, ["pmcstat"], exclude)
        tc.errthr_start()

        success = False
        elapsed = 0
        timeout = (conf.mst_warmup + conf.mst_duration) * 3
        while True:
            # either failed or timeout
            # we use failure detection to save time for long durations
            if tc.errthr_get_failed() or elapsed >= timeout:
                break

            if mst_proc.poll() is not None:
                success = True
                break

            time.sleep(1)
            elapsed += 1

        stop_all(conf)
        tc.errthr_stop()
        tc.log_print("Cooling down...")
        time.sleep(5)

        if success:
            return __keep_result(conf)