diff --git a/util/mornafah.c b/util/mornafah.c index 6af9bb3..5901459 100644 --- a/util/mornafah.c +++ b/util/mornafah.c @@ -6,43 +6,84 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include +#include #include #include -#define BUFFER_SIZE (1 * 1024 * 1024) +#define BUFFER_SIZE (128 * 1024 * 1024) +#define BUFFER_CNT (BUFFER_SIZE / sizeof(int)) static _Atomic int flush = 0; +static _Atomic uint64_t offset = 0; static int * remote_buffer = NULL; -static uint64_t latencies[65536] = {0}; -static int times = 10; +static uint64_t * latencies; +static int times = 100; static int local_core = 0; static int remote_core = 1; static int cache_mode = 0; +static int verbose = 0; +static int random_access = 0; +static uint64_t tsc_freq = 0; + +static inline uint64_t cyc2ns(uint64_t cyc) +{ + return tsc_freq ? (uint64_t)((double)cyc / ((double)tsc_freq / 1000000000.0)) : 0; /* 0 when the TSC frequency is unknown (sysctl query failed) */ +} + +static inline uint64_t read_time(void) +{ + uint64_t l; + unsigned int a; + l = __rdtscp(&a); + _mm_lfence(); + return l; +} static void * local_thread(void *) { - int temp; - unsigned int dummy; - uint64_t start, end, base; + int temp, *addr; + uint64_t start, end; printf("Local thread running...\n"); while(times > 0) { + if (random_access) { + // change offset + offset = (rand() % BUFFER_CNT) * sizeof(int); + } + flush = 1; while(flush != 0) { } - - _mm_clflush(remote_buffer); - start = __rdtscp(&dummy); - end = __rdtscp(&dummy); - base = end - start; + addr = (int *)((char *)remote_buffer + offset); - start = __rdtscp(&dummy); - temp = *remote_buffer; - end = __rdtscp(&dummy); + if (verbose > 1) { + printf("Local thread(%d): flushing %p.\n", local_core, addr); + } - latencies[times - 1] = end - start - base; + _mm_clflushopt(addr); + _mm_mfence(); + + atomic_signal_fence(memory_order_seq_cst); + + start = read_time(); + temp = *(volatile int *)addr; /* volatile: the timed load must not be optimized away */ + end = read_time(); + + atomic_signal_fence(memory_order_seq_cst); + + if (verbose > 1) { + printf("Local thread(%d): read %p.\n", local_core, addr); +
} + + latencies[times - 1] = end - start; times--; } @@ -52,14 +93,24 @@ static void * local_thread(void *) static void * remote_thread(void *) { int temp; + int * addr; printf("Remote thread running...\n"); while(1) { while(flush == 0) { } + + addr = (int *)((char *)remote_buffer + offset); + if(cache_mode) { - temp = *remote_buffer; + temp = *(volatile int *)addr; /* volatile: the load must actually be issued */ + _mm_mfence(); } else { - _mm_clflush(remote_buffer); + _mm_clflushopt(addr); + _mm_mfence(); + } + + if (verbose > 1) { + printf("Remote thread(%d): %p %s.\n", remote_core, addr, cache_mode ? "read into cache" : "flushed"); } flush = 0; @@ -72,7 +123,7 @@ int main(int argc, char * argv[]) { int c; // parse arguments - while ((c = getopt(argc, argv, "l:r:t:m:")) != -1) { + while ((c = getopt(argc, argv, "l:r:t:vR")) != -1) { switch (c) { case 'l': local_core = atoi(optarg); @@ -83,8 +134,11 @@ int main(int argc, char * argv[]) case 't': times = atoi(optarg); break; - case 'm': - cache_mode = atoi(optarg); + case 'R': + random_access = 1; + break; + case 'v': + verbose++; break; default: exit(1); @@ -92,6 +146,8 @@ int main(int argc, char * argv[]) } } + srand(time(NULL)); + // init topo if (topo_init(1)) { fprintf(stderr, "libtopo init failed!\n"); @@ -104,12 +160,24 @@ int main(int argc, char * argv[]) exit(1); } - int remote_numa = topo_core_to_numa(remote_core); - int local_numa = topo_core_to_numa(local_core); - int total = times; + size_t sz = sizeof(tsc_freq); + int rc; + if ((rc = sysctlbyname("machdep.tsc_freq", &tsc_freq, &sz, NULL, 0)) < 0) { + fprintf(stderr,"failed to query tsc frequency via sysctl (%d)\n", errno); + } else { + fprintf(stdout,"system tsc frequency = %lu\n", tsc_freq); + } - remote_buffer = nms_alloc_static(remote_numa, BUFFER_SIZE); - *remote_buffer = 0xffa5be6c; + latencies = malloc(sizeof(uint64_t) * times); + const int remote_numa = topo_core_to_numa(remote_core); + const int local_numa = topo_core_to_numa(local_core); + const int total = times; + + remote_buffer = 
nms_malloc(remote_numa, BUFFER_SIZE); + // fill with random values + for (size_t i = 0; i < BUFFER_CNT; i++) { /* BUFFER_CNT ints, not BUFFER_SIZE bytes: avoids writing past the buffer */ + remote_buffer[i] = rand(); + } pthread_attr_t lattr, rattr; pthread_t lthread, rthread; @@ -131,12 +199,39 @@ int main(int argc, char * argv[]) pthread_join(lthread, NULL); + uint64_t min = UINT64_MAX; + uint64_t max = 0; uint64_t sum = 0; for (int i = total - 1; i >= 0; i--) { - printf("%lu\n", latencies[i]); + if (verbose) { + printf("%lu,\n", latencies[i]); + } + if (min > latencies[i]) { + min = latencies[i]; + } + if (max < latencies[i]) { + max = latencies[i]; + } sum += latencies[i]; } - printf("Avg: %lu\n", sum / total); + + double var = 0.0; + double avg = (double)sum / (double)total; + for (int i = total - 1; i >= 0; i--) { + var += pow(latencies[i] - avg, 2); + } + var = sqrt(var / (double)total); /* std dev: divide the squared deviations by the sample count, not the mean */ + + printf("Avg: %lu cycles (%lu ns)\n" + "Std: %lu cycles (%lu ns)\n" + "Min: %lu cycles (%lu ns)\n" + "Max: %lu cycles (%lu ns)\n", + (uint64_t)avg, cyc2ns((uint64_t)avg), + (uint64_t)var, cyc2ns((uint64_t)var), + min, cyc2ns(min), + max, cyc2ns(max)); + + free(latencies); return 0; }