#include #include #include "nms.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define BUFFER_SIZE (128 * 1024 * 1024) #define BUFFER_CNT (BUFFER_SIZE / sizeof(int)) static _Atomic int flush = 0; static _Atomic uint64_t offset = 0; static int * remote_buffer = NULL; static uint64_t * latencies; static int times = 100; static int local_core = 0; static int remote_core = 1; static int cache_mode = 0; static int verbose = 0; static int random_access = 0; static uint64_t tsc_freq = 0; static inline uint64_t cyc2ns(uint64_t cyc) { return (double)cyc / ((double)tsc_freq / 1000000000.0); } static inline uint64_t read_time(void) { uint64_t l; unsigned int a; l = __rdtscp(&a); _mm_lfence(); return l; } static void * local_thread(void *) { int temp, *addr; uint64_t start, end; printf("Local thread running...\n"); while(times > 0) { if (random_access) { // change offset offset = (rand() % BUFFER_CNT) * sizeof(int); } flush = 1; while(flush != 0) { } addr = (int *)((char *)remote_buffer + offset); if (verbose > 1) { printf("Local thread(%d): flushing %p.\n", local_core, addr); } _mm_clflush(addr); _mm_mfence(); atomic_signal_fence(memory_order_seq_cst); start = read_time(); temp = *addr; end = read_time(); atomic_signal_fence(memory_order_seq_cst); if (verbose > 1) { printf("Local thread(%d): read %p.\n", local_core, addr); } latencies[times - 1] = end - start; times--; } return (void *)(uintptr_t)temp; } static void * remote_thread(void *) { int temp; int * addr; printf("Remote thread running...\n"); while(1) { while(flush == 0) { } addr = (int *)((char *)remote_buffer + offset); if(cache_mode) { temp = *addr; _mm_mfence(); } else { _mm_clflush(addr); _mm_mfence(); } if (verbose > 1) { printf("Remote thread(%d): %p %s.\n", remote_core, addr, cache_mode ? "read into cache" : "flushed"); } flush = 0; } return (void *)(uintptr_t)temp; } int main(int argc, char * argv[]) { { int c; // parse arguments while ((c = getopt(argc, argv, "l:r:t:vR")) != -1) { switch (c) { case 'l': local_core = atoi(optarg); break; case 'r': remote_core = atoi(optarg); break; case 't': times = atoi(optarg); break; case 'R': random_access = 1; break; case 'v': verbose++; break; default: exit(1); } } } srand(time(NULL)); // init topo if (topo_init(1)) { fprintf(stderr, "libtopo init failed!\n"); exit(1); } // init if (nms_init(1)) { fprintf(stderr, "libnms init failed!\n"); exit(1); } size_t sz = sizeof(tsc_freq); int rc; if ((rc = sysctlbyname("machdep.tsc_freq", &tsc_freq, &sz, NULL, 0)) < 0) { fprintf(stderr,"failed to query tsc frequency via sysctl (%d)\n", errno); } else { fprintf(stdout,"system tsc frequency = %lu\n", tsc_freq); } latencies = malloc(sizeof(uint64_t) * times); const int remote_numa = topo_core_to_numa(remote_core); const int local_numa = topo_core_to_numa(local_core); const int total = times; remote_buffer = nms_malloc(remote_numa, BUFFER_SIZE); // fill with random values for (int i = 0; i < BUFFER_SIZE; i++) { remote_buffer[i] = rand(); } pthread_attr_t lattr, rattr; pthread_t lthread, rthread; cpuset_t lcpuset, rcpuset; CPU_ZERO(&lcpuset); CPU_ZERO(&rcpuset); CPU_SET(local_core, &lcpuset); CPU_SET(remote_core, &rcpuset); pthread_attr_init(&rattr); pthread_attr_setaffinity_np(&rattr, sizeof(cpuset_t), &rcpuset); pthread_attr_init(&lattr); pthread_attr_setaffinity_np(&lattr, sizeof(cpuset_t), &lcpuset); printf("local thread: %d numa: %d, remote: %d numa: %d\n", local_core, local_numa, remote_core, remote_numa); pthread_create(<hread, &lattr, local_thread, NULL); pthread_create(&rthread, &rattr, remote_thread, NULL); pthread_join(lthread, NULL); uint64_t min = UINT64_MAX; uint64_t max = 0; uint64_t sum = 0; for (int i = total - 1; i >= 0; i--) { if (verbose) { printf("%lu,\n", latencies[i]); } if (min > latencies[i]) { min = latencies[i]; } if (max < latencies[i]) { max = latencies[i]; } sum += latencies[i]; } double var = 0.0; double avg = (double)sum / (double)total; for (int i = total - 1; i >= 0; i--) { var += pow(latencies[i] - avg, 2); } var = sqrt(var / avg); printf("Avg: %lu cycles (%lu ns)\n" "Std: %lu cycles (%lu ns)\n" "Min: %lu cycles (%lu ns)\n" "Max: %lu cycles (%lu ns)\n", (uint64_t)avg, cyc2ns((uint64_t)avg), (uint64_t)var, cyc2ns((uint64_t)var), min, cyc2ns(min), max, cyc2ns(max)); free(latencies); return 0; }