/*
 * my crude pointer chasing experiment.
 *
 * dean@arctic.org
 *
 * this code is public domain.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>

char *buffer;

#define MAX_SIZE     (4*1024*1024)
#define SHARED_ALIGN (4*1024*1024)
#define SHARED_SIZE  (8192)

#define x4(x)     x x x x
#define x16(x)    x4(x) x4(x) x4(x) x4(x)
#define x64(x)    x16(x) x16(x) x16(x) x16(x)
#define xloads(x) x64(x) x64(x) x64(x) x64(x)

#define N_LOADS   (256)  // number of loads in inner loop
#define N_ITER    (1000) // number of times to iterate inner loop
#define N_SAMPLES (20)   // number of samples to take

uint32_t not_dead;

#define ld_template(n) \
void n##_loads(unsigned iter, char *vp) \
{ \
    uint32_t *p = (uint32_t *)n##_fudge((uintptr_t)(vp)); \
\
    asm volatile(".align 16"); \
    do { \
        --iter; \
        xloads(n##_deref) \
    } while (iter > 0); \
\
    not_dead = (uint32_t)p; \
} \
\
void n##_setup(unsigned cycle_length, unsigned stride) \
{ \
    int i; \
    char *p; \
\
    p = buffer; \
    for (i = 0; i < cycle_length - 1; ++i) { \
        /*printf("\n%08x %08x", p + i*stride, (i*stride) & (SHARED_SIZE-1));*/ \
        *(uint32_t *)(p + i*stride) = (uintptr_t)n##_fudge((uintptr_t)(p + (i+1)*stride)); \
    } \
    /*printf("\n%08x %08x\t", p + i*stride, (i*stride) & (SHARED_SIZE-1)); */ \
    *(uint32_t *)(p + i*stride) = (uintptr_t)n##_fudge((uintptr_t)(p)); \
}

#define ld_ld_deref asm volatile("mov (%0),%0" : "=&r" (p) : "0" (p));
#define ld_ld_fudge(x) (x)
ld_template(ld_ld)

#define ld_offs_ld_deref asm volatile("mov 4(%0),%0" : "=&r" (p) : "0" (p));
#define ld_offs_ld_fudge(x) ((x) - 4)
ld_template(ld_offs_ld)

#define ld_add_ld_deref asm volatile( \
    "\nadd $4,%0" \
    "\nmov (%0),%0" \
    : "=&r" (p) : "0" (p));
#define ld_add_ld_fudge(x) ((x) - 4)
ld_template(ld_add_ld)

#define ld_xor_ld_deref asm volatile( \
    "\nxor $0x54,%0" \
    "\nmov (%0),%0" \
    : "=&r" (p) : "0" (p));
#define ld_xor_ld_fudge(x) ((x) ^ 0x54)
ld_template(ld_xor_ld)
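/*
 * For reference (an illustration, not part of the build): ld_template(ld_ld)
 * above expands to roughly the following pair of functions.  The _loads()
 * function chases the pointer cycle N_LOADS times per iteration, and the
 * _setup() function writes the cycle into the buffer, each uint32_t cell
 * holding the (fudged) address of the next cell:
 *
 *     void ld_ld_loads(unsigned iter, char *vp)
 *     {
 *         uint32_t *p = (uint32_t *)(uintptr_t)vp;
 *
 *         asm volatile(".align 16");
 *         do {
 *             --iter;
 *             // 256 copies of: asm volatile("mov (%0),%0" : "=&r" (p) : "0" (p));
 *         } while (iter > 0);
 *
 *         not_dead = (uint32_t)p;
 *     }
 *
 * The _fudge() hook lets each variant pre-bias the stored pointers so that
 * the extra ALU op (add/xor) or addressing offset in its deref still lands
 * on the right cell.
 */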
static inline uint64_t rdtsc(void)
{
    uint64_t result;

    asm volatile("rdtsc" : "=A" (result));
    return result;
}

double bench_one_config(void (*loads)(unsigned, char *))
{
    uint64_t start_tsc, end_tsc;
    double sample[N_SAMPLES];
    double best;
    int i;

    // we run this several times which causes it to be translated by CMS
    // (Transmeta's Code Morphing Software), or cached in the trace cache
    // on p4 ... we take only the best score.
    for (i = 0; i < N_SAMPLES; ++i) {
        start_tsc = rdtsc();
        loads(N_ITER, buffer);
        end_tsc = rdtsc();
        asm("emms");
        sample[i] = (double)(end_tsc - start_tsc) / (N_LOADS*N_ITER*1.0);
    }
    best = sample[0];
    for (i = 1; i < N_SAMPLES; ++i) {
        if (sample[i] < best) {
            best = sample[i];
        }
    }
    return best;
}

struct {
    unsigned cycle_length;
    unsigned stride;
} configs[] = {
    { 8, 4 },
#if 0
    { 128, 0x40 },
    { 256, 0x40 },
    { 512, 0x40 },
    { 32, 0x1040 },
    { 64, 0x1040 },
    { 96, 0x1040 },
    { 128, 0x1040 },
    { 256, 0x1040 },
    { 512, 0x1040 },
#elif 0
    { 8, 0x1020 },
    { 16, 0x1020 },
    { 32, 0x1020 },
    { 64, 0x1020 },
    { 128, 0x1020 },
#elif 1
    { 2, 0x1004 },
    { 4, 0x1004 },
    { 8, 0x1004 },
    { 16, 0x1004 },
    { 32, 0x1004 },
    { 64, 0x1004 },
    { 72, 0x1004 },
    { 80, 0x1004 },
    { 96, 0x1004 },
#endif
};

void bench(const char *name, void (*loads)(unsigned, char *),
           void (*setup)(unsigned cycle_length, unsigned stride))
{
    int i;

    printf("%-40s", name);
    for (i = 0; i < sizeof(configs)/sizeof(configs[0]); ++i) {
        setup(configs[i].cycle_length, configs[i].stride);
        printf(" %5.1f ", bench_one_config(loads));
    }
    printf("\n");
}

int main(int argc, char **argv)
{
    int i;

    setvbuf(stdout, NULL, _IONBF, BUFSIZ);

#if 0
    buffer = mmap(0, MAX_SIZE + SHARED_ALIGN, PROT_READ|PROT_WRITE,
                  MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (buffer == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }
    // round buffer start to SHARED_ALIGN
    buffer = (char *)(((uintptr_t)buffer + SHARED_ALIGN - 1) & ~(SHARED_ALIGN - 1));
#else
    {
        char wrbuf[1024];
        char fname[] = "/var/tmp/chaseXXXXXX";
        int fd;
        int rc;

        fd = mkstemp(fname);
        if (fd < 0) {
            perror("mkstemp");
            exit(1);
        }
        unlink(fname);
        memset(wrbuf, 0, sizeof(wrbuf));
        for (i = 0; i < SHARED_SIZE; ) {
            rc = write(fd, wrbuf,
                       (SHARED_SIZE - i) > sizeof(wrbuf) ? sizeof(wrbuf) : (SHARED_SIZE - i));
            if (rc < 0) {
                perror("write /var/tmp file");
                exit(1);
            }
            i += rc;
        }

        // we make one big map to start with so that we know there's a
        // contiguous hole in our heap large enough for what we want to do
        buffer = mmap(0, MAX_SIZE + SHARED_ALIGN, PROT_READ|PROT_WRITE,
                      MAP_SHARED, fd, 0);
        if (buffer == MAP_FAILED) {
            perror("mmap");
            exit(1);
        }
        // round buffer start to SHARED_ALIGN
        buffer = (char *)(((uintptr_t)buffer + SHARED_ALIGN - 1) & ~(SHARED_ALIGN - 1));
        // now we create lots of repeated copies of the same SHARED_SIZE bytes:
        // every SHARED_SIZE-sized window of the buffer is backed by the same
        // physical data, so large strides cover many pages of virtual address
        // space while the working set stays within SHARED_SIZE bytes
        for (i = 0; i < MAX_SIZE; i += SHARED_SIZE) {
            char *temp;

            // is it necessary to unmap first? hmm.
            if (munmap(buffer + i, SHARED_SIZE) != 0) {
                perror("munmap");
                exit(1);
            }
            temp = mmap(buffer + i, SHARED_SIZE, PROT_READ|PROT_WRITE,
                        MAP_FIXED|MAP_SHARED, fd, 0);
            if (temp == MAP_FAILED) {
                perror("shared mmap");
                exit(1);
            }
        }
    }
#endif

    printf("%40s", "cycle_length");
    for (i = 0; i < sizeof(configs)/sizeof(configs[0]); ++i) {
        printf(" %3u ", configs[i].cycle_length);
    }
    printf("\n%40s", "stride");
    for (i = 0; i < sizeof(configs)/sizeof(configs[0]); ++i) {
        printf(" 0x%04x ", configs[i].stride);
    }
    printf("\n");

    bench("mov (%eax),%eax", ld_ld_loads, ld_ld_setup);
    bench("mov 4(%eax),%eax", ld_offs_ld_loads, ld_offs_ld_setup);
    bench("mov (%eax),%eax; add $4,%eax", ld_add_ld_loads, ld_add_ld_setup);
    bench("mov (%eax),%eax; xor $0x54,%eax", ld_xor_ld_loads, ld_xor_ld_setup);

    return 0;
}
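/*
 * Build/run sketch (my assumption, not from the original source): the code
 * is 32-bit x86 specific -- pointers are stored in uint32_t cells, the
 * derefs use 32-bit addressing, and rdtsc() relies on the "=A" (edx:eax)
 * constraint -- so on an x86-64 host it needs a 32-bit build, e.g. assuming
 * the file is saved as chase.c:
 *
 *     gcc -m32 -O2 -o chase chase.c
 *     ./chase
 *
 * Output is one row per load variant and one column per (cycle_length,
 * stride) config, reporting cycles per load (best of N_SAMPLES runs).
 */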