/*
 *  testcollperf2.c
 *  gasnet_tree_coll
 *
 *  Created by Rajesh Nishtala on 10/1/07.
 *  Copyright 2007 Berkeley UPC Group. All rights reserved.
 *
 */

/* The primary aim of this tester is to measure the performance and correctness
   of the various collectives at larger message sizes.
   It is NOT intended as a tester to measure correctness of synchronization
   options; that is covered by testcoll. */

#include "gasnet.h"
#include "gasnet_coll.h"

#define DEFAULT_OUTER_VERIFICATION_ITERS 5
#define DEFAULT_INNER_VERIFICATION_ITERS 50
#define DEFAULT_PERFORMANCE_ITERS 1000

/* max data size for the test in bytes */
#define DEFAULT_MAX_DATA_SIZE 65536
/* max_data_size is a variable set in main (there it is divided by sizeof(int),
   i.e. it is kept as an element count, not a byte count) */

/* Parenthesized so the expansion stays a single operand wherever it is used. */
#define TOTAL_THREADS (threads_per_node*gasnet_nodes())

#if 1
#define ERROR_EXIT() gasnet_exit(1)
#else
#define ERROR_EXIT() do {} while(0)
#endif

gasnet_node_t mynode;
gasnet_node_t nodes;
gasnet_image_t threads_per_node;
gasnet_image_t THREADS;
int inner_verification_iters;
int outer_verification_iters;
int performance_iters;
size_t max_data_size;

/* Segment size requested per node: a source and a destination region (the
   trailing *2), each holding SEG_PER_THREAD bytes for every local thread. */
#define TEST_SEGSZ_EXPR (sizeof(int)*(max_data_size*(inner_verification_iters)*TOTAL_THREADS*threads_per_node*2))
/* Parenthesized so the expansion stays a single operand wherever it is used. */
#define SEG_PER_THREAD (sizeof(int)*max_data_size*(inner_verification_iters)*TOTAL_THREADS)

#include "test.h"

#define COLL_BARRIER() PTHREAD_BARRIER(threads_per_node)

/* Per-thread state handed to each test routine. */
typedef struct {
  int my_local_thread;            /* compared against 0 to pick the one thread per node that prints */
  int mythread;                   /* compared against root_thread by the test routines */
  gasnet_coll_handle_t *hndl;
  char _pad[GASNETT_CACHE_LINE_BYTES];
  uint8_t *mysrc, *mydest;        /* this thread's source / destination buffers */
  uint8_t *node_src, *node_dst;
} thread_data_t;

uint8_t **my_srcs;
uint8_t **my_dsts;
uint8_t **all_srcs;
uint8_t **all_dsts;

/*
 * Write a short "<in>/<out>" label for the synchronization flags in `flags`
 * into `outstr`, where each side is one of "no", "my" or "all".
 *
 * flags  - collective flags; only the GASNET_COLL_IN_* and GASNET_COLL_OUT_*
 *          sync bits are examined.
 * outstr - destination buffer; must hold at least 8 bytes (the longest label,
 *          "all/all", is 7 characters plus the terminator).
 *
 * If several IN (or OUT) bits are set, NOSYNC wins over MYSYNC, which wins
 * over ALLSYNC. If either side has no sync bit set, "?/?" is written so that
 * callers never print an unwritten buffer.
 */
void fill_flag_str(int flags, char *outstr) {
  const char *in_name = NULL;
  const char *out_name = NULL;

  if(flags & GASNET_COLL_IN_NOSYNC) {
    in_name = "no";
  } else if(flags & GASNET_COLL_IN_MYSYNC) {
    in_name = "my";
  } else if(flags & GASNET_COLL_IN_ALLSYNC) {
    in_name = "all";
  }

  if(flags & GASNET_COLL_OUT_NOSYNC) {
    out_name = "no";
  } else if(flags & GASNET_COLL_OUT_MYSYNC) {
    out_name = "my";
  } else if(flags & GASNET_COLL_OUT_ALLSYNC) {
    out_name = "all";
  }

  if(in_name == NULL || out_name == NULL) {
    sprintf(outstr, "?/?");
    return;
  }
  sprintf(outstr, "%s/%s", in_name, out_name);
}

/* NOTE(review): the rest of this span (scale_ptrM, the print_timer macro and the
   opening of run_SINGLE_ADDR_test) appears to have lost text in transit: loop
   headers such as "for(i=0; imy_local_thread==0)" are not valid C. It is kept
   exactly as found; restore it from the upstream file before building. */
void scale_ptrM(void * out_ptr[], void * const in_ptr[], size_t elem_count, size_t elem_size, gasnet_image_t total_images) { int i; for(i=0; imy_local_thread==0) MSG0("%d> %s/%s %s sync_mode: (%s) size: %ld bytes time: %g us", td->mythread, addr_mode, num_addrs,\ coll_str, sync_mode, nelem*sizeof(int), (double)gasnett_ticks_to_us(total_ticks)/performance_iters) #else #define print_timer(td, coll_str, addr_mode, num_addrs, sync_mode, nelem, total_ticks) #endif void run_SINGLE_ADDR_test(thread_data_t *td, uint8_t **dst_arr, uint8_t **src_arr, size_t nelem, int root_thread, int in_flags) { /* all threads pass the same pointers for src and dest*/ int i,j,t,k; int flags = in_flags | GASNET_COLL_SRC_IN_SEGMENT|GASNET_COLL_DST_IN_SEGMENT; int *src, *dst; char output_str[8]; gasnett_tick_t begin, end; char flag_str[8]; fill_flag_str(flags, flag_str); if(flags & GASNET_COLL_SINGLE) { src = (int*) (src_arr[0]); /* all threads have the same address so just use slot 0*/ dst = (int*) (dst_arr[0]); /* all threads have the same address so just use slot 0*/ sprintf(output_str, "SINGLE"); } else { src = ((int*) td->mysrc); dst = ((int*) td->mydest); sprintf(output_str, "LOCAL"); } for(k=0; kmythread == root_thread) { for(i=0; i broadcast verification @ iteration: %d ... 
expected %d got %d", (int) td->mythread, (int) (i/nelem), expected, dst[i]); ERROR_EXIT(); } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread == root_thread) { for(i=0; imythread*nelem+j); if(dst[i*nelem+j] != expected) { MSG("%d> scatter verification @ iteration: %d ... expected %d got %d", td->mythread, i, expected, dst[i*nelem+j]); ERROR_EXIT(); } } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*nelem+j; } } if(root_thread == td->mythread) { for(i=0; imythread == root_thread) { for(i=0; i gather verification @ iteration: %d ... expected %d got %d", td->mythread, (int)(i/(THREADS*nelem)), expected, dst[i]); ERROR_EXIT(); } } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*nelem; } } for(i=0; i gather_all verification @ iteration: %d ... expected %d got %d", td->mythread, (int)(i/(nelem*THREADS)), expected, dst[i]); ERROR_EXIT(); } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*10000; dst[i] = -1; } if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*nelem+j); if(dst[i*nelem*THREADS+t*nelem+j] != expected) { MSG("%d> exchange verification @ iteration: %d ... expected %d got %d", td->mythread, i, expected, dst[i*nelem*THREADS+t*nelem+j]); ERROR_EXIT(); } } } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imy_local_thread==0) MSG0("%s/SINGLE-addr sync_mode: %s size: %d bytes root: %d. 
PASS", output_str, flag_str, (int) (sizeof(int)*nelem), root_thread); COLL_BARRIER(); } void run_MULTI_ADDR_test(thread_data_t *td, uint8_t **dst_arr, uint8_t **src_arr, size_t nelem, gasnet_image_t root_thread, int in_flags) { /* all threads pass the same pointers for src and dest*/ int i,j,t,k; int flags = in_flags| GASNET_COLL_SRC_IN_SEGMENT | GASNET_COLL_DST_IN_SEGMENT; gasnet_image_t num_addrs; gasnett_tick_t begin,end; int *src, *dst, *mysrc, *mydest; uint8_t **tmp_src, **tmp_dest, **curr_dst_arr, **curr_src_arr; char output_str[8]; char flag_str[8]; fill_flag_str(flags, flag_str); COLL_BARRIER(); if(flags & GASNET_COLL_SINGLE) { src = (int*) src_arr[root_thread]; /* all threads have the same address so just use slot 0*/ dst = (int*) dst_arr[root_thread]; /* all threads have the same address so just use slot 0*/ num_addrs = THREADS; sprintf(output_str, "SINGLE"); } else { src = ((int*)src_arr[root_thread % threads_per_node]); dst = ((int*)dst_arr[root_thread % threads_per_node]); num_addrs = threads_per_node; sprintf(output_str, "LOCAL"); } mysrc = ((int*) td->mysrc); mydest = ((int*) td->mydest); tmp_src = (uint8_t**) test_malloc(sizeof(uint8_t*)*num_addrs*inner_verification_iters); tmp_dest = (uint8_t**) test_malloc(sizeof(uint8_t*)*num_addrs*inner_verification_iters); for(k=0; kmythread == root_thread) { for(i=0; i broadcastM verification @ iteration: %d ... expected %d got %d", td->mythread, (int)(i/nelem), expected, mydest[i]); ERROR_EXIT(); } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread == root_thread) { for(i=0; imythread*nelem+j); if(mydest[i*nelem+j] != expected) { MSG("%d> scatterM verification @ iteration: %d ... 
expected %d got %d", td->mythread, i, expected, mydest[i*nelem+j]); ERROR_EXIT(); } } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*nelem+j; } } if(root_thread == td->mythread) { for(i=0; imythread == root_thread) { for(i=0; i gatherM verification @ iteration: %d ... expected %d got %d", td->mythread, (int)(i/(nelem*THREADS)), expected, mydest[i]); ERROR_EXIT(); } } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*nelem; } } for(i=0; i gather_allM verification @ iteration: %d ... expected %d got %d", td->mythread, (int)(i/(nelem*THREADS)), expected, mydest[i]); ERROR_EXIT(); } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imythread*10000; mydest[i] = -1; } if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} curr_dst_arr = tmp_dest; curr_src_arr = tmp_src; for(i=0; imythread*nelem+j); if(mydest[i*nelem*THREADS+t*nelem+j] != expected) { MSG("%d> exchangeM verification @ iteration: %d ... expected %d got %d", td->mythread, i, expected, mydest[i*nelem*THREADS+t*nelem+j]); ERROR_EXIT(); } } } } } COLL_BARRIER(); begin = gasnett_ticks_now(); if(flags & GASNET_COLL_IN_NOSYNC) {COLL_BARRIER();} for(i=0; imy_local_thread==0) MSG0("%s/MULTI-addr sync_mode: %s size: %d bytes root: %d. 
PASS", output_str, flag_str, (int) (sizeof(int)*nelem), root_thread); COLL_BARRIER(); test_free(tmp_src); test_free(tmp_dest); } void *thread_main(void *arg) { thread_data_t *td = (thread_data_t*) arg; size_t size; int i,flag_iter; #if GASNET_PAR gasnet_image_t *imagearray = test_malloc(nodes * sizeof(gasnet_image_t)); for (i=0; imythread, NULL, 0, 0); test_free(imagearray); #else gasnet_coll_init(NULL, 0, NULL, 0, 0); #endif COLL_BARRIER(); for(flag_iter=0; flag_iter<9; flag_iter++) { int flags; switch(flag_iter) { case 0: flags = GASNET_COLL_IN_NOSYNC | GASNET_COLL_OUT_NOSYNC; break; case 1: flags = GASNET_COLL_IN_NOSYNC | GASNET_COLL_OUT_MYSYNC; break; case 2: flags = GASNET_COLL_IN_NOSYNC | GASNET_COLL_OUT_ALLSYNC; break; case 3: flags = GASNET_COLL_IN_MYSYNC | GASNET_COLL_OUT_NOSYNC; break; case 4: flags = GASNET_COLL_IN_MYSYNC | GASNET_COLL_OUT_MYSYNC; break; case 5: flags = GASNET_COLL_IN_MYSYNC | GASNET_COLL_OUT_ALLSYNC; break; case 6: flags = GASNET_COLL_IN_ALLSYNC | GASNET_COLL_OUT_NOSYNC; break; case 7: flags = GASNET_COLL_IN_ALLSYNC | GASNET_COLL_OUT_MYSYNC; break; case 8: flags = GASNET_COLL_IN_ALLSYNC | GASNET_COLL_OUT_ALLSYNC; break; default: continue; } #if GASNET_ALIGNED_SEGMENTS if(threads_per_node == 1) { for(size = 1; size<=max_data_size; size=size*2) { run_SINGLE_ADDR_test(td, all_dsts, all_srcs, size, 0, flags|GASNET_COLL_SINGLE); } } else { if(td->my_local_thread == 0) MSG0("skipping SINGLE/SINGLE test (multiple threads per node)"); } #else if(td->my_local_thread == 0) MSG0("skipping SINGLE/SINGLE test (unaligned segments)"); #endif for(size = 1; size<=max_data_size; size=size*2) { run_SINGLE_ADDR_test(td, my_dsts, my_srcs, size, 0, flags|GASNET_COLL_LOCAL); } for(size = 1; size<=max_data_size; size=size*2) { run_MULTI_ADDR_test(td, all_dsts, all_srcs, size, 0, flags|GASNET_COLL_SINGLE); } for(size = 1; size<=max_data_size; size=size*2) { run_MULTI_ADDR_test(td, my_dsts, my_srcs, size, 0, flags|GASNET_COLL_LOCAL); } } return NULL; } int 
main(int argc, char **argv) { static uint8_t *A, *B; int i,j; thread_data_t *td_arr; GASNET_Safe(gasnet_init(&argc, &argv)); if(argc > 1) { max_data_size = MAX(atoi(argv[1])/sizeof(int),1); } else { max_data_size = DEFAULT_MAX_DATA_SIZE/sizeof(int); } if (argc > 2) { outer_verification_iters = MAX(1,atoi(argv[2])); } else { outer_verification_iters = DEFAULT_OUTER_VERIFICATION_ITERS; } if (argc > 3) { inner_verification_iters = MAX(1,atoi(argv[3])); } else { inner_verification_iters = DEFAULT_INNER_VERIFICATION_ITERS; } /* make sure that there is at least 1 inner verification iteration since this is waht we use for the performance runs*/ inner_verification_iters = MAX(1, inner_verification_iters); if(argc > 4) { performance_iters = atoi(argv[4]); } else { performance_iters = DEFAULT_PERFORMANCE_ITERS; } #if GASNET_PAR if (argc > 5) { threads_per_node = atoi(argv[5]); } else { threads_per_node = gasnett_cpu_count(); } if (threads_per_node > TEST_MAXTHREADS || threads_per_node < 1) { printf("ERROR: Threads must be between 1 and %d\n", TEST_MAXTHREADS); exit(EXIT_FAILURE); } if (threads_per_node > gasnett_cpu_count()) { MSG0("WARNING: thread count (%i) exceeds physical cpu count (%i) - enabling \"polite\", low-performance synchronization algorithms", threads_per_node, gasnett_cpu_count()); gasnet_set_waitmode(GASNET_WAIT_BLOCK); } #else threads_per_node = 1; #endif /* get SPMD info */ mynode = gasnet_mynode(); nodes = gasnet_nodes(); THREADS = nodes * threads_per_node; /* do some sanity checking of the input arguments*/ /* the total memory that we will need to attach is inner_verification_iters*total_images*my_images*2*sizeof(int)*max_data_size*/ /* make sure that this value is about less than or equal to half the maximum gasnet segment */ /* if it is not scale down inner_verification_iters and scale up outer_verification_iters until we match*/ { size_t curr_req = inner_verification_iters * THREADS * threads_per_node * sizeof(int) * max_data_size * 2; size_t 
max_mem_usage = gasnet_getMaxGlobalSegmentSize()/2; MSG0("command line args: max_data_size=%d bytes outer_verification_iters=%d inner_verification_iters=%d performance_iters=%d threads_per_node=%d ", (int)(max_data_size*sizeof(int)), outer_verification_iters, inner_verification_iters, performance_iters, threads_per_node); if(curr_req > max_mem_usage) { MSG0("WARNING: inner iterations too large.\n"); MSG0("Scaling down inner iterations and scaling up outer iterations to compensate\n"); while(curr_req > max_mem_usage && inner_verification_iters > 1) { inner_verification_iters /=2; outer_verification_iters *=2; curr_req = inner_verification_iters * THREADS * threads_per_node * sizeof(int) * max_data_size * 2; } /* we can't scale down inner verifications further than 1 so we have to scale down the data size until it fits*/ if(inner_verification_iters ==1 && curr_req > max_mem_usage) { MSG0("WARNING: scaled inner iterations down to 1 but still not enough room in segment."); MSG0("WARNING: scaling down datasize"); } while(inner_verification_iters == 1 && curr_req > max_mem_usage && max_data_size > 0) { max_data_size /=2; curr_req = inner_verification_iters * THREADS * threads_per_node * sizeof(int) * max_data_size * 2; } if(max_data_size ==0) { MSG0("ERROR: Segment too small ... 
can't run testcollperf"); gasnet_exit(1); } MSG0("adjusted args: max_data_size=%d bytes outer_verification_iters=%d inner_verification_iters=%d performance_iters=%d threads_per_node=%d ", (int)(max_data_size*sizeof(int)), outer_verification_iters, inner_verification_iters, performance_iters, threads_per_node); } } GASNET_Safe(gasnet_attach(NULL, 0, TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET)); test_init("testcollperf",0,"(max data size) (outer_verification_iters) (inner_verification_iters) (performance_iters) (thread count per node) "); A = TEST_MYSEG(); B = A+(SEG_PER_THREAD*threads_per_node); my_srcs = (uint8_t**) test_malloc(sizeof(uint8_t*)*threads_per_node); my_dsts = (uint8_t**) test_malloc(sizeof(uint8_t*)*threads_per_node); all_srcs = (uint8_t**) test_malloc(sizeof(uint8_t*)*THREADS); all_dsts = (uint8_t**) test_malloc(sizeof(uint8_t*)*THREADS); td_arr = (thread_data_t*) test_malloc(sizeof(thread_data_t)*threads_per_node); for(i=0; i= SEG_PER_THREAD*threads_per_node); */ for(j=0; j