diff --git a/src/masterworker.cpp b/src/masterworker.cpp
index 80f0066133f5fab07d324b740bc1174ffbc16a69..0a79fc8188cba1ec7df7770d3c98427c5d01536b 100644
--- a/src/masterworker.cpp
+++ b/src/masterworker.cpp
@@ -31,7 +31,7 @@ gi::ex multiply_1level_master( tensor3D_t& T, matrix_int_t& J, unsigned int size
     expr_c = NULL;
     expr_c = (char*) malloc( 3279 ); // TMP
 
-	int i, j;
+    int i, j;
     i = 0;
     j = 0;
 
@@ -45,18 +45,18 @@ gi::ex multiply_1level_master( tensor3D_t& T, matrix_int_t& J, unsigned int size
 
     for( a1 = 0 ; a1 < size; a1++ ){
         i=i+1;
-	for( a2 = 0; a2 < size ; a2++ ){
-	    j=j+1;
-	    for( a3 = 0 ; a3 < size ; a3++ ){
-		A = T[a1][a2][a3];
-		for( b1 = 0 ; b1 < size ; b1++ ){
-		    parameters_t p( A, a1, a2, a3, b1 );
-		    input.push_back( p );
-		}
-	    }
+        for( a2 = 0; a2 < size ; a2++ ){
+            j=j+1;
+            for( a3 = 0 ; a3 < size ; a3++ ){
+                A = T[a1][a2][a3];
+                for( b1 = 0 ; b1 < size ; b1++ ){
+                    parameters_t p( A, a1, a2, a3, b1 );
+                    input.push_back( p );
                 }
+            }
         }
-	
+    }
+
     /* Compute the set of symbols */
     /* Could be done while the first slave is working */
 
diff --git a/src/mw_addslave.cpp b/src/mw_addslave.cpp
index 6bd37dd48306ed307ad5e36144fed8f4504bf2b8..f13546c93987450cee37fc899ea5f6c52e285db2 100644
--- a/src/mw_addslave.cpp
+++ b/src/mw_addslave.cpp
@@ -7,6 +7,7 @@
 #include "parall_constants.h"
 #include "parall_internal.h"
 #include "utils.h"
+#include "profiling.h"
 
 namespace gi = GiNaC;
 
@@ -149,18 +150,22 @@ void multiply_1level_slave_addslave( tensor3D_t& T, matrix_int_t& J, unsigned in
     int rank;
     MPI_Comm_rank( comm, &rank );
 
-    /* Compute the set of symbols */
-    
-    gi::lst symbols = all_symbols_3D( size );
+    double t_start, t_wait, t_compute;
 
     /* Ask for some work */
 
     MPI_Send( &len, 1, MPI_UNSIGNED, ROOT, TAG_PULL, comm );
 
+    /* Compute the set of symbols */
+
+    gi::lst symbols = all_symbols_3D( size );
+
     while( true ){
         /* Receive a set of parameters */
-        
+
+        t_start = rdtsc();
         MPI_Recv( &params, 1, DT_PARAMETERS, ROOT, MPI_ANY_TAG, comm, &status );
+        t_wait = rdtsc() - t_start;
 
         if( status.MPI_TAG == TAG_WORK ){
             a1 = params.a1;
@@ -168,8 +173,13 @@ void multiply_1level_slave_addslave( tensor3D_t& T, matrix_int_t& J, unsigned in
             a3 = params.a3;
             b1 = params.b1;
             gi::symbol A( std::string( params.A ) );
-            
+
+            t_start = rdtsc();
             Tens = one_level1_product( &T, &J, A, size, a1, a2, a3, b1 );
+            t_compute = rdtsc() - t_start;
+
+            /* TODO: react if we waited for too long */
+            if( t_wait > t_compute ) {}
 
             send_result( Tens );
 
diff --git a/src/mw_combined.cpp b/src/mw_combined.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b79a9199df23dd52742c902999af32f9851d1c60
--- /dev/null
+++ b/src/mw_combined.cpp
@@ -0,0 +1,344 @@
+#include <iostream>
+#include <mpi.h>
+#include <ginac/ginac.h>
+
+#include "products.h"
+#include "utils_parall.h"
+#include "parall_constants.h"
+#include "parall_internal.h"
+#include "utils.h"
+#include "profiling.h"
+
+namespace gi = GiNaC;
+
+typedef enum {
+    LONG_ADD_M,     /* The addition on the master took a long time */
+    FINISHED,       /* Computation finished normally */
+} end_code_t;
+
+typedef enum {
+    ALGO_MW,        /* Regular master-worker */
+    ALGO_ADDSLAVE,  /* Do the addition on a slave */
+} algo_t;
+
+/* This one is a "regular" master. It returns either when it is done, or when it decides to switch to another algorithm.
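+ * It compares the time spent adding a received result into Tens (t_add) with the time it spent
+ * waiting for that result (t_wait); once the additions dominate, it switches to ALGO_ADDSLAVE
+ * and hands the accumulated additions over to a worker.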
+ */
+
+end_code_t multiply_combined_master_initial( tensor3D_t& T, matrix_int_t& J, unsigned int size, gi::ex& Tens, MPI_Comm comm = MPI_COMM_WORLD ) {
+    unsigned int a1, a2, a3, b1;
+    gi::ex A;
+    gi::lst symbols;
+
+    MPI_Status status;
+    char* expr_c;
+    size_t expr_c_size = 0;
+    int src, np, running = 0;
+    unsigned int len;
+
+    double t_start, t_add, t_wait;
+    algo_t algo = ALGO_MW;
+
+    MPI_Comm_size( comm, &np );
+
+    expr_c = NULL;
+    expr_c = (char*) malloc( 3279 ); // TMP
+
+    int i, j;
+    i = 0;
+    j = 0;
+
+    int receivedresults = 0;
+
+    std::vector<parameters_t> input;
+    std::vector<std::string> results_s;
+    std::vector<gi::ex> results;
+
+    /* Build a list of argument sets */
+
+    for( a1 = 0 ; a1 < size; a1++ ){
+        i=i+1;
+        for( a2 = 0; a2 < size ; a2++ ){
+            j=j+1;
+            for( a3 = 0 ; a3 < size ; a3++ ){
+                A = T[a1][a2][a3];
+                for( b1 = 0 ; b1 < size ; b1++ ){
+                    parameters_t p( A, a1, a2, a3, b1 );
+                    input.push_back( p );
+                }
+            }
+        }
+    }
+
+    /* Compute the set of symbols */
+    /* Could be done while the first slave is working */
+
+    symbols = all_symbols_3D( size );
+
+    /* Workers that have yet to send their first request */
+
+    bool initialround = true;
+    running = 0;
+
+    /* Distribute the work */
+
+    while( input.size() > 0 ) {
+        t_start = rdtsc();
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+        t_wait = rdtsc() - t_start;
+        std::cout << "wait " << t_wait << std::endl;
+
+        if( status.MPI_TAG == TAG_PULL ) {
+
+            /* Nothing else will come: just send some work */
+            send_work( input, src, comm );
+
+            if( initialround ){
+                running++;
+                if( np - 1 == running ) initialround = false; // everyone is at work
+            }
+
+        } else {
+            if( status.MPI_TAG == TAG_RES ){
+                src = status.MPI_SOURCE;
+
+                /* The first message contains the length of what is coming next */
+                if( len != 0 ) {
+                    if( len > expr_c_size ) {
+                        expr_c_size = len;
+                        if( NULL != expr_c ) free( expr_c );
+                        expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+                    }
+
+                    /* Receive the result */
+                    MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+
+                    /* put it in the result queue */
+                    std::string s( expr_c );
+
+                    if( algo == ALGO_ADDSLAVE ) {
+                        send_work_addslave( input, results_s, src );
+                    } else {
+                        send_work( input, src, comm );
+                    }
+
+                    /* Process what I have just received */
+
+                    if( ALGO_MW == algo ) {
+                        t_start = rdtsc();
+                        gi::ex received = de_linearize_expression( s, symbols );
+                        Tens += received;
+                        t_add = rdtsc() - t_start;
+                        std::cout << "Add " << t_add << std::endl;
+#if DEBUG
+                        results.push_back( received );
+                        results_s.push_back( s );
+                        receivedresults++;
+#endif
+                        if( t_add > t_wait ) {
+                            /* We are spending too much time adding these results. Now we are going to ask a worker to do this. */
+                            // TODO use the average of the last NP wait times instead
+                            // double average = accumulate( v.begin(), v.end(), 0.0)/v.size();
+
+                            std::cout << "The master spent too much time computing the sum. Switch to ADDSLAVE algorithm" << std::endl;
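+                            /* From here on, completed results are queued in results_s and handed back
+                               to a worker with send_work_addslave() instead of being added on the master. */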
+                            algo = ALGO_ADDSLAVE;
+                        }
+                    } else {
+                        if( ALGO_ADDSLAVE == algo ) {
+                            results_s.push_back( s );
+                        } else {
+                            std::cout << "ERROR: unknown algorithm on the master " << algo << std::endl;
+                        }
+                    }
+                } else {
+                    /* Send more work */
+                    send_work( input, src, comm );
+                }
+            } else{
+                std::cerr << "Wrong tag received " << status.MPI_TAG << std::endl;
+            }
+        }
+    }
+
+    /* Wait until everyone is done */
+
+    running = np - 1; // all the slaves are running
+    while( running > 0 ) {
+
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+
+        if( len != 0 ) {
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+            }
+
+            /* Receive the result */
+            MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+
+            /* And send the END signal */
+            send_end( src, comm );
+            running--;
+
+            /* Process what I have just received */
+            /* Could be given to a slave... */
+            /* put it in the result queue */
+            std::string s( expr_c );
+            gi::ex received = de_linearize_expression( s, symbols );
+            Tens += received;
+#if DEBUG
+            results.push_back( received );
+            results_s.push_back( s );
+            receivedresults++;
+#endif
+        } else {
+            send_end( src, comm );
+            running--;
+        }
+    }
+
+#if DEBUG
+    std::cout << "Received " << receivedresults << " results" << std::endl;
+
+    std::cout << "Tpara=" << Tens << ";" << std::endl;
+#endif
+
+    if( NULL != expr_c) free( expr_c );
+    return FINISHED;
+}
+
+/* The traditional slave */
+
+void multiply_combined_slave_initial( tensor3D_t& T, matrix_int_t& J, int size, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens;
+    int a1, a2, a3, b1;
+    //  gi::ex A;
+    unsigned int len = 0;
+
+    parameters_t params;
+    MPI_Status status;
+    char* expr_c;
+
+    int rank;
+    MPI_Comm_rank( comm, &rank );
+
+    /* Ask for some work */
+
+    MPI_Send( &len, 1, MPI_UNSIGNED, ROOT, TAG_PULL, comm );
+
+    /* Compute the set of symbols */
+
+    gi::lst symbols = all_symbols_3D( size );
+
+    while( true ){
+        /* Receive a set of parameters */
+
+        MPI_Recv( &params, 1, DT_PARAMETERS, ROOT, MPI_ANY_TAG, comm, &status );
+
+        if( status.MPI_TAG == TAG_WORK ){
+            a1 = params.a1;
+            a2 = params.a2;
+            a3 = params.a3;
+            b1 = params.b1;
+            gi::symbol A( std::string( params.A ) );
+
+            Tens = one_level1_product( &T, &J, A, size, a1, a2, a3, b1 );
+            send_result( Tens );
+
+        } else {
+            if( status.MPI_TAG == TAG_ADD ) {
+                /* Receive a set of expressions to add */
+
+                /* Number of expressions received */
+                int nb = params.a1;
+
+                /* Length of each string */
+
+                unsigned int* lengths = (unsigned int*) malloc( nb*sizeof( unsigned int ) );
+                MPI_Recv( lengths, nb, MPI_INT, ROOT, TAG_ADD, comm, &status );
+                std::vector<std::string> results_s;
+                char* c_str;
+                int i;
+                int len;
+                for( i = 0 ; i < nb ; i++ ) {
+                    len = lengths[i] + 1;
+                    c_str = (char*) malloc( len );
+                    MPI_Recv( c_str, len, MPI_CHAR, ROOT, TAG_ADD, comm, &status );
+                    c_str[len-1] = '\0'; // The master sends C++ strings, which do not contain the final '\0'
+                    results_s.push_back( std::string( c_str ) );
+                    free( c_str );
+                }
+
+                /* Delinearize all the expressions and add them */
+
+                Tens = add_expressions( results_s, symbols );
+
+                /* Send the result */
+
+                send_result( Tens );
+
+            } else {
+                if( status.MPI_TAG == TAG_END ){
+                    return;
+                } else {
+                    std::cerr << "Wrong tag received on slave " << status.MPI_TAG << std::endl;
+                }
+            }
+        }
+    }
+}
+
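+/* Entry points: multiply_combined() creates the MPI datatype for the parameters, then rank 0 runs
+   the master and every other rank runs a worker; both sides start with the traditional M/W scheme. */
+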
+/*******************************************************************************
+ *                         Combined master-worker                              *
+ *******************************************************************************/
+
+gi::ex multiply_combined_master( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+    gi::ex Tens = 0;
+    end_code_t rc;
+
+    /* Initially: start as a traditional M/W */
+
+    rc = multiply_combined_master_initial( T, J, size, Tens );
+    switch( rc ){
+    case FINISHED:
+        return Tens;
+    }
+
+    return Tens;
+}
+
+void multiply_combined_worker( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+    gi::ex Tens = 0;
+
+    std::cout << "worker" << std::endl;
+    multiply_combined_slave_initial( T, J, size );
+
+}
+
+
+gi::ex multiply_combined( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+
+    int rank;
+    gi::ex Tens = 0;
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+    /* Create a new datatype for the parameters */
+
+    create_parameters_datatye();
+
+    /* Here we go */
+
+    if( 0 == rank ) {
+        Tens = multiply_combined_master( T, J, size );
+    } else {
+        multiply_combined_worker( T, J, size );
+    }
+
+    /* Finalize */
+
+    free_parameters_dt();
+    return Tens;
+
+}
diff --git a/src/sequential.cpp b/src/sequential.cpp
index 855104a7f570fe315a8f9b61fb1a463509afb7e0..0aa406557902abdeae64b064047c47ee88a19649 100644
--- a/src/sequential.cpp
+++ b/src/sequential.cpp
@@ -35,15 +35,15 @@ gi::ex multiply_seq( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: s
             j=j+1;
             // printf("j = %d\n", j);
             for( a3 = 0 ; a3 < size ; a3++ ){
-		TAU_START( timerA );
-		A = T[a1][a2][a3];
-		/* Beyond this point, a2 and a3 are only used in the simplectic matrix */
+                TAU_START( timerA );
+                A = T[a1][a2][a3];
+                /* Beyond this point, a2 and a3 are only used in the symplectic matrix */
                 for( b1 = 0 ; b1 < size ; b1++ ){
                     TAB = J[a1][b1];
                     for( b2 = 0 ; b2 < size ; b2++ ){
                         for( b3 = 0 ; b3 < size ; b3++ ){
                             TAU_START( timerB );
-			    /* Beyond this point, b1 is not used anymore */
+                            /* Beyond this point, b1 is not used anymore */
                             TABB = TAB * A*T[b1][b2][b3];
                             for( c1 = 0 ; c1 < size ; c1++ ){
                                 for( c2 = 0 ; c2 < size ; c2++ ){
@@ -60,33 +60,33 @@ gi::ex multiply_seq( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: s
                                             Tens = Tens + TABCDD * T[d1][d2][d3]*J[a3][d3];
                                             t_end = rdtsc();
-					    TAU_STOP( timeradd );
+                                            TAU_STOP( timeradd );
 #ifdef TAUPROF
                                             // std::cout << "add " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
-					    printf( "add %lf %lu len %d\n", getTimeSpent( timeradd ), t_end - t_start, Tens.nops() );
+                                            printf( "add %lf %lu len %d\n", getTimeSpent( timeradd ), t_end - t_start, Tens.nops() );
                                             // std::cout << Tens << std::endl;
 #endif // TAUPROF
                                         }
-				    }
+                                    }
                                 }
                             }
                         }
                     }
-	    TAU_STOP( timerB );
+            TAU_STOP( timerB );
 #ifdef TAUPROF
-	    std::cout << "B " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
+            std::cout << "B " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
 #endif // TAUPROF
                 }
-	    }
+            }
         }
-	TAU_STOP( timerA );
+        TAU_STOP( timerA );
 #ifdef TAUPROF
-	std::cout << "A " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
+        std::cout << "A " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
 #endif // TAUPROF
     }
     }
-	}
+    }
 
     return Tens;
 }
@@ -115,15 +115,16 @@ gi::ex multiply_1level( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler
             j=j+1;
             // printf("j = %d\n", j);
             for( a3 = 0 ; a3 < size ; a3++ ){
-                TAU_START( "b" );
+                // TAU_START( "b" );
                 A = T[a1][a2][a3]; /* Beyond this point, a2 and a3 are only used in the simplectic matrix */
                 for( b1 = 0 ; b1 < size ; b1++ ){
                     Tn = one_level1_product( &T, &J, A, size, a1, a2, a3, b1 );
                     Tens += Tn;
                 }
-                TAU_STOP( "b" );
-#ifdef TAUPROF
+                // TAU_STOP( "b" );
+                //#ifdef TAUPROF
+                #if 0
                 time = getTimeSpent( "b" );
 #endif // TAUPROF
             }
 
@@ -147,7 +148,7 @@ gi::ex one_level1_product( tensor3D_t* T, matrix_int_t *J, gi::ex A, int size, i
                 TABB = TAB * A*(*T)[b1][b2][b3];
                 T5 = 0;
                 /* Beyond this point, b1 is not used anymore */
-                TAU_START( timerB );
+                // TAU_START( timerB );
                 for( c1 = 0 ; c1 < size ; c1++ ){
                     T4 = 0;
                     for( c2 = 0 ; c2 < size ; c2++ ){
@@ -176,7 +177,7 @@ gi::ex one_level1_product( tensor3D_t* T, matrix_int_t *J, gi::ex A, int size, i
                     T5 += T4;
                 }
                 Tens += T5;
-                TAU_STOP( timerB );
+                // TAU_STOP( timerB );
 
             }
         }
diff --git a/src/tensormatrix.h b/src/tensormatrix.h
index 246010a5f050afb5d84969fca04c85ab38c5de97..bcf5d8d86eb90c460db9319bb8b4e5fa99ea5983 100644
--- a/src/tensormatrix.h
+++ b/src/tensormatrix.h
@@ -26,6 +26,7 @@ gi::ex multiply_2levels( tensor3D_t&, matrix_int_t&, int );
 gi::ex multiply_1level_mw( tensor3D_t&, matrix_int_t&, int );
 gi::ex multiply_1level_mw_addslave( tensor3D_t&, matrix_int_t&, int );
 gi::ex multiply_1level_mw_hierarch( tensor3D_t&, matrix_int_t&, int );
+gi::ex multiply_combined( tensor3D_t&, matrix_int_t&, int );
 
 /*******************************************************************************
  *                              Default values                                 *
diff --git a/src/tensormatrix_mpi.cpp b/src/tensormatrix_mpi.cpp
index ef6a4c915626e11579737d1d69aa8299e5485cef..e72db52db810e224783ab809487347aa1a41813c 100644
--- a/src/tensormatrix_mpi.cpp
+++ b/src/tensormatrix_mpi.cpp
@@ -26,6 +26,7 @@ namespace gi = GiNaC;
    - M/m: Master-Worker                       -> multiply_1level_mw
    - A/a: Master-Worker, addition on a slave  -> multiply_1level_mw_addslave
    - H/h: Hierarchical master-worker          -> multiply_1level_mw_hierarch
+   - C/c: Combined                            -> multiply_combined
 */
 
 /* Sequentiel sur Minimum
@@ -86,6 +87,10 @@ int main( int argc, char** argv ){
     case 'h':
         tostart = 'h';
         break;
+    case 'C':
+    case 'c':
+        tostart = 'c';
+        break;
     case 'S':
     case 's':
         tostart = 's';
@@ -126,6 +131,9 @@ int main( int argc, char** argv ){
     case 'h':
         Tpara = multiply_1level_mw_hierarch( T, J, N );
         break;
+    case 'c':
+        Tpara = multiply_combined( T, J, N );
+        break;
     case 's':
         Tpara = multiply_seq( T, J, N );
         break;