diff --git a/src/Makefile b/src/Makefile
index db349944217d64bb42965ab207d409ef64936b4e..59876781d359430ede4b6a1f68356a1a888588ff 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -30,7 +30,7 @@
 MPISRC = masterworker.cpp mw_addslave.cpp hierarchical.cpp \
 	perf.cpp sequential.cpp tensormatrix_mpi.cpp \
 	utils.cpp utils_parall.cpp profiling.cpp mw_combined.cpp \
 	masterworker2.cpp mw_addslave2.cpp hierarchical2.cpp \
-	masterworker3.cpp mw_addslave3.cpp
+	masterworker3.cpp mw_addslave3.cpp mw_addslave4.cpp
 
 MPIOBJ= $(MPISRC:.cpp=.o)
diff --git a/src/masterworker.cpp b/src/masterworker.cpp
index 5bbb605e555d771d456467cb56bb80fa296808d6..b3f2cbe2b8a8baacaca0d13a4c0602ba74d0e069 100644
--- a/src/masterworker.cpp
+++ b/src/masterworker.cpp
@@ -31,10 +31,6 @@ gi::ex multiply_1level_master( tensor3D_t& T, unsigned int size, MPI_Comm comm =
     expr_c = NULL;
     expr_c = (char*) malloc( 3279 ); // TMP
 
-    int i, j;
-    i = 0;
-    j = 0;
-
     int receivedresults = 0;
 
     unsigned int N = size/2;
@@ -45,9 +41,7 @@ gi::ex multiply_1level_master( tensor3D_t& T, unsigned int size, MPI_Comm comm =
     /* Build a list of argument sets */
 
     for( a4 = 0 ; a4 < N ; a4++ ){
-        i=i+1;
         for( a2 = 0; a2 < N ; a2++ ){
-            j=j+1;
             for( a1 = 0 ; a1 < N ; a1++ ){
                 parameters_t p( a4, a2, a1 );
                 input.push_back( p );
diff --git a/src/mw_addslave.cpp b/src/mw_addslave.cpp
index 91b8a04432cc06f56bf479388fbbdb9d1ce20d9d..833dff363967ae9438b317283365427ff111c42d 100644
--- a/src/mw_addslave.cpp
+++ b/src/mw_addslave.cpp
@@ -42,6 +42,8 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
     std::vector<parameters_t> input;
     std::vector<std::string> results; /* length and char* */
 
+    double t1 = getTime();
+
     /* Build a list of argument sets */
 
     for( a4 = 0 ; a4 < N ; a4++ ){
@@ -60,6 +62,8 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
 
     symbols = all_symbols_3D( size );
 
+    double t2 = getTime();
+
     /* Distribute the work */
 
     while( input.size() > 0 ) {
@@ -98,6 +102,8 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
         }
     }
 
+    double t3 = getTime();
+
     /* Wait until everyone is done */
 
     running = np - 1; // all the slaves are running
@@ -122,9 +128,17 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
         send_add_or_end_addslave( results, src, &running );
     }
 
+    double t4 = getTime();
+
     /* Add whatever I have left */
     Tens = add_expressions( results, symbols );
-
+
+    double t5 = getTime();
+    std::cout << "Init: " << t2 - t1 << std::endl;
+    std::cout << "Loop: " << t3 - t2 << std::endl;
+    std::cout << "Fini: " << t4 - t3 << std::endl;
+    std::cout << "Add: " << t5 - t4 << std::endl;
+
 #if DEBUG
     std::cout << "Received " << receivedresults << " results" << std::endl;
diff --git a/src/mw_addslave2.cpp b/src/mw_addslave2.cpp
index e64a7a4d1b6ff35cc129dfc49ae5b7420f89cd8a..9e0aaf9b99d631e04b6764bb47687999e0bd9775 100644
--- a/src/mw_addslave2.cpp
+++ b/src/mw_addslave2.cpp
@@ -183,8 +183,10 @@ void multiply_1level_slave_addslave2( tensor3D_t& T, unsigned int size, MPI_Comm
 
     /* Delinearize all the expressions and add them */
 
+    double t1 = getTime();
     Tens = add_expressions( results_s, symbols );
-
+    std::cout << "Addition: " << getTime() - t1 << std::endl;
+
     /* Send the result */
 
     send_result( Tens );
diff --git a/src/mw_addslave4.cpp b/src/mw_addslave4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..016269ebb2b27393fde92703ba47c3159c8c488e
--- /dev/null
+++ b/src/mw_addslave4.cpp
@@ -0,0 +1,343 @@
+#include <iostream>
+#include <mpi.h>
+#include <ginac/ginac.h>
+#include <math.h> // for ceil
+
+#include "products.h"
+#include "utils_parall.h"
+#include "parall_constants.h"
+#include "parall_internal.h"
+#include "utils.h"
+#include "profiling.h"
+
+namespace gi = GiNaC;
+
+#define MAXLENADD 1 // 256
+
+unsigned int maxlen( std::vector<std::string> expressions ){
+    unsigned int len = 0;
+    for( auto s: expressions ) {
+        unsigned int l2 = s.length();
+        if( len < l2 ) {
+            len = l2;
+        }
+    }
+    return len;
+}
+
+gi::ex add_expressions_parall( std::vector<std::string> expressions, gi::lst symbols, parameters_2_1_t p, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens = 0;
+    int size, i, nb;
+    unsigned int len, chunk, end;
+    std::vector<unsigned int> cut;
+    unsigned int* lengths;
+    std::string result;
+    char* expr;
+    MPI_Status status;
+    size_t expr_c_size = 0;
+    char* expr_c;
+
+    /* If the expressions are short, compute the sum locally */
+    if( maxlen( expressions ) < MAXLENADD )
+        return add_expressions( expressions, symbols );
+
+    MPI_Comm_size( comm, &size );
+    nb = expressions.size();
+    lengths = (unsigned int*) malloc( nb * sizeof( unsigned int ) );
+    for( i = 0 ; i < nb ; i++ ) {
+        cut.push_back( 0 );
+        lengths[i] = 0;
+    }
+    unsigned int running = size - 1;
+    p.setParams( nb, 1 ); // reuse the parameters datatype: a4 carries the number of expressions
+
+    /* TODO: this could be factored out with send_expressions_to_add */
+
+    for( int peer = 1 ; peer < size ; peer++ ) {
+
+        i = 0;
+        for( auto s: expressions ) {
+            /* How much are we going to send: stop at a + or - sign (and keep the sign) */
+            chunk = ceil( (double) s.length() / ( size - 1 ) ); // floating-point division, otherwise ceil() is a no-op
+            end = cut[i] + chunk;
+            if( peer == size - 1 || end >= s.length() - 1 ) {
+                end = s.length() - 1; // the last peer takes whatever is left
+            } else {
+                while( !( s[end] == '+' || s[end] == '-' ) && end < s.length() - 1 ){
+                    end++;
+                }
+                if( s[end] == '+' || s[end] == '-' ) {
+                    end--; // keep the sign: it opens the next peer's chunk
+                }
+            }
+
+            lengths[i] = end - cut[i] + 1;
+            i++;
+        }
+
+        /* Send the lengths */
+        MPI_Send( &p, 1, DT_PARAMETERS_2_1, peer, TAG_ADD, comm );
+        MPI_Send( lengths, nb, MPI_UNSIGNED, peer, TAG_ADD, comm );
+
+        /* Send the strings */
+
+        for( int j = 0 ; j < nb ; j++ ) {
+            expr = const_cast<char*>( expressions[j].c_str() );
+
+            MPI_Send( &( expr[ cut[j] ] ), lengths[j], MPI_CHAR, peer, TAG_ADD, comm );
+            cut[j] += lengths[j];
+        }
+    }
+
+    /* Receive the results */
+
+    expr_c = NULL;
+
+    while( running > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        int src = status.MPI_SOURCE;
+
+        if( len != 0 ) {
+            len++; // room for the terminating '\0'
+
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size );
+            }
+
+            /* Receive the result */
+            MPI_Recv( expr_c, len-1, MPI_CHAR, src, TAG_EXPR, comm, &status );
+            expr_c[len - 1] = '\0'; // terminate so it can be used as a C string
+
+            /* Concatenate the result */
+            std::string recvs( expr_c );
+            if( expr_c[0] != '-' ) result += '+';
+            result += recvs;
+        }
+        running--;
+        send_end( src, p );
+    }
+
+    Tens = de_linearize_expression( result, symbols );
+
+    free( lengths );
+    if( NULL != expr_c ) free( expr_c );
+    return Tens;
+}
+
+/*******************************************************************************
+ *        Parallel 1-level decomposition with addition on a slave              *
+ *******************************************************************************/
+
+gi::ex multiply_1level_master_addslave4( tensor3D_t& T, unsigned int size, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens = 0;
+    unsigned int a2, a4;
+    gi::lst symbols;
+
+    MPI_Status status;
+    char* expr_c;
+    size_t expr_c_size = 0;
+    int src, np;
+    unsigned int len, running = 0;
+    parameters_2_1_t pzero( 0, 0 );
+
+    MPI_Comm_size( comm, &np );
+
+    expr_c = NULL;
+    expr_c = (char*) malloc( 3279 ); // initial guess, grown on demand below
+
+    int receivedresults = 0;
+
+    unsigned int N = size/2;
+
+    std::vector<parameters_2_1_t> input;
+    std::vector<std::string> results; /* length and char* */
+
+    /* Build a list of argument sets */
+
+    for( a4 = 0 ; a4 < N ; a4++ ){
+        for( a2 = 0; a2 < N ; a2++ ){
+            parameters_2_1_t p( a4, a2 );
+            input.push_back( p );
+        }
+    }
+
+    /* Compute the set of symbols */
+    /* Could be done while the first slave is working */
+
+    symbols = all_symbols_3D( size );
+
+    /* Distribute the work */
+
+    while( input.size() > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+
+        if( status.MPI_TAG == TAG_PULL ) {
+            /* Nothing else will come: just send some work */
+            src = status.MPI_SOURCE;
+            send_work( input, src );
+
+        } else {
+            if( status.MPI_TAG == TAG_RES ){
+                src = status.MPI_SOURCE;
+
+                /* The first message contains the length of what is coming next */
+                if( len != 0 ) {
+                    if( len > expr_c_size ) {
+                        expr_c_size = len;
+                        if( NULL != expr_c ) free( expr_c );
+                        expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+                    }
+
+                    /* Receive the result */
+                    MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+
+                    /* Put it in the result queue */
+                    results.push_back( std::string( expr_c ) );
+                    receivedresults++;
+                }
+
+                /* Send more work */
+                send_work_addslave( input, results, src );
+            } else {
+                std::cerr << "Wrong tag received " << status.MPI_TAG << std::endl;
+            }
+
+        }
+    }
+
+    /* Wait until everyone is done */
+
+    running = np - 1; // all the slaves are running
+    while( running > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+
+        if( len != 0 ) {
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+            }
+
+            /* Receive the result */
+            MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+
+            /* Put it in the result queue */
+            results.push_back( std::string( expr_c ) );
+            receivedresults++;
+        }
+        /* Do not send the end signal yet: the slaves take part in the final addition */
+        running--;
+    }
+
+    /* Add whatever I have left */
+    Tens = add_expressions_parall( results, symbols, pzero, comm );
+
+#if DEBUG
+    std::cout << "Received " << receivedresults << " results" << std::endl;
+
+    std::cout << "Tpara=" << Tens << ";" << std::endl;
+#endif
+
+    if( NULL != expr_c ) free( expr_c );
+    return Tens;
+}
+
+void multiply_1level_slave_addslave4( tensor3D_t& T, unsigned int size, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens;
+    int a2, a4;
+    unsigned int len = 0;
+
+    parameters_2_1_t params;
+    MPI_Status status;
+
+    int rank;
+    MPI_Comm_rank( comm, &rank );
+
+    /* Ask for some work */
+
+    MPI_Send( &len, 1, MPI_UNSIGNED, ROOT, TAG_PULL, comm );
+
+    /* Compute the set of symbols */
+
+    gi::lst symbols = all_symbols_3D( size );
+
+    while( true ){
+        /* Receive a set of parameters */
+
+        MPI_Recv( &params, 1, DT_PARAMETERS_2_1, ROOT, MPI_ANY_TAG, comm, &status );
+
+        if( status.MPI_TAG == TAG_WORK ){
+            a4 = params.a4;
+            a2 = params.a2;
+
+            Tens = one_level1_product( &T, size, a4, a2 );
+
+            send_result( Tens );
+
+        } else {
+            if( status.MPI_TAG == TAG_ADD ) {
+                /* Receive a set of expressions to add */
+
+                /* Number of expressions received */
+                int nb = params.a4;
+                a2 = params.a2;
+
+                /* Length of each string */
+
+                unsigned int* lengths = (unsigned int*) malloc( nb*sizeof( unsigned int ) );
+                MPI_Recv( lengths, nb, MPI_UNSIGNED, ROOT, TAG_ADD, comm, &status );
+                std::vector<std::string> results_s;
+                char* c_str;
+                int i;
+                int len;
+                for( i = 0 ; i < nb ; i++ ) {
+                    len = lengths[i] + 1;
+                    c_str = (char*) malloc( len );
+                    MPI_Recv( c_str, len - 1, MPI_CHAR, ROOT, TAG_ADD, comm, &status );
+                    c_str[len - 1] = '\0'; // The master sends C++ strings, which do not contain the final '\0'
+                    results_s.push_back( std::string( c_str ) );
+                    free( c_str );
+                }
+                free( lengths );
+
+                /* Delinearize all the expressions and add them */
+
+                Tens = add_expressions( results_s, symbols );
+
+                /* Send the result */
+
+                send_result( Tens );
+
+            } else {
+                if( status.MPI_TAG == TAG_END ){
+                    return;
+                } else {
+                    std::cerr << "Wrong tag received on slave " << status.MPI_TAG << std::endl;
+                }
+            }
+        }
+    }
+}
+
+/* Communication protocol:
+   M -> W: always the same size, therefore a single message
+   W -> M: send an unsigned int (the size of the expression), then the expression itself (array of chars)
+*/
+
+gi::ex multiply_1level_mw_addslave4( tensor3D_t& T, int size ) { // simpler: same dimension everywhere
+    int rank;
+    gi::ex Tens = 0;
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+    /* Create a new datatype for the parameters */
+
+    create_parameters_datatype_2_1();
+
+    /* Here we go */
+
+    if( 0 == rank ) {
+        Tens = multiply_1level_master_addslave4( T, size );
+    } else {
+        multiply_1level_slave_addslave4( T, size );
+    }
+
+    /* Finalize */
+
+    free_parameters_2_1_dt();
+    return Tens;
+}
diff --git a/src/tensormatrix.h b/src/tensormatrix.h
index f72a89c2d95ab45751a76e000d2a6723aaf30123..7b33ff3429e6e8b060c6c5839d1fb8d86745adc4 100644
--- a/src/tensormatrix.h
+++ b/src/tensormatrix.h
@@ -29,6 +29,7 @@ gi::ex multiply_1level_mw3( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave2( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave3( tensor3D_t&, int );
+gi::ex multiply_1level_mw_addslave4( tensor3D_t&, int );
 gi::ex multiply_2levels_mw_hierarch( tensor3D_t&, int );
 gi::ex multiply_2levels_mw_hierarch2( tensor3D_t&, int );
 gi::ex multiply_combined( tensor3D_t&, int );
diff --git a/src/tensormatrix_mpi.cpp b/src/tensormatrix_mpi.cpp
index c7ccf50df5b72a737824535641d040d09b2d2e33..96bbc9a0ca693daca7d545fc85f26148a57e606f 100644
--- a/src/tensormatrix_mpi.cpp
+++ b/src/tensormatrix_mpi.cpp
@@ -32,7 +32,8 @@ namespace gi = GiNaC;
    - o/O: Master-Worker, middle grain -> multiply_1level_mw3
    - A/a: Master-Worker, addition on a slave -> multiply_1level_mw_addslave
    - B/b: Master-Worker, coarser grain, addition on a slave -> multiply_1level_mw_addslave2
-   - D/d: Master-Worker, middle grain, addition on a slave -> multiply_1level_mw_addslave2
+   - D/d: Master-Worker, middle grain, addition on a slave -> multiply_1level_mw_addslave3
+   - E/e: Master-Worker, middle grain, addition on a slave, parallel final addition -> multiply_1level_mw_addslave4
    - H/h: Hierarchical master-worker -> multiply_1level_mw_hierarch
    - i/I: Hierarchical master-worker, coarser grain -> multiply_1level_mw_hierarch
    - C/c: Combined -> multiply_combined
@@ -115,6 +116,10 @@ int main( int argc, char** argv ){
         case 'd':
             tostart = 'd';
             break;
+        case 'E':
+        case 'e':
+            tostart = 'e';
+            break;
         case 'H':
         case 'h':
             tostart = 'h';
@@ -176,6 +181,9 @@ int main( int argc, char** argv ){
         case 'd':
             Tpara = multiply_1level_mw_addslave3( T, N );
             break;
+        case 'e':
+            Tpara = multiply_1level_mw_addslave4( T, N );
+            break;
         case 'h':
             Tpara = multiply_2levels_mw_hierarch( T, N );
             break;
diff --git a/src/utils_parall.h b/src/utils_parall.h
index 45a978ec47d7f98ae27a484dbd5b66134b5cbb85..9629bc3913590539cff28c09f710bfb218e5e405 100644
--- a/src/utils_parall.h
+++ b/src/utils_parall.h
@@ -27,6 +27,9 @@ public:
     unsigned int a4, a2;
     parameters_2_1_t( unsigned int, unsigned int );
     parameters_2_1_t( void ){};
+    void setA4( unsigned int _a4 ) { this->a4 = _a4; }
+    void setA2( unsigned int _a2 ) { this->a2 = _a2; }
+    void setParams( unsigned int _a4, unsigned int _a2 ) { this->a4 = _a4; this->a2 = _a2; }
 };
 
 class parameters_2_2_t{
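A note on the final addition introduced by mw_addslave4.cpp above: the master cuts each linearized expression into per-peer slices that stop just before a '+' or '-' sign, so every slice parses as a complete partial sum on the receiving slave. The standalone sketch below illustrates that splitting rule outside of MPI. split_at_signs is a hypothetical helper written for this note, not a function of the patch, and the expression in main() is made up.

#include <cmath>
#include <iostream>
#include <string>
#include <vector>

/* Cut a linearized expression into nworkers slices, each ending just
 * before a '+' or '-', so that every slice is a complete partial sum. */
static std::vector<std::string> split_at_signs( const std::string& s, int nworkers ) {
    std::vector<std::string> chunks;
    size_t begin = 0;
    size_t target = (size_t) std::ceil( (double) s.length() / nworkers );
    while( begin < s.length() ) {
        size_t end = begin + target;
        if( end >= s.length() - 1 ) {
            end = s.length() - 1;      /* last slice: take whatever is left  */
        } else {
            while( end < s.length() - 1 && s[end] != '+' && s[end] != '-' )
                end++;                 /* advance to the next sign           */
            if( s[end] == '+' || s[end] == '-' )
                end--;                 /* the sign opens the following slice */
        }
        chunks.push_back( s.substr( begin, end - begin + 1 ) );
        begin = end + 1;
    }
    return chunks;
}

int main() {
    std::string expr = "3*x1+2*x2-x3+5*x4-7*x5+x6";
    for( const std::string& c : split_at_signs( expr, 3 ) )
        std::cout << "[" << c << "]" << std::endl; /* [3*x1+2*x2][-x3+5*x4-7*x5][+x6] */
    return 0;
}

Note that the first slice starts without a sign, which is why add_expressions_parall prepends a '+' to every received partial result that does not already begin with '-' before handing the concatenation to de_linearize_expression.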