diff --git a/src/Makefile b/src/Makefile
index db349944217d64bb42965ab207d409ef64936b4e..59876781d359430ede4b6a1f68356a1a888588ff 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -30,7 +30,7 @@ MPISRC = masterworker.cpp mw_addslave.cpp hierarchical.cpp  \
          perf.cpp  sequential.cpp  tensormatrix_mpi.cpp      \
          utils.cpp  utils_parall.cpp profiling.cpp mw_combined.cpp \
 	 masterworker2.cpp mw_addslave2.cpp hierarchical2.cpp  \
-	 masterworker3.cpp mw_addslave3.cpp 
+	 masterworker3.cpp mw_addslave3.cpp mw_addslave4.cpp 
 
 MPIOBJ= $(MPISRC:.cpp=.o)
 
diff --git a/src/masterworker.cpp b/src/masterworker.cpp
index 5bbb605e555d771d456467cb56bb80fa296808d6..b3f2cbe2b8a8baacaca0d13a4c0602ba74d0e069 100644
--- a/src/masterworker.cpp
+++ b/src/masterworker.cpp
@@ -31,10 +31,6 @@ gi::ex multiply_1level_master( tensor3D_t& T, unsigned int size, MPI_Comm comm =
     expr_c = NULL;
     expr_c = (char*) malloc( 3279 ); // TMP
     
-    int i, j;
-    i = 0;
-    j = 0;
-
     int receivedresults = 0;
     unsigned int N = size/2;
 
@@ -45,9 +41,7 @@ gi::ex multiply_1level_master( tensor3D_t& T, unsigned int size, MPI_Comm comm =
     /* Build a list of argument sets */
     
     for( a4 = 0 ; a4 < N ; a4++ ){
-        i=i+1; 
         for( a2 = 0; a2 < N ; a2++ ){
-            j=j+1; 
             for( a1 = 0 ; a1 < N ; a1++ ){
                 parameters_t p( a4, a2, a1 );
                 input.push_back( p );
diff --git a/src/mw_addslave.cpp b/src/mw_addslave.cpp
index 91b8a04432cc06f56bf479388fbbdb9d1ce20d9d..833dff363967ae9438b317283365427ff111c42d 100644
--- a/src/mw_addslave.cpp
+++ b/src/mw_addslave.cpp
@@ -42,6 +42,8 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
     std::vector<parameters_t> input;
     std::vector<std::string> results; /* length and char* */
 
+    double t1 = getTime();
+    
     /* Build a list of argument sets */
     
     for( a4 = 0 ; a4 < N ; a4++ ){
@@ -60,6 +62,8 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
     
     symbols = all_symbols_3D( size );
 
+    double t2 = getTime();
+    
     /* Distribute the work */
 
     while( input.size() > 0 ) {
@@ -98,6 +102,8 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
         }
    }
 
+    double t3 = getTime();
+
    /* Wait until everyone is done */
 
     running = np - 1; // all the slaves are running 
@@ -122,9 +128,17 @@ gi::ex multiply_1level_master_addslave( tensor3D_t& T, unsigned int size, MPI_Co
         send_add_or_end_addslave( results, src, &running );
     }
 
+    double t4 = getTime();
+
     /* Add whatever I have left */
     Tens = add_expressions( results, symbols );
-    
+
+    double t5 = getTime();
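+    /* Phase breakdown: Init = building the work list and the symbols,
+       Loop = distributing the work, Fini = draining the slaves,
+       Add  = the final local addition */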
+    std::cout << "Init: " << t2 - t1 << std::endl;
+    std::cout << "Loop: " << t3 - t2 << std::endl;
+    std::cout << "Fini: " << t4 - t3 << std::endl;
+    std::cout << "Add:  " << t5 - t4 << std::endl;
+
 #if DEBUG
     std::cout << "Received " << receivedresults << " results" << std::endl;
 
diff --git a/src/mw_addslave2.cpp b/src/mw_addslave2.cpp
index e64a7a4d1b6ff35cc129dfc49ae5b7420f89cd8a..9e0aaf9b99d631e04b6764bb47687999e0bd9775 100644
--- a/src/mw_addslave2.cpp
+++ b/src/mw_addslave2.cpp
@@ -183,8 +183,10 @@ void multiply_1level_slave_addslave2( tensor3D_t& T, unsigned int size, MPI_Comm
 
                 /* Delinearize all the expressions and add them */
 
+                double t1 = getTime();
                 Tens = add_expressions( results_s, symbols );
-                
+                std::cout << "Addition: " << getTime() - t1 << std::endl;
+
                 /* Send the result */
 
                 send_result( Tens );
diff --git a/src/mw_addslave4.cpp b/src/mw_addslave4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..016269ebb2b27393fde92703ba47c3159c8c488e
--- /dev/null
+++ b/src/mw_addslave4.cpp
@@ -0,0 +1,343 @@
+#include <iostream>
+#include <mpi.h>
+#include <ginac/ginac.h>
+#include <math.h> // for ceil
+
+#include "products.h"
+#include "utils_parall.h"
+#include "parall_constants.h"
+#include "parall_internal.h"
+#include "utils.h"
+#include "profiling.h"
+
+namespace gi = GiNaC;
+
+#define MAXLENADD  1 // 256
+
+unsigned int maxlen(  std::vector<std::string> expressions ){
+    unsigned int len = 0;
+    for( auto s: expressions  ) {
+        unsigned int l2 = s.length();
+        if( len < l2 ) {
+            len = l2;
+        }
+    }
+    return len;
+}
+
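+/* Parallel final addition. The master cuts every linearized expression into
+   one chunk per slave, splitting only at a '+' or '-' sign so that each chunk
+   is itself a well-formed expression (this assumes signs separate monomials
+   in the linearized form, and that comm holds at least two ranks). Each slave
+   delinearizes and adds the chunks it receives and returns its partial sum;
+   the master concatenates the partial sums and delinearizes the total.
+   For example, with two slaves, "x1+x2-x3+x4" is cut into "x1+x2" and
+   "-x3+x4". */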
+gi::ex add_expressions_parall( std::vector<std::string> expressions, gi::lst symbols, parameters_2_1_t p, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens = 0;
+    int size, i, nb;
+    unsigned int chunk, end, len;
+    std::vector<unsigned int> cut;
+    unsigned int* lengths;
+    std::string result;
+    char* expr;
+    MPI_Status status;
+    size_t expr_c_size = 0;
+    char* expr_c;
+
+    /* If the expressions are short, compute the sum locally */
+    if( maxlen( expressions ) < MAXLENADD )
+        return add_expressions( expressions, symbols );
+    
+    MPI_Comm_size( comm, &size );
+    nb = expressions.size();
+    lengths = (unsigned int*) malloc( nb * sizeof( unsigned int ) );
+    for( i = 0 ; i < nb ; i++ ) {
+        cut.push_back( 0 );
+        lengths[i] = 0;
+    }
+    unsigned int running = size - 1;
+    p.setParams( nb, 1 ); /* reuse the parameters as a message header: a4 = number of chunks */
+
+    /* TODO: this could be factored out with send_expressions_to_add */
+
+    for( int peer = 1 ; peer < size ; peer++ ) {
+    
+        i = 0;
+        for( auto s: expressions  ) {
+            /* How much are we going to send: stop at a + or - sign (the sign opens the next chunk) */
+            chunk = ceil( (double) s.length() / ( size - 1 ) ); // integer division would make the ceil a no-op
+            end = cut[i] + chunk;
+            if( end > s.length() - 1 ) end = s.length() - 1;   // do not run past the end of the string
+            while( end < s.length() - 1 && s[end] != '+' && s[end] != '-' ){
+                end++;
+            }
+            if( s[end] == '+' || s[end] == '-' ) end--;        // but do not drop the final character
+
+            lengths[i] = end - cut[i] + 1;
+            i++;
+        }
+        
+        /* Send the lengths */
+        MPI_Send( &p, 1, DT_PARAMETERS_2_1, peer, TAG_ADD, comm );
+        MPI_Send( lengths, nb, MPI_UNSIGNED, peer, TAG_ADD, comm ); // lengths is unsigned int*
+        
+        /* Send the strings */
+        
+        for( int j = 0 ; j < nb ; j++ ) {
+            expr = const_cast<char*>( expressions[j].c_str() );
+        
+           MPI_Send( &( expr[ cut[j] ] ), lengths[j], MPI_CHAR, peer, TAG_ADD, comm );
+           cut[j] += lengths[j];
+        }
+    }
+    
+    /* Receive the results */
+    
+    expr_c = NULL;
+
+    while( running > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        int src = status.MPI_SOURCE;
+
+        if( len != 0 ) {
+            
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+            }
+
+            /* Receive the result; len counts the terminating '\0' */
+            MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+            
+            /* Concatenate the result */
+            std::string recvs( expr_c );
+            if( expr_c[0] != '-' ) result += '+';
+            result += recvs;
+        }
+        running--;
+        send_end( src, p );
+    }
+        
+    Tens = de_linearize_expression( result, symbols );
+            
+    free( lengths );
+    free( expr_c );
+    return Tens;
+}
+
+/*******************************************************************************
+ *         Parallel 1-level decomposition with addition on a slave             *
+ *******************************************************************************/
+
+gi::ex multiply_1level_master_addslave4( tensor3D_t& T, unsigned int size, MPI_Comm comm = MPI_COMM_WORLD ) { 
+    gi::ex Tens = 0;
+    unsigned int a2, a4;
+    gi::lst symbols;
+
+    MPI_Status status;
+    char* expr_c;
+    size_t expr_c_size = 0;
+    int src, np;
+    unsigned int len, running = 0;
+    parameters_2_1_t pzero( 0, 0 );
+
+    MPI_Comm_size( comm, &np );
+
+    expr_c = (char*) malloc( 3279 ); // TMP
+    
+    int receivedresults = 0;
+    unsigned int N = size/2;
+
+    std::vector<parameters_2_1_t> input;
+    std::vector<std::string> results; /* length and char* */
+
+    /* Build a list of argument sets */
+    
+    for( a4 = 0 ; a4 < N ; a4++ ){
+        for( a2 = 0; a2 < N ; a2++ ){
+            parameters_2_1_t p( a4, a2 );
+            input.push_back( p );
+        }
+    }
+
+    /* Compute the set of symbols */
+    /* Could be done while the first slave is working */
+    
+    symbols = all_symbols_3D( size );
+
+    /* Distribute the work */
+
+    while( input.size() > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        
+        if( status.MPI_TAG == TAG_PULL ) {
+            /* Nothing else will come: just send some work */
+            src = status.MPI_SOURCE;
+            send_work( input, src );
+            
+        } else {
+            if( status.MPI_TAG == TAG_RES ){
+                src = status.MPI_SOURCE;
+
+                /* The first message contains the length of what is coming next */
+                if( len != 0 ) {
+                    if( len > expr_c_size ) {
+                        expr_c_size = len;
+                        if( NULL != expr_c ) free( expr_c );
+                        expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+                    }
+                    
+                    /* Receive the result */
+                    MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+
+                    /* Put it in the result queue */
+                    results.push_back( std::string( expr_c ) );
+                }
+
+                /* Send more work  */
+                send_work_addslave( input, results, src );
+            } else {
+                std::cerr << "Wrong tag received " << status.MPI_TAG << std::endl;
+            }
+            
+        }
+   }
+
+   /* Wait until everyone is done */
+
+    running = np - 1; // all the slaves are running 
+    while( running > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+        
+        if( len != 0 ) {
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+            }
+
+            /* Receive the result */
+            MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+
+            /* Put it in the result queue */
+            results.push_back( std::string( expr_c ) );
+        }
+        /* Do not send the end signal yet */
+        running--;
+    }
+
+    /* Add whatever I have left */
+    Tens = add_expressions_parall( results, symbols, pzero, comm );
+    
+#if DEBUG
+    std::cout << "Received " << receivedresults << " results" << std::endl;
+
+    std::cout << "Tpara=" << Tens << ";" << std::endl;
+#endif
+    
+    if( NULL != expr_c) free( expr_c );
+    return Tens;
+}
+
+void multiply_1level_slave_addslave4( tensor3D_t& T, unsigned int size, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens;
+    int  a2, a4;
+    unsigned int len = 0;
+    
+    parameters_2_1_t params;
+    MPI_Status status;
+    char* expr_c;
+
+    int rank;
+    MPI_Comm_rank( comm, &rank );
+
+    /* Ask for some work */
+    
+    MPI_Send( &len, 1, MPI_UNSIGNED, ROOT, TAG_PULL, comm );
+
+    /* Compute the set of symbols */
+    
+    gi::lst symbols = all_symbols_3D( size );
+
+    while( true ){
+        /* Receive a set of parameters */
+
+        MPI_Recv( &params, 1, DT_PARAMETERS_2_1, ROOT, MPI_ANY_TAG, comm, &status );
+        
+        if( status.MPI_TAG == TAG_WORK ){
+            a4 = params.a4;
+            a2 = params.a2;
+
+            Tens = one_level1_product( &T, size, a4, a2 );
+
+            send_result( Tens );
+
+        } else {
+            if( status.MPI_TAG == TAG_ADD ) {
+                /* Receive a set of expressions to add */
+
+                /* Number of expressions received */
+                int nb = params.a4;
+                a2 = params.a2;
+
+                /* Length of each string */
+
+                unsigned int* lengths = (unsigned int*) malloc( nb*sizeof( unsigned int ) );
+                MPI_Recv( lengths, nb, MPI_UNSIGNED, ROOT, TAG_ADD, comm, &status );
+                std::vector<std::string> results_s;
+                char* c_str;
+                int i;
+                int len;
+                for( i = 0 ; i < nb ; i++ ) {
+                    len = lengths[i] + 1 ;
+                    c_str = (char*) malloc( len );
+                    MPI_Recv( c_str, len - 1, MPI_CHAR, ROOT, TAG_ADD, comm, &status );
+                    c_str[len - 1] = '\0';    // The master sends C++ strings, which do not contain the final '\0'
+                    results_s.push_back( std::string( c_str ) );
+                    free( c_str );
+                }
+
+                /* Delinearize all the expressions and add them */
+
+                Tens = add_expressions( results_s, symbols );
+                
+                /* Send the result */
+
+                send_result( Tens );
+
+            } else {
+                if( status.MPI_TAG == TAG_END ){
+                    return;
+                } else {
+                    std::cerr << "Wrong tag received on slave " << status.MPI_TAG << std::endl;
+                }
+            }
+        }
+    }
+}
+
+/* Communication protocol:
+   M -> W: always the same size, therefore unique communication
+   W -> M: send an unsigned int (size of the expression), then the expression (table of chars)
+*/
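+
+/* Final addition protocol (TAG_ADD), cf. add_expressions_parall:
+   M -> W: one parameters_2_1_t (a4 = number of chunks), then the chunk
+           lengths, then the chunks themselves (no trailing '\0')
+   W -> M: an unsigned int (size of the partial sum), then the partial sum */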
+        
+gi::ex multiply_1level_mw_addslave4( tensor3D_t& T, int size ) {  // simpler: same dimension everywhere
+    int rank;
+    gi::ex Tens = 0;
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+    /* Create a new datatype for the parameters */
+    
+    create_parameters_datatype_2_1();
+
+    /* Here we go */
+
+    if( 0 == rank ) {
+        Tens = multiply_1level_master_addslave4( T, size );
+    } else {
+        multiply_1level_slave_addslave4( T, size );
+    }
+
+    /* Finalize */
+    
+    free_parameters_2_1_dt();
+    return Tens;
+}
+
diff --git a/src/tensormatrix.h b/src/tensormatrix.h
index f72a89c2d95ab45751a76e000d2a6723aaf30123..7b33ff3429e6e8b060c6c5839d1fb8d86745adc4 100644
--- a/src/tensormatrix.h
+++ b/src/tensormatrix.h
@@ -29,6 +29,7 @@ gi::ex multiply_1level_mw3( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave2( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave3( tensor3D_t&, int );
+gi::ex multiply_1level_mw_addslave4( tensor3D_t&, int );
 gi::ex multiply_2levels_mw_hierarch( tensor3D_t&, int );
 gi::ex multiply_2levels_mw_hierarch2( tensor3D_t&, int );
 gi::ex multiply_combined( tensor3D_t&, int );
diff --git a/src/tensormatrix_mpi.cpp b/src/tensormatrix_mpi.cpp
index c7ccf50df5b72a737824535641d040d09b2d2e33..96bbc9a0ca693daca7d545fc85f26148a57e606f 100644
--- a/src/tensormatrix_mpi.cpp
+++ b/src/tensormatrix_mpi.cpp
@@ -32,7 +32,8 @@ namespace gi = GiNaC;
    - o/O: Master-Worker, middle grain -> multiply_1level_mw3
    - A/a: Master-Worker, addition on a slave -> multiply_1level_mw_addslave
    - B/b: Master-Worker, coarser grain, addition on a slave -> multiply_1level_mw_addslave2
-   - D/d: Master-Worker, middle grain, addition on a slave -> multiply_1level_mw_addslave2
+   - D/d: Master-Worker, middle grain, addition on a slave -> multiply_1level_mw_addslave3
+   - E/e: Master-Worker, middle grain, addition on a slave, parallel final addition -> multiply_1level_mw_addslave4
    - H/h: Hierarchical master-worker -> multiply_1level_mw_hierarch
    - i/I: Hierarchical master-worker, coarser grain -> multiply_1level_mw_hierarch
    - C/c: Combined -> multiply_combined
@@ -115,6 +116,10 @@ int main( int argc, char** argv ){
             case 'd':
                 tostart = 'd';
                 break;
+            case 'E':
+            case 'e':
+                tostart = 'e';
+                break;
             case 'H':
             case 'h':
                 tostart = 'h';
@@ -176,6 +181,9 @@ int main( int argc, char** argv ){
     case 'd':
         Tpara = multiply_1level_mw_addslave3( T, N );
         break;
+    case 'e':
+        Tpara = multiply_1level_mw_addslave4( T, N );
+        break;
     case 'h':
         Tpara = multiply_2levels_mw_hierarch( T, N );
         break;
diff --git a/src/utils_parall.h b/src/utils_parall.h
index 45a978ec47d7f98ae27a484dbd5b66134b5cbb85..9629bc3913590539cff28c09f710bfb218e5e405 100644
--- a/src/utils_parall.h
+++ b/src/utils_parall.h
@@ -27,6 +27,9 @@ public:
     unsigned int a4, a2;
     parameters_2_1_t( unsigned int, unsigned int );
     parameters_2_1_t( void ){};
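+    /* Setters used by the parallel final addition (mw_addslave4), which
+       reuses a parameter object as a message header (a4 = number of chunks) */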
+    void setA4( unsigned int _a4 ) { this->a4 = _a4; }
+    void setA2( unsigned int _a2 ) { this->a2 = _a2; }
+    void setParams( unsigned int _a4, unsigned int _a2 ) { this->a4 = _a4; this->a2 = _a2; }
 };
 
 class parameters_2_2_t{