diff --git a/src/masterworker.cpp b/src/masterworker.cpp
index 80f0066133f5fab07d324b740bc1174ffbc16a69..0a79fc8188cba1ec7df7770d3c98427c5d01536b 100644
--- a/src/masterworker.cpp
+++ b/src/masterworker.cpp
@@ -31,7 +31,7 @@ gi::ex multiply_1level_master( tensor3D_t& T, matrix_int_t& J, unsigned int size
     expr_c = NULL;
     expr_c = (char*) malloc( 3279 ); // TMP
     
- 	int i, j;
+    int i, j;
     i = 0;
     j = 0;
 
@@ -45,18 +45,18 @@ gi::ex multiply_1level_master( tensor3D_t& T, matrix_int_t& J, unsigned int size
     
     for( a1 = 0 ; a1 < size; a1++ ){
         i=i+1; 
-		for( a2 = 0; a2 < size ; a2++ ){
-			j=j+1; 
-			for( a3 = 0 ; a3 < size ; a3++ ){
-				A = T[a1][a2][a3];
-				for( b1 = 0 ; b1 < size ; b1++ ){
-                    parameters_t p( A, a1, a2, a3, b1 );
-                    input.push_back( p );
-                }
-			}
+	for( a2 = 0; a2 < size ; a2++ ){
+	    j=j+1; 
+	    for( a3 = 0 ; a3 < size ; a3++ ){
+		A = T[a1][a2][a3];
+		for( b1 = 0 ; b1 < size ; b1++ ){
+		    parameters_t p( A, a1, a2, a3, b1 );
+		    input.push_back( p );
 		}
+	    }
 	}
-
+    }
+    
     /* Compute the set of symbols */
     /* Could be done while the first slave is working */
     
diff --git a/src/mw_addslave.cpp b/src/mw_addslave.cpp
index 6bd37dd48306ed307ad5e36144fed8f4504bf2b8..f13546c93987450cee37fc899ea5f6c52e285db2 100644
--- a/src/mw_addslave.cpp
+++ b/src/mw_addslave.cpp
@@ -7,6 +7,7 @@
 #include "parall_constants.h"
 #include "parall_internal.h"
 #include "utils.h"
+#include "profiling.h"
 
 namespace gi = GiNaC;
 
@@ -149,18 +150,22 @@ void multiply_1level_slave_addslave( tensor3D_t& T, matrix_int_t& J, unsigned in
     int rank;
     MPI_Comm_rank( comm, &rank );
 
-    /* Compute the set of symbols */
-    
-    gi::lst symbols = all_symbols_3D( size );
+    double t_start, t_wait, t_compute;
 
     /* Ask for some work */
     
     MPI_Send( &len, 1, MPI_UNSIGNED, ROOT, TAG_PULL, comm );
 
+    /* Compute the set of symbols */
+    
+    gi::lst symbols = all_symbols_3D( size );
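+    /* Note: the TAG_PULL request above is posted before building the symbol
+       list, so this local setup can overlap with the master queuing the
+       first work unit for this rank. */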
+
     while( true ){
         /* Receive a set of parameters */
-        
+
+        t_start = rdtsc();
         MPI_Recv( &params, 1, DT_PARAMETERS, ROOT, MPI_ANY_TAG, comm, &status );
+        t_wait = rdtsc() - t_start;
         
         if( status.MPI_TAG == TAG_WORK ){
             a1 = params.a1;
@@ -168,8 +173,13 @@ void multiply_1level_slave_addslave( tensor3D_t& T, matrix_int_t& J, unsigned in
             a3 = params.a3;
             b1 = params.b1;
             gi::symbol A( std::string( params.A  ) );
-            
+
+            t_start = rdtsc();
             Tens = one_level1_product( &T, &J, A, size, a1, a2, a3, b1 );
+            t_compute = rdtsc() - t_start;
+
+            /* TODO: react if we waited longer for work than we spent computing it */
+            if( t_wait > t_compute ) {}
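+            /* A possible follow-up, sketched only as a comment (TAG_STAT and the
+               matching receive on the master are hypothetical, they do not exist
+               in this code): report the imbalance so the master can react, e.g.
+                   double ratio = (double) t_wait / (double) t_compute;
+                   MPI_Send( &ratio, 1, MPI_DOUBLE, ROOT, TAG_STAT, comm );
+            */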
 
             send_result( Tens );
 
diff --git a/src/mw_combined.cpp b/src/mw_combined.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b79a9199df23dd52742c902999af32f9851d1c60
--- /dev/null
+++ b/src/mw_combined.cpp
@@ -0,0 +1,344 @@
+#include <iostream>
+#include <mpi.h>
+#include <ginac/ginac.h>
+
+#include "products.h"
+#include "utils_parall.h"
+#include "parall_constants.h"
+#include "parall_internal.h"
+#include "utils.h"
+#include "profiling.h"
+
+namespace gi = GiNaC;
+
+typedef enum {
+	      LONG_ADD_M,  /* The addition on the master took a long time */
+	      FINISHED,    /* Computation finished normally */
+} end_code_t;
+
+typedef enum {
+	      ALGO_MW,    /* Regular master-worker */
+	      ALGO_ADDSLAVE,   /* Do the addition on a slave */
+} algo_t;
+
+/* This is a "regular" master. It returns either when the computation is done
+ * or when it decides to switch to another algorithm (here, when adding the
+ * partial results on the master costs more than waiting for them).
+ */
+
+end_code_t multiply_combined_master_initial( tensor3D_t& T, matrix_int_t& J, unsigned int size, gi::ex& Tens, MPI_Comm comm = MPI_COMM_WORLD ) { 
+    unsigned int a1, a2, a3, b1;
+    gi::ex A;
+    gi::lst symbols;
+
+    MPI_Status status;
+    char* expr_c;
+    size_t expr_c_size = 0;
+    int src, np, running = 0;
+    unsigned int len;
+
+    double t_start, t_add, t_wait;
+    algo_t algo = ALGO_MW;
+
+    MPI_Comm_size( comm, &np );
+
+    expr_c = NULL;
+    expr_c = (char*) malloc( 3279 ); // TMP
+    
+    int i, j;
+    i = 0;
+    j = 0;
+
+    int receivedresults = 0;
+
+    std::vector<parameters_t> input;
+    std::vector<std::string> results_s;
+    std::vector<gi::ex> results;
+
+    /* Build a list of argument sets */
+    
+    for( a1 = 0 ; a1 < size; a1++ ){
+        i=i+1; 
+	for( a2 = 0; a2 < size ; a2++ ){
+	    j=j+1; 
+	    for( a3 = 0 ; a3 < size ; a3++ ){
+		A = T[a1][a2][a3];
+		for( b1 = 0 ; b1 < size ; b1++ ){
+		    parameters_t p( A, a1, a2, a3, b1 );
+		    input.push_back( p );
+		}
+	    }
+	}
+    }
+    
+    /* Compute the set of symbols */
+    /* Could be done while the first slave is working */
+    
+    symbols = all_symbols_3D( size );
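+    /* (One way to realize the comment above, not done here: the symbols are
+       only needed once the first result is de-linearized, so all_symbols_3D()
+       could be deferred until after the first TAG_PULL requests are served.) */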
+
+    /* Workers that have yet to send their first request */
+    
+    bool initialround = true;
+    running = 0;
+    
+    /* Distribute the work */
+
+    while( input.size() > 0 ) {
+	t_start = rdtsc();
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+	t_wait = rdtsc() - t_start;
+	std::cout << "wait " << t_wait << std::endl;
+       
+        if( status.MPI_TAG == TAG_PULL ) {
+            
+            /* Nothing else will come: just send some work */
+            send_work( input, src, comm );
+	    
+	    if( initialround ){
+		running++;
+		if( np - 1 == running ) initialround = false; // everyone is at work
+	    }
+            
+        } else {
+            if( status.MPI_TAG == TAG_RES ){
+                src = status.MPI_SOURCE;
+
+                /* The first message contains the length of what is coming next */
+                if( len != 0 ) {
+                    if( len > expr_c_size ) {
+                        expr_c_size = len;
+                        if( NULL != expr_c ) free( expr_c );
+                        expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+                    }
+                    
+                    /* Receive the result */
+                    MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+                        
+                    /* put it in the result queue */
+                    std::string s( expr_c );
+
+		    if( algo == ALGO_ADDSLAVE ) {
+			send_work_addslave( input, results_s, src );
+		    } else {
+			send_work( input, src, comm );
+                    }
+		    
+                    /* Process what I have just received */
+
+		    if( ALGO_MW == algo ) {
+			t_start = rdtsc();
+			gi::ex received = de_linearize_expression( s, symbols );
+			Tens += received;
+			t_add = rdtsc() - t_start;
+			std::cout << "Add " << t_add << std::endl;
+#if DEBUG
+			results.push_back( received );
+			results_s.push_back( s );
+			receivedresults++;
+#endif
+			if( t_add > t_wait ) {
+			    /* The master is spending more time adding results than waiting for
+			       them: from now on, ask a worker to do the additions. */
+			    // TODO: compare against the average of the last np wait times instead of a single sample
+			    // double average = accumulate( v.begin(), v.end(), 0.0 ) / v.size();
+
+			    std::cout << "The master spent too much time computing the sum. Switch to ADDSLAVE algorithm" << std::endl;
+			    algo = ALGO_ADDSLAVE;
+			}
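+			/* Sketch of the averaging mentioned in the TODO above ('waits' is
+			   hypothetical, it is not declared in this file): keep the last
+			   np - 1 wait times in a std::deque<double> waits (with <deque> and
+			   <numeric> included), then
+			       waits.push_back( t_wait );
+			       if( waits.size() > (size_t) ( np - 1 ) ) waits.pop_front();
+			       double avg_wait = std::accumulate( waits.begin(), waits.end(), 0.0 ) / waits.size();
+			   and compare t_add against avg_wait instead of the last sample. */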
+		    } else {
+			if( ALGO_ADDSLAVE == algo ) {
+			    results_s.push_back( s );
+			} else {
+			    std::cout << "ERROR: unknown algorithm on the master " << algo << std::endl;
+			}
+		    }
+                } else {
+                    /* Send more work  */
+                    send_work( input, src, comm );
+                }                
+            } else{
+                std::cerr << "Wrong tag received " << status.MPI_TAG << std::endl;
+            }
+        }
+   }
+
+   /* Wait until everyone is done */
+
+    running = np - 1; // all the slaves are running 
+    while( running > 0 ) {
+
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+
+        if( len != 0 ) {
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+            }
+
+            /* Receive the result */
+            MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+            
+            /* And send the END signal */
+            send_end( src, comm );
+            running--;
+            
+            /* Process what I have just received */
+            /* Could be given to a slave... */
+            /* put it in the result queue */
+            std::string s( expr_c );
+            gi::ex received = de_linearize_expression( s, symbols );
+            Tens += received;
+#if DEBUG
+            results.push_back( received );
+            results_s.push_back( s );
+            receivedresults++;
+#endif
+        } else {
+            send_end( src, comm );
+            running--;
+        }
+    }
+    
+#if DEBUG
+    std::cout << "Received " << receivedresults << " results" << std::endl;
+
+    std::cout << "Tpara=" << Tens << ";" << std::endl;
+#endif
+    
+    if( NULL != expr_c) free( expr_c );
+    return FINISHED;
+}
+
+/* The traditional slave */
+
+void multiply_combined_slave_initial( tensor3D_t& T, matrix_int_t& J, int size, MPI_Comm comm = MPI_COMM_WORLD ) {
+    gi::ex Tens;
+    int  a1, a2, a3, b1;
+    //    gi::ex A;
+    unsigned int len = 0;
+    
+    parameters_t params;
+    MPI_Status status;
+    char* expr_c;
+
+    int rank;
+    MPI_Comm_rank( comm, &rank );
+    
+    /* Ask for some work */
+    
+    MPI_Send( &len, 1, MPI_UNSIGNED, ROOT, TAG_PULL, comm );
+
+    /* Compute the set of symbols */
+    
+    gi::lst symbols = all_symbols_3D( size );
+
+    while( true ){
+        /* Receive a set of parameters */
+        
+        MPI_Recv( &params, 1, DT_PARAMETERS, ROOT, MPI_ANY_TAG, comm, &status );
+        
+        if( status.MPI_TAG == TAG_WORK ){
+            a1 = params.a1;
+            a2 = params.a2;
+            a3 = params.a3;
+            b1 = params.b1;
+            gi::symbol A( std::string( params.A  ) );
+            
+            Tens = one_level1_product( &T, &J, A, size, a1, a2, a3, b1 );
+            send_result( Tens );
+
+        } else {
+            if( status.MPI_TAG == TAG_ADD ) {
+                /* Receive a set of expressions to add */
+		
+                /* Number of expressions received */
+                int nb = params.a1;
+                
+                /* Length of each string */
+		
+                unsigned int* lengths = (unsigned int*) malloc( nb*sizeof( unsigned int ) );
+                MPI_Recv( lengths, nb, MPI_INT, ROOT, TAG_ADD, comm, &status );
+                std::vector<std::string> results_s;
+                char* c_str;
+                int i;
+                int len;
+                for( i = 0 ; i < nb ; i++ ) {
+                    len = lengths[i] + 1;
+                    c_str = (char*) malloc( len );
+                    MPI_Recv( c_str, len, MPI_CHAR, ROOT, TAG_ADD, comm, &status );
+                    c_str[len-1] = '\0';    // The master sends C++ strings, which do not contain the final '\0'
+                    results_s.push_back( std::string( c_str ) );
+                    free( c_str );
+                }
+
+                /* Delinearize all the expressions and add them */
+		
+                Tens = add_expressions( results_s, symbols );
+                
+                /* Send the result */
+		
+                send_result( Tens );
+		
+            } else {
+		if( status.MPI_TAG == TAG_END ){
+		    return;
+		} else {
+		    std::cerr << "Wrong tag received on slave " << status.MPI_TAG << std::endl;
+		}
+	    }
+        }
+    }
+}
+
+/*******************************************************************************
+ *                         Combined master-worker                              *
+ *******************************************************************************/
+
+gi::ex multiply_combined_master( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+    gi::ex Tens = 0;
+    end_code_t rc;
+    
+    /* Initially: start as a traditional M/W */
+    
+    rc = multiply_combined_master_initial( T, J, size, Tens );
+    switch( rc ){
+    case FINISHED:
+	return Tens;
+    case LONG_ADD_M:
+	/* Not handled yet: this is where the master would hand the additions
+	   over to a slave and keep distributing work. */
+	break;
+    }
+    
+    return Tens;
+}
+
+void multiply_combined_worker( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+    gi::ex Tens = 0;
+    
+    std::cout << "worker" << std::endl;
+    multiply_combined_slave_initial( T, J, size );
+    
+}
+
+
+gi::ex multiply_combined( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+
+    int rank;
+    gi::ex Tens = 0;
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+    /* Create a new datatype for the parameters */
+    
+    create_parameters_datatye();
+
+    /* Here we go */
+    
+    if( 0 == rank ) {
+        Tens = multiply_combined_master( T, J, size );
+    } else {
+        multiply_combined_worker( T, J, size );
+    }
+
+    /* Finalize */
+    
+    free_parameters_dt();
+    return Tens;
+
+}
diff --git a/src/sequential.cpp b/src/sequential.cpp
index 855104a7f570fe315a8f9b61fb1a463509afb7e0..0aa406557902abdeae64b064047c47ee88a19649 100644
--- a/src/sequential.cpp
+++ b/src/sequential.cpp
@@ -35,15 +35,15 @@ gi::ex multiply_seq( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: s
 			j=j+1; 
             //			printf("j = %d\n", j);
 			for( a3 = 0 ; a3 < size ; a3++ ){
-                TAU_START( timerA );
-				A = T[a1][a2][a3];
-                /* Beyond this point, a2 and a3 are only used in the simplectic matrix */
+			    TAU_START( timerA );
+			    A = T[a1][a2][a3];
+			    /* Beyond this point, a2 and a3 are only used in the symplectic matrix */
 				for( b1 = 0 ; b1 < size ; b1++ ){
 					TAB = J[a1][b1]; 
 					for( b2 = 0 ; b2 < size ; b2++ ){
 						for( b3 = 0 ; b3 < size ; b3++ ){
 						  TAU_START( timerB );
-                           /* Beyond this point, b1 is not used anymore */
+						  /* Beyond this point, b1 is not used anymore */
 							TABB =  TAB * A*T[b1][b2][b3];
 							for( c1 = 0 ; c1 < size ; c1++ ){
 								for( c2 = 0 ; c2 < size ; c2++ ){
@@ -60,33 +60,33 @@ gi::ex multiply_seq( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: s
 												  Tens = Tens + TABCDD * T[d1][d2][d3]*J[a3][d3];
 												  t_end = rdtsc();
 												  
-                                                    TAU_STOP( timeradd );
+												  TAU_STOP( timeradd );
 #ifdef TAUPROF
 						    //                                                    std::cout << "add " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
-                                                    printf( "add %lf %lu len %d\n", getTimeSpent( timeradd ), t_end - t_start, Tens.nops() );
+												  printf( "add %lf %lu len %d\n", getTimeSpent( timeradd ), t_end - t_start, Tens.nops() );
 
 						    //                                                    std::cout << Tens << std::endl;
 #endif // TAUPROF
    												}
-                                            }
+											}
 										}
 									}
 								}
 							}
-                            TAU_STOP( timerB );
+							TAU_STOP( timerB );
 #ifdef TAUPROF
-                            std::cout << "B " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
+							std::cout << "B " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
 #endif // TAUPROF
 						}
-                    }
+					}
 				}
-                TAU_STOP( timerA );
+				TAU_STOP( timerA );
 #ifdef TAUPROF
-                std::cout << "A " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
+				std::cout << "A " << getTimeSpent( timeradd ) << " len " << Tens.nops() << std::endl;
 #endif // TAUPROF
 			}
 		}
-	}
+    }
     return Tens;
 }
 
@@ -115,15 +115,16 @@ gi::ex multiply_1level( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler
 			j=j+1; 
             //	printf("j = %d\n", j);
 			for( a3 = 0 ; a3 < size ; a3++ ){
-                TAU_START( "b" );
+			  //                TAU_START( "b" );
 				A = T[a1][a2][a3];
                 /* Beyond this point, a2 and a3 are only used in the simplectic matrix */
 				for( b1 = 0 ; b1 < size ; b1++ ){
                     Tn = one_level1_product( &T, &J, A, size, a1, a2, a3, b1 );
                     Tens += Tn;
                 }
-                TAU_STOP( "b" );
-#ifdef TAUPROF
+				//                TAU_STOP( "b" );
+				//#ifdef TAUPROF
+				#if 0
                 time = getTimeSpent( "b" );
 #endif // TAUPROF                
 			}
@@ -147,7 +148,7 @@ gi::ex one_level1_product( tensor3D_t* T, matrix_int_t *J, gi::ex A, int size, i
             TABB =  TAB * A*(*T)[b1][b2][b3];
             T5 = 0;
             /* Beyond this point, b1 is not used anymore */
-            TAU_START( timerB );
+	    //            TAU_START( timerB );
             for( c1 = 0 ; c1 < size ; c1++ ){
                 T4 = 0;
                 for( c2 = 0 ; c2 < size ; c2++ ){
@@ -176,7 +177,7 @@ gi::ex one_level1_product( tensor3D_t* T, matrix_int_t *J, gi::ex A, int size, i
                 T5 += T4;
             }
             Tens += T5;
-            TAU_STOP( timerB );
+	    //            TAU_STOP( timerB );
         }
     }
 
diff --git a/src/tensormatrix.h b/src/tensormatrix.h
index 246010a5f050afb5d84969fca04c85ab38c5de97..bcf5d8d86eb90c460db9319bb8b4e5fa99ea5983 100644
--- a/src/tensormatrix.h
+++ b/src/tensormatrix.h
@@ -26,6 +26,7 @@ gi::ex multiply_2levels( tensor3D_t&, matrix_int_t&, int );
 gi::ex multiply_1level_mw( tensor3D_t&, matrix_int_t&, int );
 gi::ex multiply_1level_mw_addslave( tensor3D_t&, matrix_int_t&, int );
 gi::ex multiply_1level_mw_hierarch( tensor3D_t&, matrix_int_t&, int );
+gi::ex multiply_combined( tensor3D_t&, matrix_int_t&, int );
 
 /*******************************************************************************
  *                              Default values                                 *
diff --git a/src/tensormatrix_mpi.cpp b/src/tensormatrix_mpi.cpp
index ef6a4c915626e11579737d1d69aa8299e5485cef..e72db52db810e224783ab809487347aa1a41813c 100644
--- a/src/tensormatrix_mpi.cpp
+++ b/src/tensormatrix_mpi.cpp
@@ -26,6 +26,7 @@ namespace gi = GiNaC;
    - M/m: Master-Worker -> multiply_1level_mw
    - A/a: Master-Worker, addition on a slave -> multiply_1level_mw_addslave
    - H/h: Hierarchical master-worker -> multiply_1level_mw_hierarch
+   - C/c: Combined -> multiply_combined
 */
 
 /* Sequentiel sur Minimum
@@ -86,6 +87,10 @@ int main( int argc, char** argv ){
             case 'h':
                 tostart = 'h';
                 break;
+            case 'C':
+            case 'c':
+                tostart = 'c';
+                break;
             case 'S':
             case 's':
                 tostart = 's';
@@ -126,6 +131,9 @@ int main( int argc, char** argv ){
     case 'h':
         Tpara = multiply_1level_mw_hierarch( T, J, N );
         break;
+    case 'c':
+        Tpara = multiply_combined( T, J, N );
+        break;
     case 's':
         Tpara = multiply_seq( T, J, N );
         break;