diff --git a/src/Makefile b/src/Makefile
index cb3cdf2b1c47c34c7a3f8bae8ad35b4d2116399b..d9a4393e3c0d2d4a07e46d7a026a44f62dda19ed 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,10 +26,10 @@ LDOPT = -lginac $(TAULIB)
 MPIEXEC = mpiexec
 NP = 5
 
-MPISRC = masterworker.cpp mw_addslave.cpp \
+MPISRC = masterworker.cpp mw_addslave.cpp hierarchical.cpp  \
          perf.cpp  sequential.cpp  tensormatrix_mpi.cpp      \
          utils.cpp  utils_parall.cpp profiling.cpp mw_combined.cpp
-#  hierarchical.cpp  
+
 MPIOBJ= $(MPISRC:.cpp=.o)
 
 
diff --git a/src/hierarchical.cpp b/src/hierarchical.cpp
index ed447ccab0b12a862f5462ac99e7a8e2e00a9c92..87168ffc7e0e8bdaedd12b529563d433a3677d58 100644
--- a/src/hierarchical.cpp
+++ b/src/hierarchical.cpp
@@ -61,12 +61,16 @@ void create_communicators_hierarch( MPI_Comm& COMM_FOREMEN, MPI_Comm& COMM_TEAM
 
 }
 
-gi::ex multiply_1level_foreman_hierarch_distribute_work( tensor3D_t& T, matrix_int_t& J, int size, parameters_t params, gi::lst symbols, MPI_Comm comm_team, int rank_foreman /* DEBUG */ ) {
+/*******************************************************************************
+ *                                  Foreman                                    *
+ *******************************************************************************/
+
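+/* Expands one (a4, a2) parameter set received from the master into the (a1, a6)
+   sub-problems, distributes them to the workers of this team, and returns the
+   accumulated partial sum. */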
+gi::ex multiply_2levels_foreman_hierarch_distribute_work( tensor3D_t& T, int size, parameters_2_1_t params, gi::lst symbols, MPI_Comm comm_team, int rank_foreman /* DEBUG */ ) {
 
     gi::ex Tens = 0;
-    gi::ex A;
-    int  a1, a2, a3, b1, b2, b3, c1, c2;
-    
+    int  a1, a2, a4, a6;
+    int N = size / 2;
+
     MPI_Status status;
     int unsigned len;
     char* expr_c;
@@ -78,28 +82,21 @@ gi::ex multiply_1level_foreman_hierarch_distribute_work( tensor3D_t& T, matrix_i
 
     /* Compute the points that need to be computed */
 
-    A = gi::symbol( params.A );
-    a1 = params.a1;
     a2 = params.a2;
-    a3 = params.a3;
-    b1 = params.b1;
+    a4 = params.a4;
     
-    std::vector<parameters_2_t> input;
+    std::vector<parameters_2_2_t> input;
     std::vector<gi::ex> results;
 
-    for( b2 = 0 ; b2 < size ; b2++ ) {
-        for( b3 = 0 ; b3 < size ; b3++ ) {
-            for( c1 = 0 ; c1 < size ; c1++ ) {
-                for( c2 = 0 ; c2 < size ; c2++ ) {
-                    parameters_2_t p( A, a1, a2, a3, b1, b2, b3, c1, c2 );
-                    input.push_back( p );
-                }
-            }
+    for( a6 = 0 ; a6 < N ; a6++ ) {
+        for( a1 = 0 ; a1 < N ; a1++ ) {
+            parameters_2_2_t p( a4, a2, a1, a6 );
+            input.push_back( p );
         }
     }
     
     /* Distribute the work */
-    /* Very copy/paste from multiply_1level_master -> possible refactoring here */
+    /* Largely copied from multiply_2levels_master -> possible refactoring here */
     
     while( input.size() > 0 ) {
         MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm_team, &status );
@@ -182,7 +179,7 @@ gi::ex multiply_1level_foreman_hierarch_distribute_work( tensor3D_t& T, matrix_i
     return Tens;
 }
 
-void multiply_1level_foreman_hierarch_finalize( MPI_Comm comm_team ) {
+void multiply_2levels_foreman_hierarch_finalize( MPI_Comm comm_team ) {
 
     int src, np, running = 0;
     unsigned int len;
@@ -200,13 +197,13 @@ void multiply_1level_foreman_hierarch_finalize( MPI_Comm comm_team ) {
     return; 
 }
 
-void multiply_1level_foreman_hierarch( tensor3D_t& T, matrix_int_t& J, int size, MPI_Comm comm_foremen, MPI_Comm comm_team ) {
+void multiply_2levels_foreman_hierarch( tensor3D_t& T, int size, MPI_Comm comm_foremen, MPI_Comm comm_team ) {
 
     gi::ex Tens;
     gi::lst symbols;
     unsigned int len = 0;
 
-    parameters_t params;
+    parameters_2_1_t params;
     MPI_Status status;
     char* expr_c;
 
@@ -224,25 +221,25 @@ void multiply_1level_foreman_hierarch( tensor3D_t& T, matrix_int_t& J, int size,
     while( true ){
         
         /* Receive a set of parameters */
-        
-        MPI_Recv( &params, 1, DT_PARAMETERS, ROOT, MPI_ANY_TAG, comm_foremen, &status );
-        
+
+        MPI_Recv( &params, 1, DT_PARAMETERS_2_1, ROOT, MPI_ANY_TAG, comm_foremen, &status );
+
         if( status.MPI_TAG == TAG_WORK ){
             
             /* Distribute the work on my workers */
         
-            Tens = multiply_1level_foreman_hierarch_distribute_work( T, J, size, params, symbols, comm_team, rank /* DEBUG */ );
+            Tens = multiply_2levels_foreman_hierarch_distribute_work( T, size, params, symbols, comm_team, rank /* DEBUG */ );
 
-           /* Send the result to the master */
+            /* Send the result to the master */
             
-           send_result( Tens, comm_foremen);
+            send_result( Tens, comm_foremen );
 
         } else {
             if( status.MPI_TAG == TAG_END ){
-                
+
                 /* The end: forward the signal to our workers */
                 
-                multiply_1level_foreman_hierarch_finalize( comm_team );
+                multiply_2levels_foreman_hierarch_finalize( comm_team );
                 return;
             } else {
                 std::cerr << "Wrong tag received on slave " << status.MPI_TAG << std::endl;
@@ -251,13 +248,17 @@ void multiply_1level_foreman_hierarch( tensor3D_t& T, matrix_int_t& J, int size,
     }
 }
 
-void multiply_1level_slave_hierarch( tensor3D_t& T, matrix_int_t& J, int size, MPI_Comm comm ) {
+/*******************************************************************************
+ *                                 Worker                                      *
+ *******************************************************************************/
+
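+/* Receives (a4, a2, a1, a6) parameter sets from its foreman, computes each
+   product with two_level2_product, and sends the result back as a linearized
+   expression (a zero length when the result is zero). */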
+void multiply_2levels_slave_hierarch( tensor3D_t& T, int size, MPI_Comm comm ) {
 
     gi::ex Tens;
-    int  a1, a2, a3, b1, b2, b3, c1, c2;
+    int  a1, a2, a4, a6;
     unsigned int len = 0;
     
-    parameters_2_t params;
+    parameters_2_2_t params;
     MPI_Status status;
     char* expr_c;
 
@@ -272,21 +273,16 @@ void multiply_1level_slave_hierarch( tensor3D_t& T, matrix_int_t& J, int size, M
     while( true ){
         /* Receive a set of parameters */
         
-        MPI_Recv( &params, 1, DT_PARAMETERS_2, ROOT, MPI_ANY_TAG, comm, &status );
+        MPI_Recv( &params, 1, DT_PARAMETERS_2_2, ROOT, MPI_ANY_TAG, comm, &status );
+
         
         if( status.MPI_TAG == TAG_WORK ){
-            a1 = params.a1;
+            a4 = params.a4;
             a2 = params.a2;
-            a3 = params.a3;
-            b1 = params.b1;
-            b2 = params.b2;
-            b3 = params.b3;
-            c1 = params.c1;
-            c2 = params.c2;
-
-            gi::symbol A( std::string( params.A  ) );
-            
-            Tens = one_level2_product( &T, &J, A, size, a1, a2, a3, b1, b2, b3, c1, c2 );
+            a1 = params.a1;
+            a6 = params.a6;
+
+            Tens = two_level2_product( &T, size, a4, a2, a1, a6 );
 
             if( Tens.is_zero() ) {
                 len = 0;
@@ -324,6 +320,152 @@ void multiply_1level_slave_hierarch( tensor3D_t& T, matrix_int_t& J, int size, M
 
 }
 
+/*******************************************************************************
+ *                                 Master                                      *
+ *******************************************************************************/
+
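+/* Builds the (a4, a2) parameter sets, hands them out to the foremen on demand
+   (TAG_PULL / TAG_RES), accumulates the de-linearized partial results into Tens,
+   and sends TAG_END to each foreman once the work queue has been drained. */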
+gi::ex multiply_2levels_master( tensor3D_t& T, unsigned int size, MPI_Comm comm = MPI_COMM_WORLD ) { 
+    gi::ex Tens = 0;
+    unsigned int a2, a4;
+    gi::lst symbols;
+
+    MPI_Status status;
+    char* expr_c;
+    size_t expr_c_size = 0;
+    int src, np, running = 0;
+    unsigned int len;
+    parameters_2_1_t pzero( 0, 0 );
+
+    MPI_Comm_size( comm, &np );
+
+    expr_c = NULL;
+    expr_c = (char*) malloc( 3279 ); // TMP: initial buffer, reallocated below whenever an incoming result is longer
+
+    int receivedresults = 0;
+    unsigned int N = size/2;
+
+    std::vector<parameters_2_1_t> input;
+    std::vector<std::string> results_s;
+    std::vector<gi::ex> results;
+
+    /* Build a list of argument sets */
+    
+    for( a4 = 0 ; a4 < N ; a4++ ){
+        for( a2 = 0; a2 < N ; a2++ ){
+            parameters_2_1_t p( a4, a2 );
+            input.push_back( p );
+        }
+    }
+    
+    /* Compute the set of symbols */
+    /* Could be done while the first slave is working */
+    
+    symbols = all_symbols_3D( size );
+
+    /* Distribute the work */
+
+    while( input.size() > 0 ) {
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+       
+        if( status.MPI_TAG == TAG_PULL ) {
+
+            /* Nothing else will come: just send some work */
+            send_work( input, src, comm );
+            running++;
+            
+        } else {
+            if( status.MPI_TAG == TAG_RES ){
+                src = status.MPI_SOURCE;
+
+                /* The first message contains the length of what is coming next */
+                if( len != 0 ) {
+                    if( len > expr_c_size ) {
+                        expr_c_size = len;
+                        if( NULL != expr_c ) free( expr_c );
+                        expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+                    }
+                    
+                    /* Receive the result */
+                    MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+                        
+                    /* put it in the result queue */
+                    std::string s( expr_c );
+                    
+                    send_work( input, src, comm );
+                    
+                    /* Process what I have just received */
+                    /* Could be given to a slave... */
+                    gi::ex received = de_linearize_expression( s, symbols );
+                    Tens += received;
+#if DEBUG
+                    results.push_back( received );
+                    results_s.push_back( s );
+                    receivedresults++;
+#endif
+                } else {
+                    /* Send more work  */
+                    send_work( input, src, comm );
+                }                
+            } else {
+                std::cerr << "Wrong tag received " << status.MPI_TAG << std::endl;
+            }
+        }
+    }
+
+    /* Wait until everyone is done */
+
+    // running = np - 1; // all the foremen are running
+    while( running > 0 ) {
+
+        MPI_Recv( &len, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &status );
+        src = status.MPI_SOURCE;
+
+        if( len != 0 ) {
+            if( len > expr_c_size ) {
+                expr_c_size = len;
+                if( NULL != expr_c ) free( expr_c );
+                expr_c = (char*)malloc( expr_c_size ); // The \0 was added by the slave
+            }
+
+            /* Receive the result */
+            MPI_Recv( expr_c, len, MPI_CHAR, src, TAG_EXPR, comm, &status );
+            
+            /* And send the END signal */
+            send_end( src, pzero, comm );
+            running--;
+            
+            /* Process what I have just received */
+            /* Could be given to a slave... */
+            /* put it in the result queue */
+            std::string s( expr_c );
+            gi::ex received = de_linearize_expression( s, symbols );
+            Tens += received;
+#if DEBUG
+            results.push_back( received );
+            results_s.push_back( s );
+            receivedresults++;
+#endif
+        } else {
+            send_end( src, pzero, comm );
+            running--;
+        }
+    }
+    
+#if DEBUG
+    std::cout << "Received " << receivedresults << " results" << std::endl;
+
+    std::cout << "Tpara=" << Tens << ";" << std::endl;
+#endif
+    
+    if( NULL != expr_c) free( expr_c );
+    return Tens;
+}
+
 /* The master sends a parameter set to a foreman. 
    The foreman computes a set of parameter sets and distributes them to the workers.
    The foreman adds the results and, when everything is done, sends the total to the master. 
@@ -338,7 +480,7 @@ void multiply_1level_slave_hierarch( tensor3D_t& T, matrix_int_t& J, int size, M
    F -> M: Same
 */
         
-gi::ex multiply_1level_mw_hierarch( tensor3D_t& T, matrix_int_t& J, int size ) {  // simpler: same dimension everywhere
+gi::ex multiply_2levels_mw_hierarch( tensor3D_t& T, int size ) {  // simpler: same dimension everywhere
     int rank;
     gi::ex Tens = 0;
     MPI_Comm COMM_FOREMEN, COMM_TEAM;
@@ -370,14 +512,14 @@ gi::ex multiply_1level_mw_hierarch( tensor3D_t& T, matrix_int_t& J, int size ) {
     /* Here we go*/
     
     if( 0 == rank ) {
-        Tens = multiply_1level_master( T, J, size, COMM_FOREMEN );
+        Tens = multiply_2levels_master( T, size, COMM_FOREMEN );
     } else {
         int rank_foreman;
         MPI_Comm_rank( COMM_TEAM, &rank_foreman );
         if( 0 == rank_foreman ) {
-            multiply_1level_foreman_hierarch( T, J, size, COMM_FOREMEN, COMM_TEAM );
+            multiply_2levels_foreman_hierarch( T, size, COMM_FOREMEN, COMM_TEAM );
         } else {
-            multiply_1level_slave_hierarch( T, J, size, COMM_TEAM );
+            multiply_2levels_slave_hierarch( T, size, COMM_TEAM );
         }
     }
 
diff --git a/src/tensormatrix.cpp b/src/tensormatrix.cpp
index 77fdc2bf5df1bcb7a640b00821f8846c12eeefd3..eeba3e67a4e2fb28fb7e3a98c7ae146cf1202dc5 100644
--- a/src/tensormatrix.cpp
+++ b/src/tensormatrix.cpp
@@ -1,7 +1,6 @@
 #include <iostream>
 #include <cstdlib>
 #include <ginac/ginac.h>
-//#include <starpu.h>
 
 #define N 4
 
@@ -18,7 +17,7 @@ typedef std::vector<int> vector_int_t;
 
 /*
   libginac-dev ginac-tools
-  g++ -O3 -Wall -o tensormatrix tensormatrix.cpp -lginac
+  g++ -std=c++11 -O3 -Wall -o tensormatrix tensormatrix.cpp -lginac
 */
 /* Sequentiel sur Minimum
 real	3m31,034s
diff --git a/src/tensormatrix.h b/src/tensormatrix.h
index a21dca21e0d20aa55e93ab8e2fde7c34a2fd49c5..dd936bfd53c439327a22d0adff267281484c6a9f 100644
--- a/src/tensormatrix.h
+++ b/src/tensormatrix.h
@@ -25,7 +25,7 @@ gi::ex multiply_2levels( tensor3D_t&, int );
 // parallel
 gi::ex multiply_1level_mw( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave( tensor3D_t&, int );
-gi::ex multiply_1level_mw_hierarch( tensor3D_t&, matrix_int_t&, int );
+gi::ex multiply_2levels_mw_hierarch( tensor3D_t&, int );
 gi::ex multiply_combined( tensor3D_t&, int );
 
 /*******************************************************************************
diff --git a/src/tensormatrix_mpi.cpp b/src/tensormatrix_mpi.cpp
index 39a89de53e3dd0a15bc11cc1341d0c9fe270a067..9b42408df4429210ae39695fb15e936556e79945 100644
--- a/src/tensormatrix_mpi.cpp
+++ b/src/tensormatrix_mpi.cpp
@@ -114,7 +114,7 @@ int main( int argc, char** argv ){
                 break;
             }
             if( argc >= 4 ) {
-                nbforemen = atoi( argv[3] );  /* these two variables are obtained from the same CI parameter but since they are */
+                nbforemen = atoi( argv[3] );  /* these two variables are obtained from the same CLI parameter but since they are */
                 maxresult = atoi( argv[3] );  /* used in two functions for two purposes, use two different names                */
             }
         }
@@ -138,9 +138,9 @@ int main( int argc, char** argv ){
     case 'a':
         Tpara = multiply_1level_mw_addslave( T, N );
         break;
-        /*case 'h':
-        Tpara = multiply_1level_mw_hierarch( T, J, N );
-        break;*/
+    case 'h':
+        Tpara = multiply_2levels_mw_hierarch( T, N );
+        break;
     case 'c':
         Tpara = multiply_combined( T, N );
         break;
diff --git a/src/utils_parall.cpp b/src/utils_parall.cpp
index 64c7beccccb51340839383a80d5cbf99f2015be7..3ce59790315815f7455f2601b32f1519ca5c8d26 100644
--- a/src/utils_parall.cpp
+++ b/src/utils_parall.cpp
@@ -31,9 +31,11 @@ parameters_2_1_t::parameters_2_1_t( unsigned int a4, unsigned int a2 ){
     this->a2 = a2;
 }
 
-parameters_2_2_t::parameters_2_2_t( unsigned int a4, unsigned int a2 ){
-    this->a6 = a6;
+parameters_2_2_t::parameters_2_2_t( unsigned int a4, unsigned int a2, unsigned int a1, unsigned int a6 ){
+    this->a4 = a4;
+    this->a2 = a2;
     this->a1 = a1;
+    this->a6 = a6;
 }
 
 void create_parameters_datatye(){
@@ -46,8 +48,8 @@ void create_parameters_datatye_2_1(){
     MPI_Type_commit( &DT_PARAMETERS_2_1 );
 }
 
-void create_parameters_datatye_2(){
-    MPI_Type_contiguous( 2, MPI_UNSIGNED, &DT_PARAMETERS_2_2 );
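+/* Matches the four unsigned int fields of parameters_2_2_t */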
+void create_parameters_datatye_2_2(){
+    MPI_Type_contiguous( 4, MPI_UNSIGNED, &DT_PARAMETERS_2_2 );
     MPI_Type_commit( &DT_PARAMETERS_2_2 );
 }
 
@@ -79,6 +81,11 @@ void send_end( int peer, MPI_Comm comm ) {
     MPI_Send( &para, 1, DT_PARAMETERS, peer, TAG_END, comm );
 }
 
+void send_end( int peer, parameters_2_1_t p, MPI_Comm comm ) {
+    /* The parameters_2_1_t argument is not used, but needed to distinguish between the overloads */
+    MPI_Send( &p, 1, DT_PARAMETERS_2_1, peer, TAG_END, comm );
+}
+
 void send_end_batch( int peer, MPI_Comm comm ) {
     parameters_t para;
     MPI_Send( &para, 1, DT_PARAMETERS_2_1, peer, TAG_END_BATCH, comm );
@@ -98,6 +105,12 @@ void send_work( std::vector<parameters_2_1_t>& input, int peer, MPI_Comm comm ){
     MPI_Send( &para, 1, DT_PARAMETERS_2_1, peer, TAG_WORK, comm );
 }
 
+void send_work( std::vector<parameters_2_2_t>& input, int peer, MPI_Comm comm ){
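+    /* F -> W: pop the next parameter set off the work queue and send it with TAG_WORK */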
+    parameters_2_2_t para = input.back();
+    input.pop_back();
+    MPI_Send( &para, 1, DT_PARAMETERS_2_2, peer, TAG_WORK, comm );
+}
+
 /* M -> W: Send a set of expressions to be added */
 
 void send_expressions_to_add( std::vector<std::string>& results, int peer ) {
diff --git a/src/utils_parall.h b/src/utils_parall.h
index bf9ebb22cd28e85d744d53c5241e45fe6237ad3e..a0083e88a1e7e5684c8f98a74a9fbb17fb030cea 100644
--- a/src/utils_parall.h
+++ b/src/utils_parall.h
@@ -21,10 +21,11 @@ public:
     parameters_2_1_t( unsigned int, unsigned int );
     parameters_2_1_t( void ){};
 };
+
 class parameters_2_2_t{
 public:
-    unsigned int a6, a1;
-    parameters_2_2_t( unsigned int, unsigned int );
+    unsigned int a4, a2, a6, a1;
+    parameters_2_2_t( unsigned int, unsigned int, unsigned int, unsigned int );
     parameters_2_2_t( void ){};
 };
 
@@ -38,12 +39,14 @@ gi::ex de_linearize_expression( std::string, gi::lst );
 
 void send_work( std::vector<parameters_t>& input, int peer, MPI_Comm comm = MPI_COMM_WORLD );
 void send_work( std::vector<parameters_2_2_t>& input, int peer, MPI_Comm comm = MPI_COMM_WORLD );
+void send_work( std::vector<parameters_2_1_t>& input, int peer, MPI_Comm comm = MPI_COMM_WORLD );
 
 void send_expressions_to_add( std::vector<std::string>&, int );
 void send_add_or_end_addslave(  std::vector<std::string>&, int, int* );
 void send_work_addslave(  std::vector<parameters_t>&, std::vector<std::string>&, int ) ;
 void send_result( gi::ex T, MPI_Comm comm = MPI_COMM_WORLD );
 void send_end( int peer, MPI_Comm comm = MPI_COMM_WORLD );
+void send_end( int peer, parameters_2_1_t p, MPI_Comm comm = MPI_COMM_WORLD );
 void send_end_batch( int peer, MPI_Comm comm = MPI_COMM_WORLD );
 
 void create_parameters_datatye( void );