diff --git a/src/Makefile b/src/Makefile
index d9a4393e3c0d2d4a07e46d7a026a44f62dda19ed..ac171997860a93c588bcda956bf1d9d09f773169 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,7 +28,8 @@ NP = 5
 
 MPISRC = masterworker.cpp mw_addslave.cpp hierarchical.cpp  \
          perf.cpp  sequential.cpp  tensormatrix_mpi.cpp      \
-         utils.cpp  utils_parall.cpp profiling.cpp mw_combined.cpp
+         utils.cpp  utils_parall.cpp profiling.cpp mw_combined.cpp \
+         masterworker2.cpp mw_addslave2.cpp
 
 MPIOBJ= $(MPISRC:.cpp=.o)
 
diff --git a/src/hierarchical.cpp b/src/hierarchical.cpp
index 87168ffc7e0e8bdaedd12b529563d433a3677d58..64b3039b2ff64dd73274d5a7062bddd9706e43fd 100644
--- a/src/hierarchical.cpp
+++ b/src/hierarchical.cpp
@@ -501,9 +501,9 @@ gi::ex multiply_2levels_mw_hierarch( tensor3D_t& T, int size ) {  // simpler: sa
 
     /* Create new datatypes for the parameters */
 
-    create_parameters_datatye();
-    create_parameters_datatye_2_1();
-    create_parameters_datatye_2_2();
+    create_parameters_datatype();
+    create_parameters_datatype_2_1();
+    create_parameters_datatype_2_2();
 
     /* Create the communicators */
     
diff --git a/src/masterworker.cpp b/src/masterworker.cpp
index 2bfccc3e2e8e456016fefd42a9a1cd1508403b8b..5bbb605e555d771d456467cb56bb80fa296808d6 100644
--- a/src/masterworker.cpp
+++ b/src/masterworker.cpp
@@ -211,7 +211,7 @@ gi::ex multiply_1level_mw( tensor3D_t& T, int size ) {  // simpler: same dimensi
 
     /* Create a new datatype for the parameters */
     
-    create_parameters_datatye();
+    create_parameters_datatype();
 
     /* Here we go */
     
diff --git a/src/mw_addslave.cpp b/src/mw_addslave.cpp
index d74de64ce288d77235cf31fbca10f5f2d28492d2..91b8a04432cc06f56bf479388fbbdb9d1ce20d9d 100644
--- a/src/mw_addslave.cpp
+++ b/src/mw_addslave.cpp
@@ -224,7 +224,7 @@ gi::ex multiply_1level_mw_addslave( tensor3D_t& T, int size ) {  // simpler: sam
 
     /* Create a new datatype for the parameters */
     
-    create_parameters_datatye();
+    create_parameters_datatype();
 
     /* Here we go */
 
diff --git a/src/mw_combined.cpp b/src/mw_combined.cpp
index 663eda0cd765851901bb00b5c0e5359bc82ba4ce..aaaec877dedfcc61a5ab35dae3e39af0c23f122e 100644
--- a/src/mw_combined.cpp
+++ b/src/mw_combined.cpp
@@ -341,7 +341,7 @@ gi::ex multiply_combined( tensor3D_t& T, int size ) {  // simpler: same dimensio
 
     /* Create a new datatype for the parameters */
     
-    create_parameters_datatye();
+    create_parameters_datatype();
 
     /* Here we go */
     
diff --git a/src/products.h b/src/products.h
index 3d54d90925041b3096c64ac4db7963e66bb8467f..d03b3ce34b80973648a0beea07e9b75a89f60cd2 100644
--- a/src/products.h
+++ b/src/products.h
@@ -8,6 +8,7 @@ namespace gi = GiNaC;
 
 // internal (sequential) routines
 gi::ex one_level1_product( tensor3D_t*, int, int, int, int );
+gi::ex one_level1_product( tensor3D_t*, int, int );
 gi::ex one_level2_product( tensor3D_t*, int, int, int, int, int );
 gi::ex two_level1_product( tensor3D_t*, int, int, int );
 gi::ex two_level2_product( tensor3D_t*, int, int, int, int, int );
diff --git a/src/sequential.cpp b/src/sequential.cpp
index 54fb8be5e7327e1dc3fa84cc067f6859fe62ad33..d6337b2d113d7eb91ab06c91a036aae0b996773f 100644
--- a/src/sequential.cpp
+++ b/src/sequential.cpp
@@ -241,6 +241,111 @@ gi::ex one_level1_product( tensor3D_t* T, int size, int a4, int a2, int a1 ){
     return Tens;
 }
 
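+/* Coarse-grain variant of one_level1_product: the caller fixes only a4,
+   and the sums over a2, a6, a1, a5 and a3 are all performed locally. */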
+gi::ex one_level1_product( tensor3D_t* T, int size, int a4 ){
+
+    gi::ex Tens = 0;
+    gi::ex Ti0, Ti1, Ti2;
+    gi::ex W1, W2, W3, W4, W5, W6, W7;
+    gi::ex Z1, Z2, Z6, t5, tE, t1, t12, t123, t126, t13, t134, t14, t16, t2, t23, t24, t26, t3, t4, X7Y5;
+    gi::ex TE, T1, T2, T3, T4, T5, T12, T13, T14, T16, T23, T24, T26, T123, T126, T134;
+    const char timerB[] = "B";
+    
+    int a1, a2, a3, a5, a6;
+    int A1, A2, A3, A4, A5, A6;
+    TE = T1 = T2 = T3 = T4 = T5 = T12 = T13 = T14 = T16 = T23 = T24 = T26 = T123 = T126 = T134 = 0;
+    Ti0 = Ti1 = Ti2 = 0;
+    
+    int N = size/2;
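+    /* Indices run over the low half [0,N); Ai = ai + N addresses the upper half */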
+    
+    A4 = a4 + N;
+    for( a2 = 0 ; a2 < N ; a2++ ) {
+        A2 = a2 + N;
+        Ti2 = 0;
+        for( a6 = 0 ; a6 < N ; a6++ ) {
+            A6 = a6 + N;
+            
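+            /* Hoist the T entries that are invariant over the inner a1/a5/a3 loops */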
+            W1 = (*T)[a4][a2][a6];
+            W2 = (*T)[a4][A2][a6];
+            W3 = (*T)[a4][a2][A6];
+            W4 = (*T)[A4][A2][a6];
+            W5 = (*T)[a4][A2][A6];
+            W6 = (*T)[A4][a2][A6];
+            W7 = (*T)[A4][A2][A6];
+            
+            Ti1 = 0;
+            for( a1 = 0 ; a1 < N ; a1++ ) {
+                A1 = a1 + N;
+                Ti0 = TE = T12 = T13 = T14 = T16 = T23 = T24 = T26 = T1 = T2 = T3 = T4 = T5 = T123 = T126 = T134 = 0;
+                for( a5 = 0 ; a5 < N ; a5++ ) {
+                    A5 = a5 + N;
+                    Z1 = (*T)[a1][a5][a6];
+                    Z2 = (*T)[A1][a5][a6];
+                    Z6 = (*T)[A1][a5][A6];
+                    t5 = W3*(*T)[a1][A5][a6];
+                    tE = W4*(*T)[A1][A5][A6];
+                    t1 = W3*Z2;
+                    t13 = t1;
+                    t2 = W5*Z1;
+                    t23 = t2;
+                    t3 = W3*Z1;
+                    t4 = W6*Z1;
+                    t12 = W5*Z2;
+                    t14 = W6*Z2;
+                    t134 = t14;
+                    t16 = W1*Z6;
+                    t24 = W7*Z1;
+                    t26 = W2*(*T)[a1][a5][A6];
+                    t123 = W5*Z2;
+                    t126 = W2*Z6;
+                    
+                    for( a3 = 0 ; a3 < N ; a3++ ) {
+                        A3 = a3 + N;
+                        TE = TE + tE*(*T)[a1][a2][a3]*(*T)[a4][a5][A3];
+                        T5 = T5 + t5*(*T)[A1][A2][A3]*(*T)[A4][a5][a3];
+                        X7Y5 = (*T)[a1][A2][A3]*(*T)[A4][A5][a3];
+                        T1 = T1 + t1*X7Y5;
+                        T16 = T16 + t16*X7Y5;
+                        T2 = T2 + t2*(*T)[A1][a2][A3]*(*T)[A4][A5][a3];
+                        T3 = T3 + t3*(*T)[A1][A2][a3]*(*T)[A4][A5][A3];
+                        T4 = T4 + t4*(*T)[A1][A2][A3]*(*T)[a4][A5][a3];
+                        T12 = T12 + t12*(*T)[a1][a2][A3]*(*T)[A4][A5][a3];
+                        T13 = T13 + t13*(*T)[a1][A2][a3]*(*T)[A4][A5][A3];
+                        T14 = T14 + t14*(*T)[a1][A2][A3]*(*T)[a4][A5][a3];
+                        T23 = T23 + t23*(*T)[A1][a2][a3]*(*T)[A4][A5][A3];
+                        T24 = T24 + t24*(*T)[A1][a2][A3]*(*T)[a4][A5][a3];
+                        T26 = T26 + t26*(*T)[A1][a2][A3]*(*T)[A4][A5][a3];
+                        T123 = T123 + t123*(*T)[a1][a2][a3]*(*T)[A4][A5][A3];
+                        T126 = T126 + t126*(*T)[a1][a2][A3]*(*T)[A4][A5][a3];
+                        T134 = T134 + t134*(*T)[a1][A2][a3]*(*T)[a4][A5][A3];
+                    }
+                    Ti0 += 4*( TE + T12 + T13 + T14 + T16 + T23 + T24 + T26 - (T1 + T2 + T3 + T4 + T5 + T123 + T126 + T134) );
+                }
+                Ti1 += Ti0;
+            }
+            Ti2 += Ti1;
+        }
+        Tens += Ti2;
+    }
+
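+    /* Debug output (disabled): append this partial result to a per-process file */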
+#if 0
+    std::ostringstream oss;
+    oss << "output_" << getpid();
+    std::ofstream fd;
+    fd.open( oss.str(), std::ios::app );
+    fd << "T" << a4 << "=" << Tens << ";";
+    //    fd << " with " << A << " " << a1 <<  " " << a2 <<  " " << a3 <<  " " << b1;
+    fd << std::endl;
+    fd.close();
+#endif
+    
+    return Tens;
+}
+
 gi::ex one_level2_product( tensor3D_t* T, int size, int a4, int a2, int a1, int a6 ){
 
     int a3, a5;
diff --git a/src/tensormatrix.h b/src/tensormatrix.h
index dd936bfd53c439327a22d0adff267281484c6a9f..889933220ea4bef8a4e19004cb1eefde091b3f80 100644
--- a/src/tensormatrix.h
+++ b/src/tensormatrix.h
@@ -24,7 +24,9 @@ gi::ex multiply_1level( tensor3D_t&, int );
 gi::ex multiply_2levels( tensor3D_t&, int );
 // parallel
 gi::ex multiply_1level_mw( tensor3D_t&, int );
+gi::ex multiply_1level_mw2( tensor3D_t&, int );
 gi::ex multiply_1level_mw_addslave( tensor3D_t&, int );
+gi::ex multiply_1level_mw_addslave2( tensor3D_t&, int );
 gi::ex multiply_2levels_mw_hierarch( tensor3D_t&, int );
 gi::ex multiply_combined( tensor3D_t&, int );
 
diff --git a/src/tensormatrix_mpi.cpp b/src/tensormatrix_mpi.cpp
index 9b42408df4429210ae39695fb15e936556e79945..b0d949978106e4e5f734f06b455ace7c9f2a0b47 100644
--- a/src/tensormatrix_mpi.cpp
+++ b/src/tensormatrix_mpi.cpp
@@ -28,7 +28,11 @@ namespace gi = GiNaC;
    tensormatrix_mpi [N] [Function name] [Nb of foremen]
    Function names being: 
    - M/m: Master-Worker -> multiply_1level_mw
+   - N/n: Master-Worker, coarser grain -> multiply_1level_mw2
    - A/a: Master-Worker, addition on a slave -> multiply_1level_mw_addslave
+   - B/b: Master-Worker, coarser grain, addition on a slave -> multiply_1level_mw_addslave2
-   - H/h: Hierarchical master-worker -> multiply_1level_mw_hierarch
+   - H/h: Hierarchical master-worker -> multiply_2levels_mw_hierarch
    - C/c: Combined -> multiply_combined
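+   Example (illustrative): mpirun -np 5 ./tensormatrix_mpi 16 b
+   computes the size-16 product with the coarser-grain addslave version.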
 */
@@ -44,6 +46,7 @@ real	3m31,034s
 MPI_Datatype DT_PARAMETERS;
 MPI_Datatype DT_PARAMETERS_2_1;
 MPI_Datatype DT_PARAMETERS_2_2;
+MPI_Datatype DT_PARAMETERS_S;
 
 unsigned int nbforemen = NBFOREMEN;     /* Number of foremen to use with the hierarchical M/W */
 unsigned int maxresult = MAXRESULT;     /* Maximum results in the result queue, addslave version */
@@ -89,10 +92,18 @@ int main( int argc, char** argv ){
             case 'm':
                 tostart = 'm';
                 break;
+            case 'N':
+            case 'n':
+                tostart = 'n';
+                break;
             case 'A':
             case 'a':
                 tostart = 'a';
                 break;
+            case 'B':
+            case 'b':
+                tostart = 'b';
+                break;
             case 'H':
             case 'h':
                 tostart = 'h';
@@ -135,9 +146,15 @@ int main( int argc, char** argv ){
     case 'm':
         Tpara = multiply_1level_mw( T, N );
         break;
+    case 'n':
+        Tpara = multiply_1level_mw2( T, N );
+        break;
     case 'a':
         Tpara = multiply_1level_mw_addslave( T, N );
         break;
+    case 'b':
+        Tpara = multiply_1level_mw_addslave2( T, N );
+        break;
     case 'h':
         Tpara = multiply_2levels_mw_hierarch( T, N );
         break;
diff --git a/src/utils_parall.cpp b/src/utils_parall.cpp
index 3ce59790315815f7455f2601b32f1519ca5c8d26..d50d750f60098cf7d928b5f0138d04a82714c2d2 100644
--- a/src/utils_parall.cpp
+++ b/src/utils_parall.cpp
@@ -26,6 +26,10 @@ parameters_t::parameters_t( unsigned int a4,  unsigned int a2, unsigned int a1 )
     this->a1 = a1;
 }
 
+parameters_s_t::parameters_s_t( unsigned int a4 ){
+    this->a4 = a4;
+}
+
 parameters_2_1_t::parameters_2_1_t( unsigned int a4, unsigned int a2 ){
     this->a4 = a4;
     this->a2 = a2;
@@ -38,17 +42,22 @@ parameters_2_2_t::parameters_2_2_t( unsigned int a4, unsigned int a2, unsigned i
     this->a6 = a6;
 }
 
-void create_parameters_datatye(){
+void create_parameters_datatype(){
     MPI_Type_contiguous( 3, MPI_UNSIGNED, &DT_PARAMETERS );
     MPI_Type_commit( &DT_PARAMETERS );
 }
 
-void create_parameters_datatye_2_1(){
+void create_parameters_datatype_s(){
+    MPI_Type_contiguous( 1, MPI_UNSIGNED, &DT_PARAMETERS_S );
+    MPI_Type_commit( &DT_PARAMETERS_S );
+}
+
+void create_parameters_datatype_2_1(){
     MPI_Type_contiguous( 2, MPI_UNSIGNED, &DT_PARAMETERS_2_1 );
     MPI_Type_commit( &DT_PARAMETERS_2_1 );
 }
 
-void create_parameters_datatye_2_2(){
+void create_parameters_datatype_2_2(){
     MPI_Type_contiguous( 4, MPI_UNSIGNED, &DT_PARAMETERS_2_2 );
     MPI_Type_commit( &DT_PARAMETERS_2_2 );
 }
@@ -65,6 +74,10 @@ void free_parameters_2_2_dt( ){
     MPI_Type_free( &DT_PARAMETERS_2_2 );
 }
 
+void free_parameters_s_dt( ){
+    MPI_Type_free( &DT_PARAMETERS_S );
+}
+
 gi::ex add_expressions( std::vector<std::string> expressions, gi::lst symbols ) {
     gi::ex Tens = 0;
     for( auto s: expressions  ) {
@@ -82,10 +95,15 @@ void send_end( int peer, MPI_Comm comm ) {
 }
 
 void send_end( int peer, parameters_2_1_t p, MPI_Comm comm ) {
-    /* The parameters_2_1_t argument is not used, but needed to instinguish between functions */
+    /* The parameters_2_1_t argument is not used, but needed to distinguish between functions */
     MPI_Send( &p, 1, DT_PARAMETERS_2_1, peer, TAG_END, comm );
 }
 
+void send_end( int peer, parameters_s_t p, MPI_Comm comm ) {
+    /* The parameters_s_t argument is not used, but needed to distinguish between functions */
+    MPI_Send( &p, 1, DT_PARAMETERS_S, peer, TAG_END, comm );
+}
+
 void send_end_batch( int peer, MPI_Comm comm ) {
     parameters_t para;
     MPI_Send( &para, 1, DT_PARAMETERS_2_1, peer, TAG_END_BATCH, comm );
@@ -111,6 +129,12 @@ void send_work( std::vector<parameters_2_2_t>& input, int peer, MPI_Comm comm ){
     MPI_Send( &para, 1, DT_PARAMETERS_2_2, peer, TAG_WORK, comm );
 }
 
+void send_work( std::vector<parameters_s_t>& input, int peer, MPI_Comm comm ){
+    parameters_s_t para = input.back();
+    input.pop_back();
+    MPI_Send( &para, 1, DT_PARAMETERS_S, peer, TAG_WORK, comm );
+}
+
 /* M -> W: Send a set of expressions to be added */
 
 void send_expressions_to_add( std::vector<std::string>& results, int peer ) {
@@ -156,6 +180,19 @@ void send_add_or_end_addslave(  std::vector<std::string>& results, int peer, int
     }
 }
 
+void send_add_or_end_addslave(  std::vector<std::string>& results, int peer, int* running, parameters_s_t p ){
+    
+    /* Do I have a lot of results to be treated in the result queue? */
+
+    if( results.size() > maxresult ) {
+        /* if the result queue is too big, send it */
+        send_expressions_to_add( results, peer );
+    } else {
+        send_end( peer, p );
+        (*running)--;
+    }
+}
+
 /* M -> W: Send work: either a set of expressions to add, or a parameter set */
 
 void send_work_addslave(  std::vector<parameters_t>& input, std::vector<std::string>& results, int peer ) {
@@ -168,6 +205,16 @@ void send_work_addslave(  std::vector<parameters_t>& input, std::vector<std::str
     }    
 }
 
+void send_work_addslave(  std::vector<parameters_s_t>& input, std::vector<std::string>& results, int peer ) {
+
+    if( results.size() > maxresult ) {
+        /* if the result queue is too big, send it */
+        send_expressions_to_add( results, peer );
+    } else {
+        send_work( input, peer );
+    }    
+}
+
 /* W -> M: send the result of a computation */
 
 void send_result( gi::ex T, MPI_Comm comm ){
diff --git a/src/utils_parall.h b/src/utils_parall.h
index a0083e88a1e7e5684c8f98a74a9fbb17fb030cea..8c438ce08273db28eeb6e1d3149ca25dd1e6542d 100644
--- a/src/utils_parall.h
+++ b/src/utils_parall.h
@@ -15,6 +15,14 @@ public:
     parameters_t( void ){};
 };
 
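+/* Single-index parameter block for the coarse-grain M/W versions: only a4 is sent */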
+class parameters_s_t{
+public:
+    unsigned int a4;
+    parameters_s_t( unsigned int );
+    parameters_s_t( void ){};
+};
+
 class parameters_2_1_t{
 public:
     unsigned int a4, a2;
@@ -40,21 +47,27 @@ gi::ex de_linearize_expression( std::string, gi::lst );
 void send_work( std::vector<parameters_t>& input, int peer, MPI_Comm comm = MPI_COMM_WORLD );
 void send_work( std::vector<parameters_2_2_t>& input, int peer, MPI_Comm comm = MPI_COMM_WORLD );
 void send_work( std::vector<parameters_2_1_t>& input, int peer, MPI_Comm comm );
+void send_work( std::vector<parameters_s_t>& input, int peer, MPI_Comm comm = MPI_COMM_WORLD );
 
 void send_expressions_to_add( std::vector<std::string>&, int );
 void send_add_or_end_addslave(  std::vector<std::string>&, int, int* );
+void send_add_or_end_addslave(  std::vector<std::string>&, int, int*, parameters_s_t );
 void send_work_addslave(  std::vector<parameters_t>&, std::vector<std::string>&, int ) ;
+void send_work_addslave(  std::vector<parameters_s_t>&, std::vector<std::string>&, int ) ;
 void send_result( gi::ex T, MPI_Comm comm = MPI_COMM_WORLD );
 void send_end( int peer, MPI_Comm comm = MPI_COMM_WORLD );
 void send_end( int peer, parameters_2_1_t p, MPI_Comm comm = MPI_COMM_WORLD );
+void send_end( int peer, parameters_s_t p, MPI_Comm comm = MPI_COMM_WORLD );
 void send_end_batch( int peer, MPI_Comm comm = MPI_COMM_WORLD );
 
-void create_parameters_datatye( void );
-void create_parameters_datatye_2_1( void );
-void create_parameters_datatye_2_2( void );
+void create_parameters_datatype( void );
+void create_parameters_datatype_s( void );
+void create_parameters_datatype_2_1( void );
+void create_parameters_datatype_2_2( void );
 void free_parameters_dt( void );
 void free_parameters_2_1_dt( void );
 void free_parameters_2_2_dt( void );
+void free_parameters_s_dt( void );
 
 gi::ex add_expressions( std::vector<std::string>, gi::lst );
 
@@ -65,6 +78,7 @@ gi::ex add_expressions( std::vector<std::string>, gi::lst );
 extern MPI_Datatype DT_PARAMETERS;
 extern MPI_Datatype DT_PARAMETERS_2_1;
 extern MPI_Datatype DT_PARAMETERS_2_2;
+extern MPI_Datatype DT_PARAMETERS_S;
 
 extern unsigned int nbforemen;     /* Number of foremen to use with the hierarchical M/W */
 extern unsigned int maxresult;     /* Maximum results in the result queue, addslave version */