/**************************************************************************
*
*   Subroutine P11 performs the matrix multiplication C = A x B
*
*   Parameters:
*
*    Provided by calling routine:
*	A	=  Matrix of multiple precision numbers
*	B	=  Matrix of multiple precision numbers
*	MODULUS =  Large integer modulus, expressed as an array of integers
*	Q	=  The size of the grid of PEs, Q x Q
*	M	=  The size of the sub-matrices on each PE
*	NUMSIZE =  The number of words used to store an MP integer
*
*    Returned by this routine:
*	C	=  Matrix of multiple precision numbers
*
***************************************************************************
*
*    This version for non-shared-memory systems uses Fox`s algorithm for
*    matrix multiplication.
*
**************************************************************************/
/* CVS info   */
/* $Date: 2005/09/19 18:45:08 $     */
/* $Revision: 1.3 $ */
/* $RCSfile: p11.c,v $  */
/* $Name: rel_5 $     */

#include "bench11.h"

#include <mpi.h>

static char cvs_info[] = "BMARKGRP $Date: 2005/09/19 18:45:08 $ $Revision: 1.3 $ $RCSfile: p11.c,v $ $Name: rel_5 $"; 


// use macros to address 3-dim arrays with variable dimension

#define c(I,J,K)	c[ (K) + numsize * ( (J) + m * (I) ) ]
#define la(I,J,K)	la[ (K) + numsize * ( (J) + m * (I) ) ]
#define lb(I,J,K)	lb[ (K) + numsize * ( (J) + m * (I) ) ]
#define lc(I,J,K)	lc[ (K) + numsize * ( (J) + m * (I) ) ]


void p11 ( mplong a[], mplong b[], mplong c[], mplong modulus[],
	   int q, int m, int numsize )
{

   // Multiply into tempmp, accumulate sums in tempc
   mplong tempmp [ 2*maxnumsize ];
   mplong tempc  [ 2*maxnumsize ];

   int i, j, abcsize, tempsize;

   void mult();

   // tempa, otherb are for shuffling submatrices for Fox's algorithm
   // multa, multb pointers passed to mult

   mplong * sendb , * recvb , * multa , * multb;
   mplong * tempa , * otherb;

   int row, col, root, whichb, step, matrixsize, sendsize;
   int sour, dest, tag, ierror;

   extern int npes, mype;

   MPI_Status status;
   MPI_Request request;
   MPI_Comm world = MPI_COMM_WORLD;
   MPI_Comm row_comm;

   if ( q > 1 )
   {
      matrixsize = numsize * m * m;
      tempa  = (mplong *) malloc ( 8 * matrixsize );
      otherb = (mplong *) malloc ( 8 * matrixsize );
      row = mype / q;
      col = mype % q;

      // make row communicator for broadcasting A across row
      MPI_Comm_split( world, row, col, &row_comm );

      tag = 8;
      whichb = 0;
   }

   abcsize    = numsize - MPHL;
   tempsize   = 2*maxnumsize - MPHL;

   // initialize c to hold product a*b

   for ( i = 0; i < m; i++ )
   {
      for ( j = 0; j < m; j++ )
      {
         MPINIT ( &c(i,j,0), abcsize, 0 );
      }
   }

   // initialize temporaries
   MPINIT ( tempmp, tempsize, 0 );

   if (q == 1)
   {
      // M = N, ordinary multiply on 1 processor for timing comparison

      mult ( a, b, c, modulus, numsize, m, tempc, tempmp, tempsize );
   }
   else
   {
      // q > 1 -- Fox's algorithm takes q steps

      sendsize = matrixsize * intspermplong;

      for ( step = 0; step < q; step++ )
      {
         // Spread a subblock of A across a row of PEs, then multiply
         // PE's on main diagonal send; next step, diag moves to right

         root = (row + step) % q;

         MPI_Barrier(world);

         if ( col == root )
         {
            // Each PE on curr diag broadcasts its A, multiplies its A

            MPI_Bcast ( a,     sendsize, MPI_INT, root, row_comm );

            multa = a;
         }
         else
         {
            // Each PE not on curr diag receives broadcast into tempa,
            // multiplies tempa

            MPI_Bcast ( tempa, sendsize, MPI_INT, root, row_comm );

            multa = tempa;
         }

         // active version of B switches between B and otherB
         if ( whichb == 0 )
            multb = b;
         else
            multb = otherb;

         // multiply appropriate current subblocks

         mult ( multa, multb, c, modulus, numsize, m, tempc, tempmp, tempsize );

         if ( step < q-1 )

         {
            // Rotate rows of sub-matrices up one (B->otherB or vice versa)
            // Each PE receives from below and sends to above

            dest = ( mype + q * (q-1) ) % npes;
            sour = ( mype + q         ) % npes;

            if ( whichb == 0 )
            {
               sendb = b;
               recvb = otherb;
            }
            else
            {
               sendb = otherb;
               recvb = b;
            }

            MPI_Barrier(world);

            MPI_Irecv ( recvb, sendsize, MPI_INT, sour, tag, world, &request);

            MPI_Send  ( sendb, sendsize, MPI_INT, dest, tag, world );

            MPI_Wait  ( &request, &status );

            MPI_Barrier(world);

            // Switch to "other" B - toggle between B, otherb in mult
            whichb = 1 - whichb;

         }  // if ( step < q-1 )

      } // loop on step to q-1

      free ( tempa );
      free ( otherb );

   } // q > 1

}  // p11


// Single subr to handle 4 combinations (A or tempA) X (B or otherB)

void mult ( mplong la[], mplong lb[], mplong lc[], mplong modulus[],
	   int numsize, int m, mplong tempc[], mplong tempmp[], int tempsize )
{
   int i, j, k;
// extern int VERBOSE;

#ifdef GMP
   // Must fix up pointer word since most A's and all B's have been moved
   for ( i = 0; i < m; i++ )
      for ( j = 0; j < m; j++ )
      {
         la(i,j,MPPTR) = (mplong) & la(i,j,MPDATA);
         lb(i,j,MPPTR) = (mplong) & lb(i,j,MPDATA);
      }
#endif

   for ( i = 0; i < m; i++ )
      for ( j = 0; j < m; j++ )
      {
         // zero tempc - accumulate sum during loop
         // size = 0 is enough
         MPINIT ( tempc, tempsize, 0 );

/* VERBOSE = 2; */	/* it makes MPUTIL print lots of debug info */

         for ( k = 0; k < m; k++ )
         {
            mpmult ( tempmp, &la(i,k,0), &lb(k,j,0) );
            mpadd  ( tempc, tempc, tempmp );
         }

	 /*
          * Accumulate sums in lc(i,j) -- every entry has Q contributions
          * (one for each time P11 calls MULT)
          * With Q-1 extra calls to mpmod, C can be (numsize.. not (2*numsize..
          * Work in double-size tempc, then copy back into single-size lc(i,j)
          */

         mpadd ( tempc, tempc, &lc(i,j,0) );

         mpmod ( tempc, tempc, modulus );

#ifdef GMP
         mpset ( &lc(i,j,0), tempc );
#else  // MPUTIL
         lc(i,j,MPSIZE) = tempc[MPSIZE];
         for ( k = MPDATA; k < MPDATA + tempc[MPSIZE]; k++ )
            lc(i,j,k) = tempc[k];
#endif
      }
}