/************************************************************************** * * Subroutine P11 performs the matrix multiplication C = A x B * * Parameters: * * Provided by calling routine: * A = Matrix of multiple precision numbers * B = Matrix of multiple precision numbers * MODULUS = Large integer modulus, expressed as an array of integers * Q = The size of the grid of PEs, Q x Q * M = The size of the sub-matrices on each PE * NUMSIZE = The number of words used to store an MP integer * * Returned by this routine: * C = Matrix of multiple precision numbers * *************************************************************************** * * This version for non-shared-memory systems uses Fox`s algorithm for * matrix multiplication. * **************************************************************************/ /* CVS info */ /* $Date: 2005/09/19 18:45:08 $ */ /* $Revision: 1.3 $ */ /* $RCSfile: p11.c,v $ */ /* $Name: rel_5 $ */ #include "bench11.h" #include static char cvs_info[] = "BMARKGRP $Date: 2005/09/19 18:45:08 $ $Revision: 1.3 $ $RCSfile: p11.c,v $ $Name: rel_5 $"; // use macros to address 3-dim arrays with variable dimension #define c(I,J,K) c[ (K) + numsize * ( (J) + m * (I) ) ] #define la(I,J,K) la[ (K) + numsize * ( (J) + m * (I) ) ] #define lb(I,J,K) lb[ (K) + numsize * ( (J) + m * (I) ) ] #define lc(I,J,K) lc[ (K) + numsize * ( (J) + m * (I) ) ] void p11 ( mplong a[], mplong b[], mplong c[], mplong modulus[], int q, int m, int numsize ) { // Multiply into tempmp, accumulate sums in tempc mplong tempmp [ 2*maxnumsize ]; mplong tempc [ 2*maxnumsize ]; int i, j, abcsize, tempsize; void mult(); // tempa, otherb are for shuffling submatrices for Fox's algorithm // multa, multb pointers passed to mult mplong * sendb , * recvb , * multa , * multb; mplong * tempa , * otherb; int row, col, root, whichb, step, matrixsize, sendsize; int sour, dest, tag, ierror; extern int npes, mype; MPI_Status status; MPI_Request request; MPI_Comm world = MPI_COMM_WORLD; MPI_Comm row_comm; if ( q > 1 ) { matrixsize = numsize * m * m; tempa = (mplong *) malloc ( 8 * matrixsize ); otherb = (mplong *) malloc ( 8 * matrixsize ); row = mype / q; col = mype % q; // make row communicator for broadcasting A across row MPI_Comm_split( world, row, col, &row_comm ); tag = 8; whichb = 0; } abcsize = numsize - MPHL; tempsize = 2*maxnumsize - MPHL; // initialize c to hold product a*b for ( i = 0; i < m; i++ ) { for ( j = 0; j < m; j++ ) { MPINIT ( &c(i,j,0), abcsize, 0 ); } } // initialize temporaries MPINIT ( tempmp, tempsize, 0 ); if (q == 1) { // M = N, ordinary multiply on 1 processor for timing comparison mult ( a, b, c, modulus, numsize, m, tempc, tempmp, tempsize ); } else { // q > 1 -- Fox's algorithm takes q steps sendsize = matrixsize * intspermplong; for ( step = 0; step < q; step++ ) { // Spread a subblock of A across a row of PEs, then multiply // PE's on main diagonal send; next step, diag moves to right root = (row + step) % q; MPI_Barrier(world); if ( col == root ) { // Each PE on curr diag broadcasts its A, multiplies its A MPI_Bcast ( a, sendsize, MPI_INT, root, row_comm ); multa = a; } else { // Each PE not on curr diag receives broadcast into tempa, // multiplies tempa MPI_Bcast ( tempa, sendsize, MPI_INT, root, row_comm ); multa = tempa; } // active version of B switches between B and otherB if ( whichb == 0 ) multb = b; else multb = otherb; // multiply appropriate current subblocks mult ( multa, multb, c, modulus, numsize, m, tempc, tempmp, tempsize ); if ( step < q-1 ) { // Rotate rows of sub-matrices up one (B->otherB or vice versa) // Each PE receives from below and sends to above dest = ( mype + q * (q-1) ) % npes; sour = ( mype + q ) % npes; if ( whichb == 0 ) { sendb = b; recvb = otherb; } else { sendb = otherb; recvb = b; } MPI_Barrier(world); MPI_Irecv ( recvb, sendsize, MPI_INT, sour, tag, world, &request); MPI_Send ( sendb, sendsize, MPI_INT, dest, tag, world ); MPI_Wait ( &request, &status ); MPI_Barrier(world); // Switch to "other" B - toggle between B, otherb in mult whichb = 1 - whichb; } // if ( step < q-1 ) } // loop on step to q-1 free ( tempa ); free ( otherb ); } // q > 1 } // p11 // Single subr to handle 4 combinations (A or tempA) X (B or otherB) void mult ( mplong la[], mplong lb[], mplong lc[], mplong modulus[], int numsize, int m, mplong tempc[], mplong tempmp[], int tempsize ) { int i, j, k; // extern int VERBOSE; #ifdef GMP // Must fix up pointer word since most A's and all B's have been moved for ( i = 0; i < m; i++ ) for ( j = 0; j < m; j++ ) { la(i,j,MPPTR) = (mplong) & la(i,j,MPDATA); lb(i,j,MPPTR) = (mplong) & lb(i,j,MPDATA); } #endif for ( i = 0; i < m; i++ ) for ( j = 0; j < m; j++ ) { // zero tempc - accumulate sum during loop // size = 0 is enough MPINIT ( tempc, tempsize, 0 ); /* VERBOSE = 2; */ /* it makes MPUTIL print lots of debug info */ for ( k = 0; k < m; k++ ) { mpmult ( tempmp, &la(i,k,0), &lb(k,j,0) ); mpadd ( tempc, tempc, tempmp ); } /* * Accumulate sums in lc(i,j) -- every entry has Q contributions * (one for each time P11 calls MULT) * With Q-1 extra calls to mpmod, C can be (numsize.. not (2*numsize.. * Work in double-size tempc, then copy back into single-size lc(i,j) */ mpadd ( tempc, tempc, &lc(i,j,0) ); mpmod ( tempc, tempc, modulus ); #ifdef GMP mpset ( &lc(i,j,0), tempc ); #else // MPUTIL lc(i,j,MPSIZE) = tempc[MPSIZE]; for ( k = MPDATA; k < MPDATA + tempc[MPSIZE]; k++ ) lc(i,j,k) = tempc[k]; #endif } }