/* Kernels used in sparse ILU (and LU) and in the resulting triangular solves. These are for block algorithms where the block sizes are on the order of 2-6+. There are TWO versions of the macros below. 1) standard for MatScalar == PetscScalar use the standard BLAS for block size larger than 7 and 2) handcoded Fortran single precision for the matrices, since BLAS does not have some arguments in single and some in double. */ #if !defined(__ILU_H) #define __ILU_H #include "petscblaslapack.h" /* These are C kernels,they are contained in src/mat/impls/baij/seq */ EXTERN PetscErrorCode LINPACKdgefa(MatScalar*,PetscInt,PetscInt*); EXTERN PetscErrorCode LINPACKdgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*); EXTERN PetscErrorCode Kernel_A_gets_inverse_A_2(MatScalar*); EXTERN PetscErrorCode Kernel_A_gets_inverse_A_3(MatScalar*); #define Kernel_A_gets_inverse_A_4_nopivot(mat) 0;\ {\ MatScalar d, di;\ \ di = mat[0];\ mat[0] = d = 1.0 / di;\ mat[4] *= -d;\ mat[8] *= -d;\ mat[12] *= -d;\ mat[1] *= d;\ mat[2] *= d;\ mat[3] *= d;\ mat[5] += mat[4] * mat[1] * di;\ mat[6] += mat[4] * mat[2] * di;\ mat[7] += mat[4] * mat[3] * di;\ mat[9] += mat[8] * mat[1] * di;\ mat[10] += mat[8] * mat[2] * di;\ mat[11] += mat[8] * mat[3] * di;\ mat[13] += mat[12] * mat[1] * di;\ mat[14] += mat[12] * mat[2] * di;\ mat[15] += mat[12] * mat[3] * di;\ di = mat[5];\ mat[5] = d = 1.0 / di;\ mat[1] *= -d;\ mat[9] *= -d;\ mat[13] *= -d;\ mat[4] *= d;\ mat[6] *= d;\ mat[7] *= d;\ mat[0] += mat[1] * mat[4] * di;\ mat[2] += mat[1] * mat[6] * di;\ mat[3] += mat[1] * mat[7] * di;\ mat[8] += mat[9] * mat[4] * di;\ mat[10] += mat[9] * mat[6] * di;\ mat[11] += mat[9] * mat[7] * di;\ mat[12] += mat[13] * mat[4] * di;\ mat[14] += mat[13] * mat[6] * di;\ mat[15] += mat[13] * mat[7] * di;\ di = mat[10];\ mat[10] = d = 1.0 / di;\ mat[2] *= -d;\ mat[6] *= -d;\ mat[14] *= -d;\ mat[8] *= d;\ mat[9] *= d;\ mat[11] *= d;\ mat[0] += mat[2] * mat[8] * di;\ mat[1] += mat[2] * mat[9] * di;\ mat[3] += mat[2] * mat[11] * di;\ mat[4] += mat[6] * mat[8] * di;\ mat[5] += mat[6] * mat[9] * di;\ mat[7] += mat[6] * mat[11] * di;\ mat[12] += mat[14] * mat[8] * di;\ mat[13] += mat[14] * mat[9] * di;\ mat[15] += mat[14] * mat[11] * di;\ di = mat[15];\ mat[15] = d = 1.0 / di;\ mat[3] *= -d;\ mat[7] *= -d;\ mat[11] *= -d;\ mat[12] *= d;\ mat[13] *= d;\ mat[14] *= d;\ mat[0] += mat[3] * mat[12] * di;\ mat[1] += mat[3] * mat[13] * di;\ mat[2] += mat[3] * mat[14] * di;\ mat[4] += mat[7] * mat[12] * di;\ mat[5] += mat[7] * mat[13] * di;\ mat[6] += mat[7] * mat[14] * di;\ mat[8] += mat[11] * mat[12] * di;\ mat[9] += mat[11] * mat[13] * di;\ mat[10] += mat[11] * mat[14] * di;\ } EXTERN PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *); # if defined(PETSC_HAVE_SSE) EXTERN PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(MatScalar *); # endif EXTERN PetscErrorCode Kernel_A_gets_inverse_A_5(MatScalar *); EXTERN PetscErrorCode Kernel_A_gets_inverse_A_6(MatScalar *); EXTERN PetscErrorCode Kernel_A_gets_inverse_A_7(MatScalar *); /* A = inv(A) A_gets_inverse_A A - square bs by bs array stored in column major order pivots - integer work array of length bs W - bs by 1 work array */ #define Kernel_A_gets_inverse_A(bs,A,pivots,W) (LINPACKdgefa((A),(bs),(pivots)) || LINPACKdgedi((A),(bs),(pivots),(W))) /* -----------------------------------------------------------------------*/ #if !defined(PETSC_USE_MAT_SINGLE) /* Version that calls the BLAS directly */ /* A = A * B A_gets_A_times_B A, B - square bs by bs arrays stored in column major order W - square bs by bs work array */ #define Kernel_A_gets_A_times_B(bs,A,B,W) \ { \ PetscBLASInt _bbs = (PetscBLASInt)bs;\ PetscScalar _one = 1.0,_zero = 0.0; \ PetscErrorCode _ierr; \ _ierr = PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \ BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs));\ } /* A = A - B * C A_gets_A_minus_B_times_C A, B, C - square bs by bs arrays stored in column major order */ #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \ { \ PetscBLASInt _bbs = (PetscBLASInt)bs;\ PetscScalar _mone = -1.0,_one = 1.0; \ BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs));\ } /* A = A + B^T * C A_gets_A_plus_Btranspose_times_C A, B, C - square bs by bs arrays stored in column major order */ #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \ { \ PetscBLASInt _bbs = (PetscBLASInt)bs;\ PetscScalar _one = 1.0; \ BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs));\ } /* v = v + A^T w v_gets_v_plus_Atranspose_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \ { \ PetscScalar _one = 1.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs, _ione = 1; \ BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione); \ } /* v = v - A w v_gets_v_minus_A_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \ { \ PetscScalar _mone = -1.0,_one = 1.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs, _ione = 1; \ BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione); \ } /* v = v + A w v_gets_v_plus_A_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \ { \ PetscScalar _one = 1.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs,_ione = 1; \ BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione); \ } /* v = v + A w v_gets_v_plus_Ar_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \ { \ PetscScalar _one = 1.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs,_bncols = (PetscBLASInt)ncols,_ione = 1; \ BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione); \ } /* w = A v w_gets_A_times_v v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_w_gets_A_times_v(bs,v,A,w) \ { \ PetscScalar _zero = 0.0,_one = 1.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs,_ione = 1; \ BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione); \ } /* z = A*x */ #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \ { \ PetscScalar _one = 1.0,_zero = 0.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs,_bncols = (PetscBLASInt)ncols,_ione = 1; \ BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione); \ } /* z = trans(A)*x */ #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \ { \ PetscScalar _one = 1.0; \ PetscBLASInt _bbs = (PetscBLASInt)bs,_bncols = (PetscBLASInt)ncols,_ione = 1; \ BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione); \ } #else /* Version that calls Fortran routines; can handle different precision of matrix (array) and vectors */ /* These are Fortran kernels: They replace certain BLAS routines but have some arguments that may be single precision,rather than double These routines are provided in src/fortran/kernels/sgemv.F They are pretty pitiful but get the job done. The intention is that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined code is used. */ #ifdef PETSC_HAVE_FORTRAN_CAPS #define msgemv_ MSGEMV #define msgemvp_ MSGEMVP #define msgemvm_ MSGEMVM #define msgemvt_ MSGEMVT #define msgemmi_ MSGEMMI #define msgemm_ MSGEMM #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE) #define msgemv_ msgemv #define msgemvp_ msgemvp #define msgemvm_ msgemvm #define msgemvt_ msgemvt #define msgemmi_ msgemmi #define msgemm_ msgemm #endif EXTERN_C_BEGIN EXTERN void msgemv_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*); EXTERN void msgemvp_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*); EXTERN void msgemvm_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*); EXTERN void msgemvt_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*); EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*); EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*); EXTERN_C_END /* A = A * B A_gets_A_times_B A, B - square bs by bs arrays stored in column major order W - square bs by bs work array */ #define Kernel_A_gets_A_times_B(bs,A,B,W) \ { \ PetscErrorCode _ierr = PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \ msgemmi_(&bs,A,B,W); \ } /* A = A - B * C A_gets_A_minus_B_times_C A, B, C - square bs by bs arrays stored in column major order */ #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \ { \ msgemm_(&bs,A,B,C); \ } /* v = v - A w v_gets_v_minus_A_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \ { \ msgemvm_(&bs,&bs,A,w,v); \ } /* v = v + A w v_gets_v_plus_A_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \ { \ msgemvp_(&bs,&bs,A,w,v);\ } /* v = v + A w v_gets_v_plus_Ar_times_w v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \ { \ msgemvp_(&bs,&ncol,A,v,w);\ } /* w = A v w_gets_A_times_v v - array of length bs A - square bs by bs array w - array of length bs */ #define Kernel_w_gets_A_times_v(bs,v,A,w) \ { \ msgemv_(&bs,&bs,A,v,w);\ } /* z = A*x */ #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \ { \ msgemv_(&bs,&ncols,A,x,z);\ } /* z = trans(A)*x */ #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \ { \ msgemvt_(&bs,&ncols,A,x,z);\ } /* These do not work yet */ #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) #endif #endif