PETSc matrix-vector kernels

From PERI

Jump to: navigation, search

Contents

Block matrix-vector multiplication

For fixed known block sizes <= 7, PETSc uses a hand-optimized size-dependent specialized routine. For example, for matrices made up of square blocks of size 4, the following function is used:

PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
  const PetscScalar *x,*xb;
  const MatScalar   *v;
  PetscErrorCode    ierr;
  PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
  PetscTruth        usecprow=a->compressedrow.use;

  PetscFunctionBegin;
  ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
  ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);

  idx = a->j;
  v   = a->a;
  if (usecprow){
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  for (i=0; i<mbs; i++) {
    n  = ii[1] - ii[0]; ii++; 
    sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
    nonzerorow += (n>0);
    for (j=0; j<n; j++) {
      xb = x + 4*(*idx++);
      x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
      sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
      sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
      sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
      sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
      v += 16;
    }
    if (usecprow) z = zarray + 4*ridx[i];
    z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
    if (!usecprow) z += 4;
  }
  ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
  ierr = PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

For block sizes greater than 7, a generalized matrix-vector multiplication function is used (MatMult_SeqBAIJ), which in turn calls a macro wrapping the BLAS GEMV call.

PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
  MatScalar      *v;
  PetscErrorCode ierr;
  PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
  PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
  PetscTruth     usecprow=a->compressedrow.use;

  PetscFunctionBegin;
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);

  idx = a->j;
  v   = a->a;
  if (usecprow){
    mbs  = a->compressedrow.nrows;
    ii   = a->compressedrow.i;
    ridx = a->compressedrow.rindex;
  } else {
    mbs = a->mbs;
    ii  = a->i;
    z   = zarray;
  }

  if (!a->mult_work) {
    k    = PetscMax(A->rmap->n,A->cmap->n);
    ierr = PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
  }
  work = a->mult_work;
  for (i=0; i<mbs; i++) {
    n     = ii[1] - ii[0]; ii++;
    ncols = n*bs;
    workt = work;
    nonzerorow += (n>0);
    for (j=0; j<n; j++) {
      xb = x + bs*(*idx++);
      for (k=0; k<bs; k++) workt[k] = xb[k];
      workt += bs;
    }
    if (usecprow) z = zarray + bs*ridx[i];
    Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
    /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
    v += n*bs2;
    if (!usecprow) z += bs;
  }
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

The macro is:

/*
        z = A*x
*/
#define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
{ \
  PetscScalar _one = 1.0,_zero = 0.0; \
  PetscBLASInt _ione = 1,_bbs,_bncols; \
  _bbs = PetscBLASIntCast(bs); _bncols = PetscBLASIntCast(ncols); \
  BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione); \
}

Note that the above implementation does the following steps:

  1. Pack multiple blocks on the same block row into the same temporary matrix (work) of with blocksize rows and the same number of columns as the sequential portion of the matrix.
  2. Invoke a regular GEMV operation from BLAS.

An alternative implementation could skip the packing of the multiple blocks and invoke GEMV on the block matrix vector products (with square blocksize by blocksize matrices). The macro is Kernel_w_gets_A_times_v(bs,x,A,z).

Simplified versions

A general simple version of dense rectangular matrix-vector product for double precision (assuming row-major storage) is:

/* y = A * x     where A: matrix(m,n), x: vec(m), y: vec(n) */
void dgemv(const int m, const int n, const double *A, const double *x, double *y)
{
   int i, j;
   register double ytmp;

   for (i=0; i < m; ++i)
   {
      ytmp = 0.0;
      for (j=0; j < n; ++j) ytmp += A[i*n + j] * x[j];
      y[i] = ytmp;
   }
}

PETSc kernel MatSolve_SeqBAIJ_N from baijfac2.c

PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;      
  IS             iscol=a->col,isrow=a->row;      
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 
  PetscInt       i,n=a->mbs;      
  PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;   
  MatScalar      *aa=a->a,*v;                    
  PetscScalar    *x,*b,*s,*t,*ls;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
  for (i=1; i<n; i++) {   
    v   = aa + bs2*ai[i]; 
    vi  = aj + ai[i];  
    nz  = a->diag[i] - ai[i];
    s = t + bs*i;
    ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
    while (nz--) {
      Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
      v += bs2; 
    }
  }
  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n;
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*(a->diag[i] + 1);
    vi  = aj + a->diag[i] + 1;
    nz  = ai[i+1] - a->diag[i] - 1;
    ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
    while (nz--) {
      Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
      v += bs2;
    }
    Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
    ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

Simplified version of MatSolve_SeqBAIJ_N:

PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
  PetscInt       i,n=a->mbs;
  PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*s,*t,*ls;
  
  PetscInt       j,k;    /*--- to replace PETSc calls with 2-deep loop nest */
  PetscInt       idx;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t  = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

  /* forward solve the lower triangular */
  ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = a->diag[i] - ai[i];
    s = t + bs*i;
    /* -- ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); */
    idx = bs*(*r++);
    for (j=0; j<bs; j++) 
      s[j] = b[idx+j];  

    while (nz--) {
      /* -- Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); --- */
      /* -- begin Kernel_v_gets_v_minus_A-Times_w  */
      idx = bs*(*vi++);
      for (k=0; k<bs; k++) {
	for (j=0; j<bs; j++) {
	  s[k] -= v[bs*j+k] * t[idx+j];
	}
      } 
      /* -- end Kernel_v_gets_v_minus_A_times_w */
      v += bs2;
    }
    idx = bs*i;
    for (k=0; k<bs; k++) {
      t[idx+k] = s[k];
  }

  /* backward solve the upper triangular */
  ls = a->solve_work + A->cmap->n;
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*(a->diag[i] + 1);
    vi  = aj + a->diag[i] + 1;
    nz  = ai[i+1] - a->diag[i] - 1;
    ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
    while (nz--) {
      /* -- Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); */
      /* -- begin Kernel_v_gets_v_minus_A_times_w */
      idx = bs*(*vi++);
      for (k=0; k<bs; k++) {
        for (j=0; j<bs; j++) {
	  ls[k] -= v[bs*j+k] * t[idx+j];
        }
      } /* -- end Kernel_v_gets_v_minus_A_times_w */	
      v += bs2;
    }

    Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
    ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}