(file) Return to fresize.c CVS log (file) (dir) Up to [Development] / JSOC / proj / sharp / apps

File: [Development] / JSOC / proj / sharp / apps / fresize.c (download)
Revision: 1.1, Thu Aug 23 07:38:33 2012 UTC (11 years, 1 month ago) by xudong
Branch: MAIN
CVS Tags: Ver_LATEST, Ver_9-5, Ver_9-41, Ver_9-4, Ver_9-3, Ver_9-2, Ver_9-1, Ver_9-0, Ver_8-8, Ver_8-7, Ver_8-6, Ver_8-5, Ver_8-4, Ver_8-3, Ver_8-2, Ver_8-12, Ver_8-11, Ver_8-10, Ver_8-1, Ver_8-0, Ver_7-1, Ver_7-0, Ver_6-4, HEAD
Update functioning code sharp.c per Monica's request

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <string.h>

#include <math.h>

#include <mkl_blas.h>

#include <mkl_service.h>

#include <mkl_lapack.h>

#include <mkl_vml_functions.h>

#include <omp.h>

#include "fresize.h"

#define minval(x,y) (((x) < (y)) ? (x) : (y))

#define maxval(x,y) (((x) < (y)) ? (y) : (x))

#define fresize_sample 1

#define fresize_bin 2

#define fresize_1d 3

#define fresize_2d 4



double sinc(double x)

{

if (fabs(x) < (1.e-10))

  return 1.;

else

  return sin(M_PI*x)/(M_PI*x);

}



int init_fresize_sample(

  struct fresize_struct *pars,

  int nsub

)

{



  pars->method=fresize_sample;

  pars->nsub=nsub;



  return 0;

}



int init_fresize_bin(

  struct fresize_struct *pars,

  int nsub

)

{



  pars->method=fresize_bin;

  pars->nsub=nsub;



  return 0;

}



int init_fresize_boxcar(

  struct fresize_struct *pars,

  int hwidth, // Half width of boxcar. Full is 2*hwidth+1.

  int nsub // Distance between sampled points

)

{

  const int malign=32;

  int fwidth;

  int i,j;



  pars->method=fresize_1d;

  pars->nsub=nsub;

  pars->hwidth=hwidth;

  fwidth=2*hwidth+1;

  pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign));

  pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign));

  for (i=0;i<fwidth;i++) {

    pars->kerx[i]=1.0f/fwidth;

    pars->kery[i]=1.0f/fwidth;

  }



  return 0;

}



int init_fresize_gaussian(

  struct fresize_struct *pars,

  float sigma, // Shape is exp(-(d/sigma)^2/2)

  int hwidth, // Half (truncation) width of kernel. Full is 2*hwidth+1.

  int nsub // Distance between sampled points

)

{

  const int malign=32;

  int fwidth;

  int i,j;

  float sum,x,y;



  pars->method=fresize_1d;

  pars->nsub=nsub;

  pars->hwidth=hwidth;

  fwidth=2*hwidth+1;

  pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign));

  pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign));

  sum=0.0f;

  for (i=0;i<fwidth;i++) {

    x=(i-hwidth)/sigma;

    y=exp(-x*x/2);

    sum=sum+y;

  }

  for (i=0;i<fwidth;i++) {

    x=(i-hwidth)/sigma;

    y=exp(-x*x/2);

    pars->kerx[i]=y/sum;

    pars->kery[i]=y/sum;

  }



  return 0;

}



int free_fresize(

  struct fresize_struct *pars

)

{

  if (pars->method==fresize_1d) {

    MKL_free (pars->kerx);

    MKL_free (pars->kery);

  }



  return 0;

}



int fsample(

  float *image_in,

  float *image_out,

  int nxin,

  int nyin,

  int nleadin,

  int nxout,

  int nyout,

  int nleadout,

  int nsub,

  int xoff,

  int yoff,

  float fillval

)



{

  int i,j;

  int imin,imax,jmin,jmax;

  float *imp;



// Find first and last indices for which resulting input point is within image

  if (xoff >= 0) imin=0; else imin=((-xoff+nsub-1)/nsub);

  if (xoff <= 0) imax=nxout-1; else imax=minval(((nxin-xoff-1)/nsub),nxout-1);

  if (yoff >= 0) jmin=0; else jmin=((-yoff+nsub-1)/nsub);

  if (yoff <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoff-1)/nsub),nyout-1);



  imp=image_in+yoff*nleadin+xoff;



#pragma omp parallel default(none) private(i,j) shared(image_in,image_out,imp,fillval) shared(imin,imax,jmin,jmax) shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub)

  { // Needed to define parallel region



// Fill below valid region

#pragma omp for

    for (j=0; j<jmin; j++) {

      for (i=0; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



// Valid heights

#pragma omp for

    for (j=jmin; j<=jmax; j++) {

// Fill to the left

      for (i=0; i<imin; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

// Valid region

      for (i=imin; i<=imax; i++) {

//      image_out[j*nleadout+i]=image_in[j*nsub*nleadin+i*nsub];

        image_out[j*nleadout+i]=imp[j*nsub*nleadin+i*nsub];

      } // i=

// Fill to the right

      for (i=imax+1; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



// Fill above valid region

#pragma omp for

    for (j=jmax+1; j<nyout; j++) {

      for (i=0; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



  } // End of parallel region



  return 0;

}



int fbin(

  float *image_in,

  float *image_out,

  int nxin,

  int nyin,

  int nleadin,

  int nxout,

  int nyout,

  int nleadout,

  int nsub,

  int xoff,

  int yoff,

  float fillval

)



{

  int i,j,i1,j1;

  int imin,imax,jmin,jmax;

  float *imp,*impi;

  float sum;



// Find first and last indices for which resulting input point is within image

  if (xoff >= 0) imin=0; else imin=((-xoff+nsub-1)/nsub);

  if (xoff <= 0) imax=nxout-1; else imax=minval(((nxin-xoff-nsub)/nsub),nxout-1);

  if (yoff >= 0) jmin=0; else jmin=((-yoff+nsub-1)/nsub);

  if (yoff <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoff-nsub)/nsub),nyout-1);



  imp=image_in+yoff*nleadin+xoff;



#pragma omp parallel default(none) private(i,j,i1,j1,impi,sum) shared(image_in,image_out,imp,fillval) shared(imin,imax,jmin,jmax) shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub)

  { // Needed to define parallel region



// Fill below valid region

#pragma omp for

    for (j=0; j<jmin; j++) {

      for (i=0; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



// Valid heights

#pragma omp for

    for (j=jmin; j<=jmax; j++) {

// Fill to the left

      for (i=0; i<imin; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

// Valid region

      for (i=imin; i<=imax; i++) {

        impi=imp+j*nleadin*nsub+i*nsub;

        sum=0.0f;

        for (j1=0; j1<nsub; j1++) {

          for (i1=0; i1<nsub; i1++) {

            sum=sum+impi[i1];

          }

          impi=impi+nleadin;

        }

        image_out[j*nleadout+i]=sum/(nsub*nsub);

      } // i=

// Fill to the right

      for (i=imax+1; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



// Fill above valid region

#pragma omp for

    for (j=jmax+1; j<nyout; j++) {

      for (i=0; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



  } // End of parallel region



  return 0;

}



int f1d(

  struct fresize_struct *pars,

  float *image_in,

  float *image_out,

  int nxin,

  int nyin,

  int nleadin,

  int nxout,

  int nyout,

  int nleadout,

  int xoff,

  int yoff,

  float fillval

)



{

  const int malign=32;

  char transpose[] = "t";

  char normal[] = "n";

  const int ione = 1;

  const float fone = 1.0f;

  const float fzero = 0.0f;

  int i,j,i1,j1;

  int imin,imax,jmin,jmax;

  float *inp,*inpi,*inpj;

  float sum;

  int nsub,hwidth;

  float *kerx,*kery,*work;

  int xoffl,xoffh,yoffl,yoffh;

  int nwork;

  double t1,t2,t3;

  int n1,n2,nchunk;



  nsub=pars->nsub;

  hwidth=pars->hwidth;

  kerx=pars->kerx;

  kery=pars->kery;



// Find first and last indices for which resulting input point is within image

  xoffl=xoff-hwidth;

  xoffh=xoff+hwidth;

  if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub);

  if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1);



  yoffl=yoff-hwidth;

  yoffh=yoff+hwidth;

  if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub);

  if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1);



// Get work array big enough to hold convolution in x

  nwork=nxout*nyin;

  work=(float *)(MKL_malloc(nwork*sizeof(float),malign));



nchunk=64;

n1=(imax-imin+1); // Size of matrix for sgemv

n2=2*hwidth+1; // Size of matrix for sgemv

//t1=dsecnd();

 

#pragma omp parallel default(none) private(i,j,i1,j1,inpi,inpj,sum) shared(image_in,image_out,fillval) shared(imin,imax,jmin,jmax,hwidth,kerx,kery,work,xoffl,yoffl) shared(normal,transpose,n1,n2,nchunk) shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub)

  { // Needed to define parallel region



// First convolve in x

// Brute force loop takes longer than it ought to, but have not

// found an obvious solution.

#pragma omp for

    for (j=0;j<nyin;j++) {

      inpj=image_in+j*nleadin+xoffl; // Note offset to match kernels.

      for (i=imin; i<=imax; i++) {

        sum=0.0f;

        inpi=inpj+i*nsub;

        for (i1=0; i1<=2*hwidth; i1++) {

          sum=sum+kerx[i1]*inpi[i1];

        }

        work[j*nleadout+i]=sum;

// This works but is slower

//      work[j*nleadout+i]=sdot(&n2,kerx,&ione,inpj+i*nsub,&ione);

      }

// This clever trick gives error message

//sgemv(transpose,&n2,&n1,&fone,inpj+imin*nsub,&nsub,kerx,&ione,&fzero,work+j*nleadout+imin,&ione);

    }

/*

// This also works but is slower

#pragma omp for

for (j=0;j<nyin;j=j+nchunk) {

  for (i=imin; i<=imax; i++) {

    sgemv(transpose,&n2,&nchunk,&fone,image_in+j*nleadin+xoffl+i*nsub,&nleadin,kerx,&ione,&fzero,work+j*nleadout+i,&nleadout);

}

*/



// Then convolve in y



#pragma omp for

    for (j=jmin; j<=jmax; j++) {

      inpj=work+(yoffl+j*nsub)*nleadout;

/* Old brute force code

      for (i=imin; i<=imax; i++) {

        sum=0.0f;

        for (j1=0; j1<=2*hwidth; j1++) {

          sum=sum+inpj[j1*nleadout+i]*kery[j1];

        }

        image_out[j*nleadout+i]=sum;

      }

*/

      sgemv(normal,&n1,&n2,&fone,inpj+imin,&nleadout,kery,&ione,&fzero,image_out+j*nleadout+imin,&ione);

    }



// Fill below valid region

#pragma omp for

    for (j=0; j<jmin; j++) {

      for (i=0; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



// Valid heights

#pragma omp for

    for (j=jmin; j<=jmax; j++) {

// Fill to the left

      for (i=0; i<imin; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

// Fill to the right

      for (i=imax+1; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



// Fill above valid region

#pragma omp for

    for (j=jmax+1; j<nyout; j++) {

      for (i=0; i<nxout; i++) {

        image_out[j*nleadout+i]=fillval;

      } // i=

    } //j=



  } // End of parallel region



  return 0;

}





int fresize(

  struct fresize_struct *pars,

  float *image_in,

  float *image_out,

  int nxin,

  int nyin,

  int nleadin,

  int nx,

  int ny,

  int nlead,

  int xoff,

  int yoff,

  float fillval

)

{

  int status;



  switch (pars->method) {



  case fresize_sample:

    status=fsample(image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,pars->nsub,xoff,yoff,fillval);

    break;



  case fresize_bin:

    status=fbin(image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,pars->nsub,xoff,yoff,fillval);

    break;



  case fresize_1d:

    status=f1d(pars,image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,xoff,yoff,fillval);

    break;



  default:

    status=1;

    break;

  }



return status;



}




Karen Tian
Powered by
ViewCVS 0.9.4