![]() ![]() |
![]() |
File: [Development] / JSOC / proj / libs / interpolate / fresize.c
(download)
Revision: 1.5, Thu Mar 13 17:26:57 2014 UTC (9 years ago) by schou Branch: MAIN CVS Tags: Ver_LATEST, Ver_9-5, Ver_9-41, Ver_9-4, Ver_9-3, Ver_9-2, Ver_9-1, Ver_9-0, Ver_8-8, Ver_8-7, Ver_8-6, Ver_8-5, Ver_8-4, Ver_8-12, Ver_8-11, Ver_8-10, HEAD Changes since 1.4: +66 -72 lines Fixed bug for case where kernel is larger than input array. Also removed some old code with tests of alternative implementations as these may no longer work. |
#include <stdio.h> #include <stdlib.h> #include <time.h> #include <string.h> #include <math.h> #include <mkl_blas.h> #include <mkl_service.h> #include <mkl_lapack.h> #include <mkl_vml_functions.h> #include <omp.h> #include <complex.h> #include <fftw3.h> #include "fresize.h" #define minval(x,y) (((x) < (y)) ? (x) : (y)) #define maxval(x,y) (((x) < (y)) ? (y) : (x)) #define fresize_sample 1 #define fresize_bin 2 #define fresize_1d 3 #define fresize_2d 4 #define fresize_1d_fft 5 #define fresize_2d_fft 6 static double sinc(double x) { if (fabs(x) < (1.e-10)) return 1.; else return sin(M_PI*x)/(M_PI*x); } int make_fft1d( struct fresize_struct *pars, int nxin, int nyin ) { fftwf_complex *helpc,*fkernel,*fkernely; float *helpin,*helpout; fftwf_plan plan1,plan2,plan1y,plan2y; int nxinp,nyinp; // Complex array size int nin,ninp; int hwidth /*,fwidth */; int i,/*j,*/i1/*,j1*/; float c; hwidth=pars->hwidth; /* fwidth=2*hwidth+1; */ nxinp=(nxin/2+1); nyinp=(nyin/2+1); nin=maxval(nxin,nyin); // Max value to use same arrays ninp=maxval(nxinp,nyinp); // Max value to use same arrays helpin=(float*) fftwf_malloc(sizeof(float) * nin); fkernel=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * nxinp); fkernely=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * nyinp); helpc=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * ninp); helpout=(float*) fftwf_malloc(sizeof(float) * nin); plan1=fftwf_plan_dft_r2c_1d(nxin,helpin,helpc,FFTW_ESTIMATE); plan2=fftwf_plan_dft_c2r_1d(nxin,helpc,helpout,FFTW_ESTIMATE); plan1y=fftwf_plan_dft_r2c_1d(nxin,helpin,helpc,FFTW_ESTIMATE); plan2y=fftwf_plan_dft_c2r_1d(nxin,helpc,helpout,FFTW_ESTIMATE); pars->helpin=helpin; pars->helpout=helpout; pars->fkernel=fkernel; pars->fkernely=fkernely; pars->helpc=helpc; pars->plan1=plan1; pars->plan2=plan2; pars->plan1y=plan1y; pars->plan2y=plan2y; pars->nxin=nxin; pars->nyin=nyin; pars->nxinp=nxinp; pars->nyinp=nyinp; pars->method=fresize_1d_fft; /* First transform kernel in x*/ /* Zero entire array */ for (i=0;i<nxin;i++) { helpin[i]=0; } // Copy kernel to new array for (i=-hwidth;i<=hwidth;i++) { i1=(i+nxin) % nxin; helpin[i1]=pars->kerx[i+hwidth]; } /* Transform kernel */ fftwf_execute(plan1); /* Copy transform and scale */ c=1.0f/nxin; for (i=0;i<nxinp;i++) { fkernel[i]=c*helpc[i]; } /* Then transform kernel in y*/ /* Zero entire array */ for (i=0;i<nxin;i++) { helpin[i]=0; } /* Copy kernel to new array */ for (i=-hwidth;i<=hwidth;i++) { i1=(i+nyin) % nyin; helpin[i1]=pars->kery[i+hwidth]; } /* Transform kernel */ fftwf_execute(plan1y); /* Copy transform and scale */ c=1.0f/nyin; for (i=0;i<nyinp;i++) { fkernely[i]=c*helpc[i]; } return 0; } int make_fft2d( struct fresize_struct *pars, int nxin, int nyin ) { fftwf_complex *helpc,*fkernel; float *helpin,*helpout; fftwf_plan plan1,plan2; int nxinp; // Complex array size int hwidth,fwidth; int i,j,i1,j1; float c; hwidth=pars->hwidth; fwidth=2*hwidth+1; nxinp=(nxin/2+1); helpin=(float*) fftwf_malloc(sizeof(float) * nxin*nyin); fkernel=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * nxinp*nyin); helpc=(fftwf_complex*) fftwf_malloc(sizeof(fftwf_complex) * nxinp*nyin); helpout=(float*) fftwf_malloc(sizeof(float) * nxin*nyin); plan1=fftwf_plan_dft_r2c_2d(nyin,nxin,helpin,helpc,FFTW_ESTIMATE); plan2=fftwf_plan_dft_c2r_2d(nyin,nxin,helpc,helpout,FFTW_ESTIMATE); pars->helpin=helpin; pars->helpout=helpout; pars->fkernel=fkernel; pars->helpc=helpc; pars->plan1=plan1; pars->plan2=plan2; pars->nxin=nxin; pars->nyin=nyin; pars->nxinp=nxinp; pars->method=fresize_2d_fft; /* First transform kernel */ /* Zero entire array */ for (j=0;j<nyin;j++) { for (i=0;i<nxin;i++) { helpin[j*nxin+i]=0; } } /* helpin[0]=1.0f; */ /* Copy kernel to new array */ for (j=-hwidth;j<=hwidth;j++) { j1=(j+nyin) % nyin; for (i=-hwidth;i<=hwidth;i++) { i1=(i+nxin) % nxin; helpin[j1*nxin+i1]=pars->ker[(j+hwidth)*fwidth+(i+hwidth)]; } } /* Transform kernel */ fftwf_execute(plan1); /* Copy transform and scale */ c=1.0f/nxin/nyin; for (j=0;j<nyin;j++) { for (i=0;i<nxinp;i++) { fkernel[j*nxinp+i]=c*helpc[j*nxinp+i]; } } return 0; } int init_fresize_sample( struct fresize_struct *pars, int nsub ) { pars->method=fresize_sample; pars->nsub=nsub; return 0; } int init_fresize_bin( struct fresize_struct *pars, int nsub ) { pars->method=fresize_bin; pars->nsub=nsub; return 0; } int init_fresize_boxcar( struct fresize_struct *pars, int hwidth, // Half width of boxcar. Full is 2*hwidth+1. int nsub // Distance between sampled points ) { const int malign=32; int fwidth; int i /*,j*/; pars->method=fresize_1d; pars->nsub=nsub; pars->hwidth=hwidth; fwidth=2*hwidth+1; pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign)); pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign)); for (i=0;i<fwidth;i++) { pars->kerx[i]=1.0f/fwidth; pars->kery[i]=1.0f/fwidth; } return 0; } int init_fresize_boxcar_fft( struct fresize_struct *pars, int hwidth, // Half width of boxcar. Full is 2*hwidth+1. int nsub, // Distance between sampled points int nxin, // Array size int nyin // Array size ) { int status; status=init_fresize_boxcar(pars,hwidth,nsub); if (status != 0) return status; status=make_fft1d(pars,nxin,nyin); return status; } int init_fresize_gaussian( struct fresize_struct *pars, float sigma, // Shape is exp(-(d/sigma)^2/2) int hwidth, // Half (truncation) width of kernel. Full is 2*hwidth+1. int nsub // Distance between sampled points ) { const int malign=32; int fwidth; int i /*,j*/; float sum,x,y; pars->method=fresize_1d; pars->nsub=nsub; pars->hwidth=hwidth; fwidth=2*hwidth+1; pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign)); pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign)); sum=0.0f; for (i=0;i<fwidth;i++) { x=(i-hwidth)/sigma; y=(float)exp(-x*x/2); /* There is expf (a float version), but not sure what impact this has. */ sum=sum+y; } for (i=0;i<fwidth;i++) { x=(i-hwidth)/sigma; y=(float)exp(-x*x/2); pars->kerx[i]=y/sum; pars->kery[i]=y/sum; } return 0; } int init_fresize_gaussian_fft( // Simple square truncated Gaussian. FFT version. struct fresize_struct *pars, float sigma, // Shape is exp(-(d/sigma)^2/2) int hwidth, // Half (truncation) width of kernel. Full is 2*hwidth+1. int nsub, // Distance between sampled points int nxin, // Array size int nyin // Array size ) { int status; status=init_fresize_gaussian(pars,sigma,hwidth,nsub); if (status != 0) return status; status=make_fft1d(pars,nxin,nyin); return status; } int init_fresize_sinc( // Sinc filter struct fresize_struct *pars, float wsinc, /* Shape is sinc(d/wsinc)*ap(d) wsinc is the amount by which the Nyquist is reduced. May want wsinc=nsub. */ int hwidth, // Half width of kernel. Full is 2*hwidth+1. int iap, /* Apodization method. Always ap=0 for d>nap*wsinc. iap=0 means no apodization ap=1 iap=1 uses parabola ap=1-(d/(nap*wsinc))^2 iap=2 uses sinc ap=sinc(d/(nap*wsinc)) */ int nap, /* Sinc apodization width in units of wsinc. Normally hwidth=nap*wsinc, but hwidth=nap*wsinc-1 works for integer */ int nsub // Distance between sampled points ) { const int malign=32; int fwidth; int i /*,j*/; float sum,x,y; pars->method=fresize_1d; pars->nsub=nsub; pars->hwidth=hwidth; fwidth=2*hwidth+1; pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign)); pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign)); sum=0.0f; for (i=0;i<fwidth;i++) { x=(float)(i-hwidth); if (abs(x) > (nap*wsinc)) { y=0.0f; } else { y=(float)sinc(x/wsinc); } if (iap == 1) y=y*(1-(x/(nap*wsinc))*(x/(nap*wsinc))); if (iap == 2) y=(float)(y*sinc(x/(nap*wsinc))); pars->kerx[i]=y; sum=sum+y; } for (i=0;i<fwidth;i++) { pars->kerx[i]=pars->kerx[i]/sum; pars->kery[i]=pars->kerx[i]; } return 0; } int init_fresize_sinc_fft( // Sinc filter. FFT version. struct fresize_struct *pars, float wsinc, /* Shape is sinc(d/wsinc)*ap(d) wsinc is the amount by which the Nyquist is reduced. May want wsinc=nsub. */ int hwidth, // Half width of kernel. Full is 2*hwidth+1. int iap, /* Apodization method. Always ap=0 for d>nap*wsinc. iap=0 means no apodization ap=1 iap=1 uses parabola ap=1-(d/(nap*wsinc))^2 iap=2 uses sinc ap=sinc(d/(nap*wsinc)) all other cases give ap=1 (not guaranteed) */ int nap, /* Sinc apodization width in units of wsinc. Normally hwidth=nap*wsinc, but hwidth=nap*wsinc-1 works for integer */ int nsub, // Distance between sampled points int nxin, // Array size int nyin // Array size ) { int status; status=init_fresize_sinc(pars,wsinc,hwidth,iap,nap,nsub); if (status != 0) return status; status=make_fft1d(pars,nxin,nyin); return status; } int init_fresize_gaussian2( // Circularly truncated Gaussian struct fresize_struct *pars, float sigma, // Shape is exp(-(d/sigma)^2/2) float rmax, // Truncation radius. Probably rmax<=hwidth. int hwidth, // Half (truncation) width of kernel. Full is 2*hwidth+1. int nsub // Distance between sampled points ) { const int malign=32; int fwidth; int i,j; float sum,xi,xj,y,r2,rmax2; pars->method=fresize_2d; pars->nsub=nsub; pars->hwidth=hwidth; fwidth=2*hwidth+1; pars->ker=(float *)(MKL_malloc(fwidth*fwidth*sizeof(float),malign)); rmax2=rmax*rmax/sigma/sigma; sum=0.0f; for (j=0;j<fwidth;j++) { xj=(j-hwidth)/sigma; for (i=0;i<fwidth;i++) { xi=(i-hwidth)/sigma; r2=xi*xi+xj*xj; if (r2 <= rmax2 ) { y=(float)exp(-r2/2); } else { y=0.0f; } pars->ker[fwidth*j+i]=y; sum=sum+y; } } // Normalize kernel for (j=0;j<fwidth;j++) { for (i=0;i<fwidth;i++) { pars->ker[fwidth*j+i]=pars->ker[fwidth*j+i]/sum; } } return 0; } int init_fresize_gaussian2_fft( // Circularly truncated Gaussian. FFT version. struct fresize_struct *pars, float sigma, // Shape is exp(-(d/sigma)^2/2) float rmax, // Truncation radius. Probably rmax<=hwidth. int hwidth, // Half (truncation) width of kernel. Full is 2*hwidth+1. int nsub, // Distance between sampled points int nxin, // Input size int nyin // Input size ) { int status; status=init_fresize_gaussian2(pars,sigma,rmax,hwidth,nsub); if (status != 0) return status; status=make_fft2d(pars,nxin,nyin); return status; } int init_fresize_airy( // 2D Airy filter struct fresize_struct *pars, float cdown, /* cdown is the amount by which the Nyquist is reduced. */ int hwidth, /* Half width of kernel. Full is 2*hwidth+1. Set to <0 to make routine set appropriate value */ int iap, /* Apodization method. Always ap=0 for d>Z_nap, where Z_nap os the position of the nap'th zero. iap=0 means no apodization ap=1 iap=1 uses parabola ap=1-(d/Z_nap)^2 iap=2 uses sinc ap=sinc(d/(Z_nap)) iap=3 uses Airy with first zero at Z_nap all other cases give ap=1 (not guaranteed) */ int nap, /* Apodizes to nap'th zero */ int nsub // Distance between sampled points ) { const int malign=32; const float beszeros[20]={0.0000000,3.8317060,7.0155867,10.173468,13.323692,16.470630,19.615859,22.760084,25.903672,29.046829,32.189680,35.332308,38.474766,41.617094,44.759319,47.901461,51.043535,54.185554,57.327525,60.469458}; // first 20 zeros in Bessel function float cb; int fwidth; int i,j; float sum,xi,xj; float r2,r,rc,airy,ra; float nzero; cb=(float)(cdown/M_PI); // Amount to scale Bessel function to get zeros right. nzero=beszeros[nap]*cb; // Position of nap'th zero if (hwidth < 0) hwidth=(float)floor(nzero); pars->method=fresize_2d; pars->nsub=nsub; pars->hwidth=hwidth; fwidth=2*hwidth+1; pars->ker=(float *)(MKL_malloc(fwidth*fwidth*sizeof(float),malign)); sum=0.0f; for (j=0;j<fwidth;j++) { xj=(float)(j-hwidth); for (i=0;i<fwidth;i++) { xi=(float)(i-hwidth); r2=xi*xi+xj*xj; r=(float)maxval(sqrt(r2),1e-20); rc=r/cb; if (r <= nzero ) { airy=j1f(rc)/rc; } else { airy=0.0f; } ra=r/nzero; if (iap == 1) airy=airy*(1-ra*ra); if (iap == 2) airy=(float)(airy*sinc(ra)); if (iap == 3) { ra=rc*beszeros[1]/beszeros[nap]; airy=airy*j1f(ra)/ra; } pars->ker[fwidth*j+i]=airy; sum=sum+airy; } } // Normalize kernel for (j=0;j<fwidth;j++) { for (i=0;i<fwidth;i++) { pars->ker[fwidth*j+i]=pars->ker[fwidth*j+i]/sum; } } return 0; } int init_fresize_airy_fft( // 2D Airy filter. FFT version. struct fresize_struct *pars, float cdown, /* cdown is the amount by which the Nyquist is reduced. */ int hwidth, /* Half width of kernel. Full is 2*hwidth+1. Set to <0 to make routine set appropriate value */ int iap, /* Apodization method. Always ap=0 for d>Z_nap, where Z_nap os the position of the nap'th zero. iap=0 means no apodization ap=1 iap=1 uses parabola ap=1-(d/Z_nap)^2 iap=2 uses sinc ap=sinc(d/(Z_nap)) iap=3 uses Airy with first zero at Z_nap all other cases give ap=1 (not guaranteed) */ int nap, /* Apodizes to nap'th zero */ int nsub, // Distance between sampled points int nxin, // Input size int nyin // Input size ) { int status; status=init_fresize_airy(pars,cdown,hwidth,iap,nap,nsub); if (status != 0) return status; status=make_fft2d(pars,nxin,nyin); return status; } int free_fresize( struct fresize_struct *pars ) { if (pars->method==fresize_1d) { MKL_free (pars->kerx); MKL_free (pars->kery); } if (pars->method==fresize_2d) { MKL_free (pars->ker); } if (pars->method==fresize_1d_fft) { MKL_free (pars->kerx); MKL_free (pars->kery); fftwf_free(pars->helpin); fftwf_free(pars->fkernel); fftwf_free(pars->fkernely); fftwf_free(pars->helpc); fftwf_free(pars->helpout); fftwf_destroy_plan(pars->plan1); fftwf_destroy_plan(pars->plan2); fftwf_destroy_plan(pars->plan1y); fftwf_destroy_plan(pars->plan2y); } if (pars->method==fresize_2d_fft) { MKL_free (pars->ker); fftwf_free(pars->helpin); fftwf_free(pars->fkernel); fftwf_free(pars->helpc); fftwf_free(pars->helpout); fftwf_destroy_plan(pars->plan1); fftwf_destroy_plan(pars->plan2); } return 0; } int fsample( float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int nsub, int xoff, int yoff, float fillval ) { int i,j; int imin,imax,jmin,jmax; float *imp; // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image if (xoff >= 0) imin=0; else imin=((-xoff+nsub-1)/nsub); if (xoff <= 0) imax=nxout-1; else imax=minval(((nxin-xoff-1)/nsub),nxout-1); if (yoff >= 0) jmin=0; else jmin=((-yoff+nsub-1)/nsub); if (yoff <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoff-1)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); imp=image_in+yoff*nleadin+xoff; #pragma omp parallel default(none) \ private(i,j) \ shared(image_in,image_out,imp,fillval) \ shared(imin,imax,jmin,jmax) \ shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub) { // Needed to define parallel region // Fill below valid region #pragma omp for for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights #pragma omp for for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Valid region for (i=imin; i<=imax; i++) { // image_out[j*nleadout+i]=image_in[j*nsub*nleadin+i*nsub]; image_out[j*nleadout+i]=imp[j*nsub*nleadin+i*nsub]; } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region #pragma omp for for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= } // End of parallel region return 0; } int fbin( float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int nsub, int xoff, int yoff, float fillval ) { int i,j,i1,j1; int imin,imax,jmin,jmax; float *imp,*impi; float sum; // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image if (xoff >= 0) imin=0; else imin=((-xoff+nsub-1)/nsub); if (xoff <= 0) imax=nxout-1; else imax=minval(((nxin-xoff-nsub)/nsub),nxout-1); if (yoff >= 0) jmin=0; else jmin=((-yoff+nsub-1)/nsub); if (yoff <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoff-nsub)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); imp=image_in+yoff*nleadin+xoff; #pragma omp parallel default(none) \ private(i,j,i1,j1,impi,sum) \ shared(image_in,image_out,imp,fillval) \ shared(imin,imax,jmin,jmax) \ shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub) { // Needed to define parallel region // Fill below valid region #pragma omp for for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights #pragma omp for for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Valid region for (i=imin; i<=imax; i++) { impi=imp+j*nleadin*nsub+i*nsub; sum=0.0f; for (j1=0; j1<nsub; j1++) { for (i1=0; i1<nsub; i1++) { sum=sum+impi[i1]; } impi=impi+nleadin; } image_out[j*nleadout+i]=sum/(nsub*nsub); } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region #pragma omp for for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= } // End of parallel region return 0; } int f1d( struct fresize_struct *pars, float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int xoff, int yoff, float fillval ) { const int malign=32; /*char transpose[] = "t"; */ char normal[] = "n"; const int ione = 1; const float fone = 1.0f; const float fzero = 0.0f; int i,j,i1 /*,j1*/; int imin,imax,jmin,jmax; float /**inp*,*/ *inpi,*inpj; float sum; int nsub,hwidth; float *kerx,*kery,*work; int xoffl,xoffh,yoffl,yoffh; int nwork; /*double t1,t2,t3;*/ int n1,n2 /*,nchunk*/; nsub=pars->nsub; hwidth=pars->hwidth; kerx=pars->kerx; kery=pars->kery; // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image xoffl=xoff-hwidth; xoffh=xoff+hwidth; if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub); if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1); yoffl=yoff-hwidth; yoffh=yoff+hwidth; if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub); if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); // Get work array big enough to hold convolution in x nwork=nxout*nyin; work=(float *)(MKL_malloc(nwork*sizeof(float),malign)); /*nchunk=64;*/ n1=(imax-imin+1); // Size of matrix for sgemv n2=2*hwidth+1; // Size of matrix for sgemv /*t1=dsecnd();*/ #pragma omp parallel default(none) \ private(i,j,i1,/*j1,*/inpi,inpj,sum) \ shared(image_in,image_out,fillval) \ shared(imin,imax,jmin,jmax,hwidth,kerx,kery,work,xoffl,yoffl) \ shared(normal,/*transpose,*/n1,n2/*,nchunk*/) \ shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub) { // Needed to define parallel region // First convolve in x // Brute force loop takes longer than it ought to, but have not // found an obvious solution. #pragma omp for for (j=0;j<nyin;j++) { inpj=image_in+j*nleadin+xoffl; // Note offset to match kernels. for (i=imin; i<=imax; i++) { sum=0.0f; inpi=inpj+i*nsub; for (i1=0; i1<=2*hwidth; i1++) { sum=sum+kerx[i1]*inpi[i1]; } work[j*nleadout+i]=sum; } } // Then convolve in y // Code using sgemv if (n1 > 0) { #pragma omp for for (j=jmin; j<=jmax; j++) { inpj=work+(yoffl+j*nsub)*nleadout; sgemv(normal,&n1,&n2,&fone,inpj+imin,&nleadout,kery,&ione,&fzero,image_out+j*nleadout+imin,&ione); } } // Fill below valid region #pragma omp for for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights #pragma omp for for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region #pragma omp for for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= } // End of parallel region MKL_free(work); return 0; } int f1d_fft( struct fresize_struct *pars, float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int xoff, int yoff, float fillval ) { const int malign=32; int i,j,i1 /*,j1*/; int imin,imax,jmin,jmax; float /**inp,*inpi,*/ *inpj; /*float sum;*/ int nsub,hwidth; float /**kerx,*kery,*/ *work; int xoffl,xoffh,yoffl,yoffh; int nwork; double t1, /*t2,t3,*/ t4; int /*n1,n2,*/ nchunk; fftwf_complex *helpc,*fkernel,*fkernely; float *helpin,*helpout; int nxinp,nyinp; float *iwork,*owork; if (pars->nxin != nxin) return 1; if (pars->nyin != nyin) return 1; nsub=pars->nsub; hwidth=pars->hwidth; /*kerx=pars->kerx;*/ /*kery=pars->kery;*/ helpc=pars->helpc; fkernel=pars->fkernel; fkernely=pars->fkernely; helpin=pars->helpin; helpout=pars->helpout; nxinp=pars->nxinp; nyinp=pars->nyinp; // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image xoffl=xoff-hwidth; xoffh=xoff+hwidth; if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub); if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1); yoffl=yoff-hwidth; yoffh=yoff+hwidth; if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub); if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); // Get work array big enough to hold convolution in x nwork=nxout*nyin; work=(float *)(MKL_malloc(nwork*sizeof(float),malign)); nchunk=64; iwork=(float *)(MKL_malloc(nchunk*nyin*sizeof(float),malign)); owork=(float *)(MKL_malloc(nchunk*nyout*sizeof(float),malign)); // No OpenMP yet, but have left statements in for good measure. //#pragma omp parallel default(none) \ //private(i,j,i1,j1,inpi,inpj,sum) \ //shared(image_in,image_out,fillval) \ //shared(imin,imax,jmin,jmax,hwidth,kerx,kery,work,xoffl,yoffl) \ //shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub) { // Needed to define parallel region t1=dsecnd(); // First convolve in x //#pragma omp for for (j=0;j<nyin;j++) { inpj=image_in+j*nleadin; for (i=0;i<nxin;i++) helpin[i]=inpj[i]; fftwf_execute(pars->plan1); for (i=0; i<nxinp; i++) helpc[i]=fkernel[i]*helpc[i]; fftwf_execute(pars->plan2); for (i=imin; i<=imax; i++) work[j*nleadout+i]=helpout[i*nsub+xoff]; } t1=dsecnd()-t1; // Then convolve in y // Block both the input output arrays. t4=dsecnd(); //#pragma omp for for (i1=imin;i1<=imax;i1=i1+nchunk) { for (j=0;j<nyin;j++) { for (i=i1; i<=minval(i1+nchunk-1,imax); i++) { iwork[(i-i1)*nyin+j]=work[j*nleadout+i]; } } for (i=i1; i<=minval(i1+nchunk-1,imax); i++) { for (j=0;j<nyin;j++) helpin[j]=iwork[(i-i1)*nyin+j]; fftwf_execute(pars->plan1y); for (j=0; j<nyinp; j++) helpc[j]=fkernely[j]*helpc[j]; fftwf_execute(pars->plan2y); for (j=jmin; j<=jmax; j++) owork[j+(i-i1)*nyout]=helpout[j*nsub+yoff]; } for (j=jmin; j<=jmax; j++) { for (i=i1; i<=minval(i1+nchunk-1,imax); i++) { image_out[j*nleadout+i]=owork[j+(i-i1)*nyout]; } } } t4=dsecnd()-t4; // Fill below valid region //#pragma omp for for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights //#pragma omp for for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region //#pragma omp for for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= } // End of parallel region MKL_free(iwork); MKL_free(owork); MKL_free(work); return 0; } int f2d( struct fresize_struct *pars, float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int xoff, int yoff, float fillval ) { /*const int malign=32;*/ /*char transpose[] = "t";*/ /*char normal[] = "n";*/ /*const int ione = 1;*/ /*const float fone = 1.0f;*/ /*const float fzero = 0.0f;*/ int i,j,i1,j1; int imin,imax,jmin,jmax; float *inpi,*kerp; float sum; int nsub,hwidth,fwidth; float *ker; int xoffl,xoffh,yoffl,yoffh; /*double t1,t2,t3;*/ nsub=pars->nsub; hwidth=pars->hwidth; ker=pars->ker; fwidth=2*hwidth+1; // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image xoffl=xoff-hwidth; xoffh=xoff+hwidth; if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub); if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1); yoffl=yoff-hwidth; yoffh=yoff+hwidth; if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub); if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); /*t1=dsecnd();*/ #pragma omp parallel default(none) \ private(i,j,i1,j1,inpi,kerp,sum) \ shared(image_in,image_out,fillval) \ shared(imin,imax,jmin,jmax,hwidth,fwidth,ker,xoffl,yoffl) \ shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub) { // Needed to define parallel region #pragma omp for for (j=jmin; j<=jmax; j++) { for (i=imin; i<=imax; i++) { inpi=image_in+(j*nsub+yoffl)*nleadin+i*nsub+xoffl; // Note offset to match kernels. sum=0.0f; for (j1=0; j1<=2*hwidth; j1++) { kerp=ker+j1*fwidth; for (i1=0; i1<=2*hwidth; i1++) { sum=sum+kerp[i1]*inpi[i1]; } inpi=inpi+nleadin; } image_out[j*nleadout+i]=sum; } } // Fill below valid region #pragma omp for for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights #pragma omp for for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region #pragma omp for for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= } // End of parallel region return 0; } // this version uses matrix vector multiplys. Not much faster. int f2d_mat( struct fresize_struct *pars, float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int xoff, int yoff, float fillval ) { const int malign=32; char transpose[] = "t"; /*char normal[] = "n";*/ const int ione = 1; const float fone = 1.0f; /*const float fzero = 0.0f;*/ int i,j,/*i1,*/j1; int imin,imax,jmin,jmax; float *inpi,*kerp; /*float sum;*/ int nsub,hwidth,fwidth; float *ker; int xoffl,xoffh,yoffl,yoffh; /*double t1,t2,t3;*/ int /*n1,*/n2,nchunk,jc,nc; float *work; nsub=pars->nsub; hwidth=pars->hwidth; ker=pars->ker; fwidth=2*hwidth+1; // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image xoffl=xoff-hwidth; xoffh=xoff+hwidth; if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub); if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1); yoffl=yoff-hwidth; yoffh=yoff+hwidth; if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub); if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); /*t1=dsecnd();*/ n2=nleadin*nsub; nchunk=64; #pragma omp parallel default(none) \ private(i,j,/*i1,*/j1,inpi,kerp,/*sum,*/work,nc) \ shared(/*normal,*/transpose,/*n1,*/n2,nchunk) \ shared(image_in,image_out,fillval) \ shared(imin,imax,jmin,jmax,hwidth,fwidth,ker,xoffl,yoffl) \ shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub) { // Needed to define parallel region work=(float *)(MKL_malloc(nyout*sizeof(float),malign)); #pragma omp for for (j=jmin; j<=jmax; j++) { for (i=imin; i<=imax; i++) { image_out[j*nleadout+i]=0.0f; } } #pragma omp for for (jc=jmin;jc<=jmax;jc=jc+nchunk) { nc=minval(jc+nchunk-1,jmax)-jc+1; for (j1=0; j1<=2*hwidth; j1++) { kerp=ker+j1*fwidth; for (i=imin; i<=imax; i++) { inpi=image_in+(jc*nsub+yoffl+j1)*nleadin+i*nsub+xoffl; sgemv(transpose,&fwidth,&nc,&fone,inpi,&n2,kerp,&ione,&fone,image_out+jc*nleadout+i,&nleadout); } } } // Fill below valid region #pragma omp for for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights #pragma omp for for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region #pragma omp for for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= MKL_free(work); } // End of parallel region return 0; } int f2d_fft( struct fresize_struct *pars, float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nxout, int nyout, int nleadout, int xoff, int yoff, float fillval ) { fftwf_complex *helpc,*fkernel; float *helpin,*helpout; fftwf_plan plan1,plan2; int nxinp; // Complex array size int hwidth/*,fwidth*/; int i,j,/*i1,*/j1; /*float c;*/ int imin,imax,jmin,jmax; int xoffl,xoffh,yoffl,yoffh; int nsub; if (pars->nxin != nxin) return 1; if (pars->nyin != nyin) return 1; nsub=pars->nsub; hwidth=pars->hwidth; /*fwidth=2*hwidth+1;*/ nxinp=(nxin/2+1); helpin=pars->helpin; fkernel=pars->fkernel; helpc=pars->helpc; helpout=pars->helpout; plan1=pars->plan1; plan2=pars->plan2; /* Now copy and transform input image */ for (j=0;j<nyin;j++) { for (i=0;i<nxin;i++) { helpin[j*nxin+i]=image_in[j*nleadin+i]; } } fftwf_execute(plan1); /* Multiply by kernel transform */ for (j=0;j<nyin;j++) { for (i=0;i<nxinp;i++) { helpc[j*nxinp+i]=fkernel[j*nxinp+i]*helpc[j*nxinp+i]; } } fftwf_execute(plan2); // Copy data to output array // Find first ([ij]min) and last ([ij]max) indices in output image // for which resulting input point is within image xoffl=xoff-hwidth; xoffh=xoff+hwidth; if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub); if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1); yoffl=yoff-hwidth; yoffh=yoff+hwidth; if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub); if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1); // Also ensure that indices are within output array imin=maxval(0,minval(imin,nxout-1)); imax=maxval(0,minval(imax,nxout-1)); jmin=maxval(0,minval(jmin,nyout-1)); jmax=maxval(0,minval(jmax,nyout-1)); for (j=jmin; j<=jmax; j++) { j1=(j*nsub+yoff)*nxin+xoff; for (i=imin; i<=imax; i++) { image_out[j*nleadout+i]=helpout[j1+i*nsub]; } } // Fill below valid region for (j=0; j<jmin; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Valid heights for (j=jmin; j<=jmax; j++) { // Fill to the left for (i=0; i<imin; i++) { image_out[j*nleadout+i]=fillval; } // i= // Fill to the right for (i=imax+1; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= // Fill above valid region for (j=jmax+1; j<nyout; j++) { for (i=0; i<nxout; i++) { image_out[j*nleadout+i]=fillval; } // i= } //j= return 0; } int fresize( struct fresize_struct *pars, float *image_in, float *image_out, int nxin, int nyin, int nleadin, int nx, int ny, int nlead, int xoff, int yoff, float fillval ) { int status; switch (pars->method) { case fresize_sample: status=fsample(image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,pars->nsub,xoff,yoff,fillval); break; case fresize_bin: status=fbin(image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,pars->nsub,xoff,yoff,fillval); break; case fresize_1d: status=f1d(pars,image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,xoff,yoff,fillval); break; case fresize_1d_fft: status=f1d_fft(pars,image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,xoff,yoff,fillval); break; case fresize_2d: status=f2d_mat(pars,image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,xoff,yoff,fillval); break; case fresize_2d_fft: status=f2d_fft(pars,image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,xoff,yoff,fillval); break; default: status=1; break; } return status; } int init_fresize_user( struct fresize_struct *pars, int hwidth, // Half width of kernel. Full is 2*hwidth+1. int nsub, // Distance between sampled points float *user_ker // User specified kernel to convolve with. // Must be of size (2*hwidth+1) x (2*hwidth+1). // Kernel need not be and will not be normalized. ) { const int malign=32; int fwidth; int i,j; pars->method=fresize_2d; pars->nsub=nsub; pars->hwidth=hwidth; fwidth=2*hwidth+1; pars->ker=(float *)(MKL_malloc(fwidth*fwidth*sizeof(float),malign)); memcpy(pars->ker,user_ker,sizeof(float)*fwidth*fwidth); return 0; }
Karen Tian |
Powered by ViewCVS 0.9.4 |