1 xudong 1.1 #include <stdio.h>
2
3 #include <stdlib.h>
4
5 #include <time.h>
6
7 #include <string.h>
8
9 #include <math.h>
10
11 #include <mkl_blas.h>
12
13 #include <mkl_service.h>
14
15 #include <mkl_lapack.h>
16
17 #include <mkl_vml_functions.h>
18
19 #include <omp.h>
20
21 #include "fresize.h"
22 xudong 1.1
23 #define minval(x,y) (((x) < (y)) ? (x) : (y))
24
25 #define maxval(x,y) (((x) < (y)) ? (y) : (x))
26
27 #define fresize_sample 1
28
29 #define fresize_bin 2
30
31 #define fresize_1d 3
32
33 #define fresize_2d 4
34
35
36
37 double sinc(double x)
38
39 {
40
41 if (fabs(x) < (1.e-10))
42
43 xudong 1.1 return 1.;
44
45 else
46
47 return sin(M_PI*x)/(M_PI*x);
48
49 }
50
51
52
53 int init_fresize_sample(
54
55 struct fresize_struct *pars,
56
57 int nsub
58
59 )
60
61 {
62
63
64 xudong 1.1
65 pars->method=fresize_sample;
66
67 pars->nsub=nsub;
68
69
70
71 return 0;
72
73 }
74
75
76
77 int init_fresize_bin(
78
79 struct fresize_struct *pars,
80
81 int nsub
82
83 )
84
85 xudong 1.1 {
86
87
88
89 pars->method=fresize_bin;
90
91 pars->nsub=nsub;
92
93
94
95 return 0;
96
97 }
98
99
100
101 int init_fresize_boxcar(
102
103 struct fresize_struct *pars,
104
105 int hwidth, // Half width of boxcar. Full is 2*hwidth+1.
106 xudong 1.1
107 int nsub // Distance between sampled points
108
109 )
110
111 {
112
113 const int malign=32;
114
115 int fwidth;
116
117 int i,j;
118
119
120
121 pars->method=fresize_1d;
122
123 pars->nsub=nsub;
124
125 pars->hwidth=hwidth;
126
127 xudong 1.1 fwidth=2*hwidth+1;
128
129 pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign));
130
131 pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign));
132
133 for (i=0;i<fwidth;i++) {
134
135 pars->kerx[i]=1.0f/fwidth;
136
137 pars->kery[i]=1.0f/fwidth;
138
139 }
140
141
142
143 return 0;
144
145 }
146
147
148 xudong 1.1
149 int init_fresize_gaussian(
150
151 struct fresize_struct *pars,
152
153 float sigma, // Shape is exp(-(d/sigma)^2/2)
154
155 int hwidth, // Half (truncation) width of kernel. Full is 2*hwidth+1.
156
157 int nsub // Distance between sampled points
158
159 )
160
161 {
162
163 const int malign=32;
164
165 int fwidth;
166
167 int i,j;
168
169 xudong 1.1 float sum,x,y;
170
171
172
173 pars->method=fresize_1d;
174
175 pars->nsub=nsub;
176
177 pars->hwidth=hwidth;
178
179 fwidth=2*hwidth+1;
180
181 pars->kerx=(float *)(MKL_malloc(fwidth*sizeof(float),malign));
182
183 pars->kery=(float *)(MKL_malloc(fwidth*sizeof(float),malign));
184
185 sum=0.0f;
186
187 for (i=0;i<fwidth;i++) {
188
189 x=(i-hwidth)/sigma;
190 xudong 1.1
191 y=exp(-x*x/2);
192
193 sum=sum+y;
194
195 }
196
197 for (i=0;i<fwidth;i++) {
198
199 x=(i-hwidth)/sigma;
200
201 y=exp(-x*x/2);
202
203 pars->kerx[i]=y/sum;
204
205 pars->kery[i]=y/sum;
206
207 }
208
209
210
211 xudong 1.1 return 0;
212
213 }
214
215
216
217 int free_fresize(
218
219 struct fresize_struct *pars
220
221 )
222
223 {
224
225 if (pars->method==fresize_1d) {
226
227 MKL_free (pars->kerx);
228
229 MKL_free (pars->kery);
230
231 }
232 xudong 1.1
233
234
235 return 0;
236
237 }
238
239
240
241 int fsample(
242
243 float *image_in,
244
245 float *image_out,
246
247 int nxin,
248
249 int nyin,
250
251 int nleadin,
252
253 xudong 1.1 int nxout,
254
255 int nyout,
256
257 int nleadout,
258
259 int nsub,
260
261 int xoff,
262
263 int yoff,
264
265 float fillval
266
267 )
268
269
270
271 {
272
273 int i,j;
274 xudong 1.1
275 int imin,imax,jmin,jmax;
276
277 float *imp;
278
279
280
281 // Find first and last indices for which resulting input point is within image
282
283 if (xoff >= 0) imin=0; else imin=((-xoff+nsub-1)/nsub);
284
285 if (xoff <= 0) imax=nxout-1; else imax=minval(((nxin-xoff-1)/nsub),nxout-1);
286
287 if (yoff >= 0) jmin=0; else jmin=((-yoff+nsub-1)/nsub);
288
289 if (yoff <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoff-1)/nsub),nyout-1);
290
291
292
293 imp=image_in+yoff*nleadin+xoff;
294
295 xudong 1.1
296
297 #pragma omp parallel default(none) private(i,j) shared(image_in,image_out,imp,fillval) shared(imin,imax,jmin,jmax) shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub)
298
299 { // Needed to define parallel region
300
301
302
303 // Fill below valid region
304
305 #pragma omp for
306
307 for (j=0; j<jmin; j++) {
308
309 for (i=0; i<nxout; i++) {
310
311 image_out[j*nleadout+i]=fillval;
312
313 } // i=
314
315 } //j=
316 xudong 1.1
317
318
319 // Valid heights
320
321 #pragma omp for
322
323 for (j=jmin; j<=jmax; j++) {
324
325 // Fill to the left
326
327 for (i=0; i<imin; i++) {
328
329 image_out[j*nleadout+i]=fillval;
330
331 } // i=
332
333 // Valid region
334
335 for (i=imin; i<=imax; i++) {
336
337 xudong 1.1 // image_out[j*nleadout+i]=image_in[j*nsub*nleadin+i*nsub];
338
339 image_out[j*nleadout+i]=imp[j*nsub*nleadin+i*nsub];
340
341 } // i=
342
343 // Fill to the right
344
345 for (i=imax+1; i<nxout; i++) {
346
347 image_out[j*nleadout+i]=fillval;
348
349 } // i=
350
351 } //j=
352
353
354
355 // Fill above valid region
356
357 #pragma omp for
358 xudong 1.1
359 for (j=jmax+1; j<nyout; j++) {
360
361 for (i=0; i<nxout; i++) {
362
363 image_out[j*nleadout+i]=fillval;
364
365 } // i=
366
367 } //j=
368
369
370
371 } // End of parallel region
372
373
374
375 return 0;
376
377 }
378
379 xudong 1.1
380
381 int fbin(
382
383 float *image_in,
384
385 float *image_out,
386
387 int nxin,
388
389 int nyin,
390
391 int nleadin,
392
393 int nxout,
394
395 int nyout,
396
397 int nleadout,
398
399 int nsub,
400 xudong 1.1
401 int xoff,
402
403 int yoff,
404
405 float fillval
406
407 )
408
409
410
411 {
412
413 int i,j,i1,j1;
414
415 int imin,imax,jmin,jmax;
416
417 float *imp,*impi;
418
419 float sum;
420
421 xudong 1.1
422
423 // Find first and last indices for which resulting input point is within image
424
425 if (xoff >= 0) imin=0; else imin=((-xoff+nsub-1)/nsub);
426
427 if (xoff <= 0) imax=nxout-1; else imax=minval(((nxin-xoff-nsub)/nsub),nxout-1);
428
429 if (yoff >= 0) jmin=0; else jmin=((-yoff+nsub-1)/nsub);
430
431 if (yoff <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoff-nsub)/nsub),nyout-1);
432
433
434
435 imp=image_in+yoff*nleadin+xoff;
436
437
438
439 #pragma omp parallel default(none) private(i,j,i1,j1,impi,sum) shared(image_in,image_out,imp,fillval) shared(imin,imax,jmin,jmax) shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub)
440
441 { // Needed to define parallel region
442 xudong 1.1
443
444
445 // Fill below valid region
446
447 #pragma omp for
448
449 for (j=0; j<jmin; j++) {
450
451 for (i=0; i<nxout; i++) {
452
453 image_out[j*nleadout+i]=fillval;
454
455 } // i=
456
457 } //j=
458
459
460
461 // Valid heights
462
463 xudong 1.1 #pragma omp for
464
465 for (j=jmin; j<=jmax; j++) {
466
467 // Fill to the left
468
469 for (i=0; i<imin; i++) {
470
471 image_out[j*nleadout+i]=fillval;
472
473 } // i=
474
475 // Valid region
476
477 for (i=imin; i<=imax; i++) {
478
479 impi=imp+j*nleadin*nsub+i*nsub;
480
481 sum=0.0f;
482
483 for (j1=0; j1<nsub; j1++) {
484 xudong 1.1
485 for (i1=0; i1<nsub; i1++) {
486
487 sum=sum+impi[i1];
488
489 }
490
491 impi=impi+nleadin;
492
493 }
494
495 image_out[j*nleadout+i]=sum/(nsub*nsub);
496
497 } // i=
498
499 // Fill to the right
500
501 for (i=imax+1; i<nxout; i++) {
502
503 image_out[j*nleadout+i]=fillval;
504
505 xudong 1.1 } // i=
506
507 } //j=
508
509
510
511 // Fill above valid region
512
513 #pragma omp for
514
515 for (j=jmax+1; j<nyout; j++) {
516
517 for (i=0; i<nxout; i++) {
518
519 image_out[j*nleadout+i]=fillval;
520
521 } // i=
522
523 } //j=
524
525
526 xudong 1.1
527 } // End of parallel region
528
529
530
531 return 0;
532
533 }
534
535
536
537 int f1d(
538
539 struct fresize_struct *pars,
540
541 float *image_in,
542
543 float *image_out,
544
545 int nxin,
546
547 xudong 1.1 int nyin,
548
549 int nleadin,
550
551 int nxout,
552
553 int nyout,
554
555 int nleadout,
556
557 int xoff,
558
559 int yoff,
560
561 float fillval
562
563 )
564
565
566
567 {
568 xudong 1.1
569 const int malign=32;
570
571 char transpose[] = "t";
572
573 char normal[] = "n";
574
575 const int ione = 1;
576
577 const float fone = 1.0f;
578
579 const float fzero = 0.0f;
580
581 int i,j,i1,j1;
582
583 int imin,imax,jmin,jmax;
584
585 float *inp,*inpi,*inpj;
586
587 float sum;
588
589 xudong 1.1 int nsub,hwidth;
590
591 float *kerx,*kery,*work;
592
593 int xoffl,xoffh,yoffl,yoffh;
594
595 int nwork;
596
597 double t1,t2,t3;
598
599 int n1,n2,nchunk;
600
601
602
603 nsub=pars->nsub;
604
605 hwidth=pars->hwidth;
606
607 kerx=pars->kerx;
608
609 kery=pars->kery;
610 xudong 1.1
611
612
613 // Find first and last indices for which resulting input point is within image
614
615 xoffl=xoff-hwidth;
616
617 xoffh=xoff+hwidth;
618
619 if (xoffl >= 0) imin=0; else imin=((-xoffl+nsub-1)/nsub);
620
621 if (xoffh <= 0) imax=nxout-1; else imax=minval(((nxin-xoffh-1)/nsub),nxout-1);
622
623
624
625 yoffl=yoff-hwidth;
626
627 yoffh=yoff+hwidth;
628
629 if (yoffl >= 0) jmin=0; else jmin=((-yoffl+nsub-1)/nsub);
630
631 xudong 1.1 if (yoffh <= 0) jmax=nyout-1; else jmax=minval(((nyin-yoffh-1)/nsub),nyout-1);
632
633
634
635 // Get work array big enough to hold convolution in x
636
637 nwork=nxout*nyin;
638
639 work=(float *)(MKL_malloc(nwork*sizeof(float),malign));
640
641
642
643 nchunk=64;
644
645 n1=(imax-imin+1); // Size of matrix for sgemv
646
647 n2=2*hwidth+1; // Size of matrix for sgemv
648
649 //t1=dsecnd();
650
651
652 xudong 1.1
653 #pragma omp parallel default(none) private(i,j,i1,j1,inpi,inpj,sum) shared(image_in,image_out,fillval) shared(imin,imax,jmin,jmax,hwidth,kerx,kery,work,xoffl,yoffl) shared(normal,transpose,n1,n2,nchunk) shared(nxin,nyin,nleadin,nxout,nyout,nleadout,nsub)
654
655 { // Needed to define parallel region
656
657
658
659 // First convolve in x
660
661 // Brute force loop takes longer than it ought to, but have not
662
663 // found an obvious solution.
664
665 #pragma omp for
666
667 for (j=0;j<nyin;j++) {
668
669 inpj=image_in+j*nleadin+xoffl; // Note offset to match kernels.
670
671 for (i=imin; i<=imax; i++) {
672
673 xudong 1.1 sum=0.0f;
674
675 inpi=inpj+i*nsub;
676
677 for (i1=0; i1<=2*hwidth; i1++) {
678
679 sum=sum+kerx[i1]*inpi[i1];
680
681 }
682
683 work[j*nleadout+i]=sum;
684
685 // This works but is slower
686
687 // work[j*nleadout+i]=sdot(&n2,kerx,&ione,inpj+i*nsub,&ione);
688
689 }
690
691 // This clever trick gives error message
692
693 //sgemv(transpose,&n2,&n1,&fone,inpj+imin*nsub,&nsub,kerx,&ione,&fzero,work+j*nleadout+imin,&ione);
694 xudong 1.1
695 }
696
697 /*
698
699 // This also works but is slower
700
701 #pragma omp for
702
703 for (j=0;j<nyin;j=j+nchunk) {
704
705 for (i=imin; i<=imax; i++) {
706
707 sgemv(transpose,&n2,&nchunk,&fone,image_in+j*nleadin+xoffl+i*nsub,&nleadin,kerx,&ione,&fzero,work+j*nleadout+i,&nleadout);
708
709 }
710
711 */
712
713
714
715 xudong 1.1 // Then convolve in y
716
717
718
719 #pragma omp for
720
721 for (j=jmin; j<=jmax; j++) {
722
723 inpj=work+(yoffl+j*nsub)*nleadout;
724
725 /* Old brute force code
726
727 for (i=imin; i<=imax; i++) {
728
729 sum=0.0f;
730
731 for (j1=0; j1<=2*hwidth; j1++) {
732
733 sum=sum+inpj[j1*nleadout+i]*kery[j1];
734
735 }
736 xudong 1.1
737 image_out[j*nleadout+i]=sum;
738
739 }
740
741 */
742
743 sgemv(normal,&n1,&n2,&fone,inpj+imin,&nleadout,kery,&ione,&fzero,image_out+j*nleadout+imin,&ione);
744
745 }
746
747
748
749 // Fill below valid region
750
751 #pragma omp for
752
753 for (j=0; j<jmin; j++) {
754
755 for (i=0; i<nxout; i++) {
756
757 xudong 1.1 image_out[j*nleadout+i]=fillval;
758
759 } // i=
760
761 } //j=
762
763
764
765 // Valid heights
766
767 #pragma omp for
768
769 for (j=jmin; j<=jmax; j++) {
770
771 // Fill to the left
772
773 for (i=0; i<imin; i++) {
774
775 image_out[j*nleadout+i]=fillval;
776
777 } // i=
778 xudong 1.1
779 // Fill to the right
780
781 for (i=imax+1; i<nxout; i++) {
782
783 image_out[j*nleadout+i]=fillval;
784
785 } // i=
786
787 } //j=
788
789
790
791 // Fill above valid region
792
793 #pragma omp for
794
795 for (j=jmax+1; j<nyout; j++) {
796
797 for (i=0; i<nxout; i++) {
798
799 xudong 1.1 image_out[j*nleadout+i]=fillval;
800
801 } // i=
802
803 } //j=
804
805
806
807 } // End of parallel region
808
809
810
811 return 0;
812
813 }
814
815
816
817
818
819 int fresize(
820 xudong 1.1
821 struct fresize_struct *pars,
822
823 float *image_in,
824
825 float *image_out,
826
827 int nxin,
828
829 int nyin,
830
831 int nleadin,
832
833 int nx,
834
835 int ny,
836
837 int nlead,
838
839 int xoff,
840
841 xudong 1.1 int yoff,
842
843 float fillval
844
845 )
846
847 {
848
849 int status;
850
851
852
853 switch (pars->method) {
854
855
856
857 case fresize_sample:
858
859 status=fsample(image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,pars->nsub,xoff,yoff,fillval);
860
861 break;
862 xudong 1.1
863
864
865 case fresize_bin:
866
867 status=fbin(image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,pars->nsub,xoff,yoff,fillval);
868
869 break;
870
871
872
873 case fresize_1d:
874
875 status=f1d(pars,image_in,image_out,nxin,nyin,nleadin,nx,ny,nlead,xoff,yoff,fillval);
876
877 break;
878
879
880
881 default:
882
883 xudong 1.1 status=1;
884
885 break;
886
887 }
888
889
890
891 return status;
892
893
894
895 }
896
897
898
|