00001 #include "mex.h" /* must appear first */ 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 #include <math.h> 00005 #include <limits.h> 00006 #include <pthread.h> 00007 #include <sys/time.h> // for timing 00008 #include "mexhead.h" 00009 #include "Doc/smoothsphere_docstring.h" /* autogenerated from this file */ 00010 00011 /************************************************************** 00012 00013 %smoothsphere: smooth a projected sphere with a kernel 00014 % 00015 % [y,s,p]=smoothsphere(x,geom,k,kparam,kwt,bws) 00016 % * Given a solar image of location given by `geom', smooth it 00017 % using the radially-symmetric kernel `k'. 00018 % * This version is threaded, using 8 threads by default. 00019 % Set MXT_NUM_THREADS in the environment to lower this number. 00020 % * Off-disk values are indicated by NaN in the output. 00021 % * The kernel used is dependent only on the weighted distance 00022 % between two positions, say P1 and P2, in three-dimensional 00023 % coordinates, normalized to live on the unit sphere: 00024 % d = P1 - P2 00025 % dist = d' W d (>= 0) 00026 % for a diagonal weight matrix W. 00027 % The kernel values in `k' are specified for values of `dist' 00028 % in a linear range from 0 to kparam(1), typically 0.015. 00029 % Given a certain value of `dist' found between an image pixel 00030 % and the kernel center, the associated weight is interpolated 00031 % linearly using the table. If dot > kparam(1), a zero weight 00032 % is used. 00033 % * We have typically let k be a gaussian kernel of width 00034 % (standard deviation) = 0.0325, i.e., 00035 % k(dist) = C * exp[ -0.5 * dist / sqr(0.0325) ], 00036 % since dist is a squared quantity already, and C is a constant 00037 % chosen to make k have unit norm as a spherical kernel. 00038 % * As a shortcut, if k is a scalar, the kernel is taken to 00039 % be the above function, but with width 0.0325 multiplied by 00040 % k(1). In this case, the LUT consists of 256 points evenly 00041 % spaced over [0,kparam(1)]. 00042 % * The diagonal portion of the distance weight matrix W may be 00043 % supplied as a triple kwt. This weights distances between 00044 % P1 and P2 in the x, y, and z directions respectively. The 00045 % P-angle (rotation in the x-y plane) is taken into account in 00046 % this weighting, so that y is cross-track, and x and z are 00047 % always along-track. 00048 % * Typically the kernel falls to zero rather quickly. As a 00049 % computational shortcut, it is assumed that the kernel extends 00050 % only kparam(2) pixels on each side of its center. For W = I, 00051 % this implies that an "on" pixel in x extends to influence at 00052 % most a "swath" 2*kparam(2)+1 pixels on a side. Both the P-angle 00053 % and the W matrix are taken into account in finding the swath. 00054 % For instance, a W_y > 1 will cause the y portion of the swath 00055 % to shrink, because distance decreases faster. The swath is 00056 % always parallel to the (i,j) image axes, so P-angles not a 00057 % multiple of 90 degrees will cause the swath to be enlarged 00058 % to contain a rotated rectangle. 00059 % * To decrease computational load, nearby pixels may be grouped 00060 % into meta-pixels called blocks. This is especially trouble-free 00061 % away from the limb, where local geometry is nearly planar, so 00062 % the blocking is given as a table that depends on z (in [0,1]). 00063 % A row in the table of (z,bw) means: above the value z, use 00064 % a blocking of bw (>1). For z=0 to bws(1,1), no blocking is used. 00065 % * For example, the small-image default means to use single pixels 00066 % below 0.4, 2x2 blocks above 0.4, and 4x4 above 0.6. For typical 00067 % MDI images, this is 16%, 20%, and 64% of on-disk pixels, respectively. 00068 % Using block sizes that do not pack together is legal, but causes 00069 % inefficiency due to poor fits between abutting blocks; diagnose 00070 % bad packing with the p output. 00071 % * Supply bws=[] to treat all pixels singly (turn off blocking). 00072 % * The optional output s is the z-coordinate (in [0,R]). 00073 % The optional p output is the block (patch) number of each 00074 % pixel. 00075 % 00076 % Inputs: 00077 % real x[m,n]; 00078 % real geom[5]; -- [x0 y0 rsun b0 p0] 00079 % real k[p] or k[1]; 00080 % real kparam[2]; -- [top window] 00081 % opt real kwt = [1 1 1]; 00082 % opt real bws[bwnum,2] = [0.4 2;0.6 4]; -- m < 2048 00083 % = [0.3 2;0.5 4; 0.7 8]; -- m >= 2048 00084 % 00085 % Outputs: 00086 % real y[m,n]; 00087 % opt real s[m,n]; 00088 % opt int p[m,n]; 00089 % 00090 % See also: 00091 00092 % implemented as a mex file 00093 00094 ****************************************************************/ 00095 00096 /**************************************************************** 00097 Notes October 2010: 00098 00099 * Pixel ordering. I have started using a "mode" input that can be given 00100 as SESW for HMI and SENE for the old, transposed-MDI image ordering. See 00101 roi_stats_mag for how this works. Doing this allows this one code to be 00102 easily used for both cases. It is implemented by adding a stride 00103 parameter for both x and y to the underlying computational routines. The 00104 stride, and the x/y loop boundaries, are switched depending on "mode". 00105 For now, it's easier to just leave it alone; the user has to monkey with 00106 x0 and y0 to get the right behavior. 00107 00108 * Beta. I pass in beta-angle, but I don't use it. I should. 00109 Perceived computational burden, as well as lack of technical clarity, 00110 has been the issue. 00111 Technical clarity can come from the following observation. 00112 Consider two points, P1 (the center point of the convolution) and P2 00113 (the point where the kernel is evaluated). Now, the distance is a 00114 function of: 00115 00116 D = P1 - P2 (in 3d coordinates: x, y, and z) 00117 00118 In 3D coordinates, it is hard to account for beta, while still performing 00119 the right weighting according to kwt. One way out is to realize there are 00120 two angles (distances along the sphere) that are important: longitudinal 00121 and latitudinal. You can always talk about the offset of P2 from P1 in 00122 terms of a delta-lat and a delta-lon. The kwt penalty should have two 00123 terms in this case, the penalty for a given delta-lat and that for a given 00124 delta-lon. (We don't measure delta-lat and delta-lon in degrees, because 00125 the polar singularity would cause delta-lon to be unstable.) 00126 00127 One way to find delta-lat and delta-lon is to inner-product the 00128 difference D with an along-track unit vector and a cross-track unit 00129 vector. (These are per-pixel triplets, found once at P1 and used for all 00130 P2.) It might be best to just use a cross-track vector, and then 00131 dump the entire residual into the along-track vector, so that the entire 00132 difference D is penalized using either the along-track or the cross-track 00133 weighting. (I.e., D will not lie in the space spanned by just the 00134 along-track and cross-track vectors; there will be a third degree of 00135 freedom because the planar approximation is not exact.) 00136 00137 Depending on how it was done, this could correspond to a chord 00138 length or an arc length. It probably doesn't much matter which, because 00139 the kernel is a merely a computational device to obtain smoothing, and 00140 a simple scaling rule relates the two. 00141 00142 This does not seem that burdensome computationally; it's about the same 00143 amount of computation in the inner loop as now, just a handful of multiplies 00144 and adds. It should be prototyped to see how the results look for high beta. 00145 00146 ****************************************************************/ 00147 00148 /* constants and globals used for error checking */ 00149 00150 #define NARGIN_MIN 4 /* min number of inputs */ 00151 #define NARGIN_MAX 6 /* max number of inputs */ 00152 #define NARGOUT_MIN 0 /* min number of outputs */ 00153 #define NARGOUT_MAX 3 /* max number of outputs */ 00154 00155 #define ARG_X 0 00156 #define ARG_GEOM 1 00157 #define ARG_K 2 00158 #define ARG_KParam 3 00159 #define ARG_KWt 4 00160 #define ARG_BWs 5 00161 00162 #define ARG_Y 0 00163 #define ARG_S 1 00164 #define ARG_P 2 00165 00166 static const char *progname = "smoothsphere"; 00167 #define PROGNAME smoothsphere 00168 static const char *in_specs[NARGIN_MAX] = { 00169 "RM", 00170 "RV(5)", 00171 "RV", 00172 "RV(2)", 00173 "RV(0)|RV(3)", 00174 "RM"}; 00175 static const char *in_names[NARGIN_MAX] = { 00176 "x", 00177 "geom", 00178 "k", 00179 "kparam", 00180 "kwt", 00181 "bws"}; 00182 static const char *out_names[NARGOUT_MAX] = { 00183 "y", "s", "p"}; 00184 00185 00186 // short form of PROGNAME for #include identifiers 00187 #define SHORTNAME ssp 00188 00189 #define KWt_count 3 00190 #define Default_KWt_count 3 00191 static double Default_KWt[Default_KWt_count] = { 1.0, 1.0, 1.0 }; 00192 #define Default_BWs_count_small 4 00193 static double Default_BWs_small[Default_BWs_count_small] = { 0.4, 0.6, 2.0, 4.0 }; 00194 #define Default_BWs_count_large 6 00195 static double Default_BWs_large[Default_BWs_count_large] = { 0.3, 0.5, 0.7, 2.0, 4.0, 8.0 }; 00196 00197 /******************************************************************************** 00198 * 00199 * Utility for making kernels 00200 * 00201 ********************************************************************************/ 00202 00203 /* 00204 * make_kernel_lut: make a lookup table for a gaussian kernel 00205 * lut has Nker points, from 0..hi_ker (hi_ker typically = 0.015) 00206 * width of the kernel is kwid, which is given in scaled units, 00207 * so that kwid is order unity. 00208 * The lut, of length Nker, must be already allocated. 00209 */ 00210 static 00211 int 00212 make_kernel_lut(double *ker, int Nker, double hi_ker, double kwid) 00213 { 00214 const double del = hi_ker/(Nker-1); // Nker fenceposts, Nker-1 gaps 00215 double x; 00216 double scalefac; 00217 double sum; 00218 double norm; 00219 int i; 00220 00221 if (kwid <= 0 || isnan(kwid) || 00222 hi_ker <= 0 || isnan(hi_ker) || 00223 Nker <= 1 || ker == NULL) 00224 return 0; 00225 kwid *= 0.0325; // = the rescaled kernel width 00226 scalefac = 1/(2*kwid*kwid); // = the multiplier in the exp() call 00227 // fill in exp(- x / (2*kwid*kwid)) = exp(- x * scalefac) 00228 for (sum = i = 0; i < Nker; i++) { 00229 x = i*del; 00230 ker[i] = exp(- x * scalefac); 00231 sum += ker[i]; 00232 } 00233 // find the normalization factor for unit mass 00234 // this should perhaps use del, but we use hi_ker/Nker because 00235 // that is what we have historically used. 00236 norm = M_PI * (hi_ker/Nker) * sum; 00237 scalefac = 1/norm; 00238 // re-normalize ker 00239 for (i = 0; i < Nker; i++) 00240 ker[i] *= scalefac; 00241 return 1; 00242 } 00243 00244 00245 /******************************************************************************** 00246 * 00247 * Utilities for blocked convolution 00248 * 00249 ********************************************************************************/ 00250 00251 /* 00252 * Idea: Divide source pixels into classes. The class is a blocknum_t (like an 00253 * int) and it is a tag for a "meta-pixel" that subsumes a whole block of 00254 * source-image pixels. Each block is summarized by a blockstats_t, and is 00255 * convolved all at once. 00256 * 00257 * Blocking is done because the main source of computational cost in this 2d convolution 00258 * is large AR's. Formerly, each pixel in a large AR was mapped individually to the 00259 * output. If the kernel width is KW, this one pixel results in KW^2 updates to the 00260 * output, each of which requires many flops for computation of a distance, and 00261 * kernel lookup of the distance. (The issue is *not* tiny one or two-pixel AR's; this 00262 * has been checked on real data; the typical AR pixel has 5-6 neighbors.) Much 00263 * of this computation was wasted, because the smoothed activity is thresholded. 00264 * 00265 * To cut this time, we collect AR pixels that are relatively far from the limb into 00266 * blocks of size BWxBW, which are handled together. The scheme below is more flexible 00267 * than this; it allows any blocking scheme (not just uniform square blocks). Any mapping 00268 * of pixels to blocks is OK. To change it, see block_create. 00269 * 00270 * The scheme assigns each pixel a blocknum_t. If this is <0, it indicates the 00271 * pixel is handled individually. If >0, it is a block number; all pixels with 00272 * that block number are handled in a single convolution inner loop over KW*KW 00273 * pixels. The blocks are assigned by one routine, and then another loop over 00274 * the image computes the overall weight (sum of the weights in the block), and 00275 * the net position (weighted by the convolved input). The summary of each block 00276 * is put into a blockstat_t structure. It would even be possible to combine 00277 * blocks by manipulating just the blockstat_t summaries. 00278 * 00279 * Fundamentally, because considerable processing is devoted to each nonzero, ondisk 00280 * pixel, considerable intelligence can be used to optimize blocks. Note that no 00281 * blocks are created for singletons (they are handled individually), and that 00282 * each block must contain some activity; the whole away-from-limb disk is not blocked 00283 * in its entirety. It's typical to have a few thousand blocks (1024^2 image). 00284 * The current scheme uses blocks of width BW that are aligned on multiples of BW 00285 * in the input image, this is more efficient than fully ad-hoc block placement, 00286 * which tends to cause interference. 00287 * 00288 * We tried simpler combination schemes, but it turns out to be hard that every 00289 * value in the source is being used exactly once, because of the interaction of 00290 * the blocks, the limb, and the offdisk stuff. 00291 * 00292 * In the final convolution loop, we look at the blocknum_t of each pixel. 00293 * If it is a block (and not a singleton), we look up its block in the list, 00294 * and use its average position and cumulative weight as a "metapixel". We 00295 * also mark the block as used. When we come to next pixel that is a member 00296 * of that block, we do not count it again. 00297 */ 00298 00299 // legal block values: 00300 // offdisk, skip, singleton, clear; or anything from 1 to #blocks 00301 // (thus, any negative values are OK for the explicit block constants) 00302 // offdisk = outside visible disk 00303 // skip = visible, but zero weight. skip during convolution 00304 // singleton = visible, nonzero weight, treated singly during the convolution 00305 // clear = un-initialized. after block_create runs, *all* clears are gone 00306 // first, first+1, ... = part of the given block 00307 // Thus, block 0 is un-used. The valid blocks are 1..BlockNum, inclusive 00308 typedef enum { 00309 Block_Min = -3, // always holds the minimum value, for external saves 00310 Block_Offdisk = -3, // not on the visible disk 00311 Block_Skip = -2, // on-disk, zero weight (skippable) 00312 Block_Singleton = -1, // on-disk, nonzero weight, treated individually 00313 Block_Clear = 0, // un-initialized 00314 Block_First = 1, // this and following numbers: members of a given block 00315 Block_MAX = INT_MAX // ensures it's as big as an int 00316 } blocknum_t; 00317 // is the given block number part of a group 00318 #define BLOCK_GROUPED(b) (b>=Block_First) 00319 00320 /* 00321 * statistics summarizing a block: its mass-weighted location (x,y,z), 00322 * its weight (wt>0), and its function value (f). If the function 00323 * value is zero, the block is effectively turned off. This is done 00324 * during the iteration once the block has been smoothed into the output. 00325 */ 00326 typedef struct { 00327 double x; // location 00328 double y; 00329 double z; 00330 double wt; // weight (sum of abs(f)) 00331 double f; // function value 00332 } blockstat_t; 00333 00334 00335 /* 00336 * block_lut_validate: determine if a bwlen x 2 lookup table 00337 * supplied as an input argument is valid. returns an 00338 * error message. LUT order is: 00339 * 00340 * tau1 block1 00341 * tau2 block2 00342 * tau3 block3 00343 * 00344 * There is an implicit 0 above the tau's, and an implicit 1 below 00345 * them. For 0..tau1, no blocking. For tau1..tau2, block1. Etc. 00346 * Above tau3 and up to 1, block3. 00347 */ 00348 static 00349 char * 00350 block_lut_validate(double *bws, int bwlen) 00351 { 00352 int i; 00353 00354 for (i = 0; i < bwlen; i++) { 00355 // bin boundaries 00356 if (bws[i] < 0 || bws[i] > 1) 00357 return "bin boundaries must be in [0,1]"; 00358 if (i > 0 && (bws[i-1] >= bws[i])) 00359 return "bin boundaries must be monotone increasing"; 00360 // block sizes 00361 if (bws[bwlen+i] < 2) 00362 return "block sizes must be at least 2"; 00363 if (((int) bws[bwlen+i]) != bws[bwlen+i]) 00364 return "block sizes must be integers"; 00365 } 00366 return NULL; 00367 } 00368 00369 /* 00370 * block_lut_reformat: alter the lut as supplied slightly 00371 * so we can more readily use it. 00372 */ 00373 static 00374 void 00375 block_lut_reformat(double *tau1, double *taus, int *bws, 00376 double *bws_in, int bwlen, double R) 00377 { 00378 int i; 00379 00380 // empty case 00381 if (bwlen == 0) { 00382 *tau1 = R + 1; // always use singleton rule 00383 return; 00384 } 00385 // non-empty 00386 *tau1 = bws_in[0] * R; 00387 // bin boundaries 00388 for (i = 1; i < bwlen; i++) 00389 taus[i-1] = bws_in[i] * R; 00390 taus[bwlen-1] = R + 1; // last bin > largest z 00391 // block sizes 00392 for (i = 0; i < bwlen; i++) 00393 bws[i] = (int) bws_in[bwlen+i]; 00394 } 00395 00396 /* 00397 * block_create: make matrix of block-tags from (z,tau), the input map, 00398 * and a blockwidth parameter. 00399 * Returns the number of distinct tags needed, or -1 00400 * for error (should never happen). 00401 */ 00402 static 00403 blocknum_t 00404 block_create(blocknum_t *blocks, double *s, double *a, 00405 double tau1, double *taus, int *bws, int m, int n) 00406 { 00407 int k; 00408 int x1, y1; 00409 int x2, y2; 00410 int xlo, xhi, ylo, yhi; // loop bounds 00411 blocknum_t bn = Block_First; 00412 double s1; 00413 int bw; 00414 00415 /* loop over image pixels */ 00416 for (x1 = 0; x1 < n; x1++) 00417 for (y1 = 0; y1 < m; y1++) { 00418 // ensure blocks(x1,y1) is assigned a block number 00419 s1 = s[m*x1+y1]; 00420 if (isnan(s1)) { 00421 blocks[m*x1+y1] = Block_Offdisk; 00422 } else if (a[m*x1+y1] == 0) { 00423 blocks[m*x1+y1] = Block_Skip; // zero contribution 00424 } else if (s1 < tau1) { 00425 // note, strict < allows tau1 = 0 to suppress all singletons 00426 blocks[m*x1+y1] = Block_Singleton; // singleton 00427 } else if (blocks[m*x1+y1] == Block_Clear) { 00428 // This code block fully controls the discretization 00429 // scheme. Another scheme would just fill in the 00430 // block number "bn" in a different pattern, being sure 00431 // to not over-write blocks that are already assigned. 00432 // 00433 // determine the block width using s1 and the LUT 00434 // - if the LUT was empty, tau1 will be > R, and 00435 // this case will not be reached. 00436 // - otherwise, the last tau[] will be > R, and will 00437 // cause exit of the loop 00438 for (k = 0; s1 > taus[k]; k++) 00439 ; 00440 bw = bws[k]; 00441 // make a new block numbered bn 00442 xlo = (x1/bw)*bw; // round down 00443 xhi = xlo + bw; 00444 if (xhi > n) xhi = n; 00445 ylo = (y1/bw)*bw; 00446 yhi = ylo + bw; 00447 if (yhi > m) yhi = m; 00448 for (x2 = xlo; x2 < xhi; x2++) 00449 for (y2 = ylo; y2 < yhi; y2++) 00450 if (blocks[m*x2+y2] == Block_Clear) 00451 blocks[m*x2+y2] = bn; 00452 bn++; 00453 } else if (BLOCK_GROUPED(blocks[m*x1+y1])) { 00454 continue; // already in a block: this is OK 00455 } else { 00456 // should not happen 00457 mexWarnMsgTxt("create: bad blocknum.\n"); 00458 return Block_Min; // anything < 0 00459 } 00460 } 00461 return (blocknum_t) ((int) bn - 1); // the last valid block number 00462 } 00463 00464 /* 00465 * block_stats: compute statistics over all blocks 00466 * Valid blocks run from Block_First (=1) to btot, inclusive of both endpoints. 00467 * Returns 0 for out-of-range block number. 00468 */ 00469 static 00470 int 00471 block_stats(blockstat_t *stats, 00472 blocknum_t *blocks, 00473 double *z, // z-coord 00474 double *a, // the image to be convolved 00475 int m, 00476 int n, 00477 blocknum_t btot) 00478 { 00479 int x1, y1; 00480 blocknum_t bn = Block_First; 00481 double wt, norm; 00482 00483 // stats[].* is already zeroed out 00484 /* loop over image pixels, accumulating statistics */ 00485 for (x1 = 0; x1 < n; x1++) 00486 for (y1 = 0; y1 < m; y1++) { 00487 bn = blocks[m*x1+y1]; 00488 if (BLOCK_GROUPED(bn)) { 00489 if (bn > btot) 00490 return 0; // error: out-of-range block number 00491 wt = fabs(a[m*x1+y1]); 00492 stats[bn].x += x1 * wt; // un-centered 00493 stats[bn].y += y1 * wt; // un-centered 00494 stats[bn].z += z[m*x1+y1] * wt; 00495 stats[bn].wt += wt; 00496 stats[bn].f += a[m*x1+y1]; 00497 } 00498 } // end for x1, y1 00499 /* loop over blocks, computing average */ 00500 for (bn = Block_First; bn <= btot; bn++) { 00501 if (stats[bn].wt == 0) 00502 return 0; // error: empty block 00503 norm = 1.0 / stats[bn].wt; 00504 stats[bn].x *= norm; 00505 stats[bn].y *= norm; 00506 stats[bn].z *= norm; 00507 } 00508 return 1; 00509 } 00510 00511 /* 00512 * block_check: check that blocks have been cleared 00513 */ 00514 static 00515 int 00516 block_check(blockstat_t *stats, int btot) 00517 { 00518 int bn; 00519 int bn_nc = 0; // count blocks not cleared 00520 00521 /* loop over blocks, checking for uncleared block */ 00522 for (bn = Block_First; bn <= btot; bn++) 00523 if (stats[bn].f != 0) 00524 bn_nc++; 00525 return bn_nc; 00526 } 00527 00528 /* 00529 * convert blocks for output to matlab: int-to-double 00530 */ 00531 static 00532 void 00533 block_export(double *p, int *blocks, int m, int n) 00534 { 00535 int inx; 00536 00537 for (inx = 0; inx < m*n; inx++) 00538 p[inx] = (double) blocks[inx]; 00539 } 00540 00541 00542 /****************************************************************** 00543 * 00544 * Threading Support 00545 * 00546 * Structures to hold state passed down to computational threads, 00547 * a routine for thread-number selection 00548 * 00549 ****************************************************************** 00550 * 00551 * Note on threading strategy: 00552 * The key problem with threading this, is that the convolution is 00553 * pretty wide (typically 200 pixels wide) and so the footprint of 00554 * pixels in the output `b' that are modified by looping over a 00555 * rectangle of pixels in the input `a' is pretty large. 00556 * 00557 * We do the convolution by splitting the input into strips of size 00558 * Rwid along the x coordinate, each thread smoothing one strip at a 00559 * time. These partial results, which are wider than Rwid, are 00560 * accumulated into the final output array. E.g., consider: 00561 * 00562 * Input: X = [x0|x1|x2|x3|x4|x5] 00563 * 00564 * The thread corresponding to x2 = X(:,Rwid*2+[1:Rwid]), say, computes 00565 * a result y2 which is roughly of width Rwid + Kswath, where Kswath is 00566 * the full width of the kernel. This partial result must be summed in 00567 * to the full output starting at the correct offset. It will extend 00568 * Kswath/2 rows "below" Rwid*2, and Kswath/2 rows "above" Rwid*3. 00569 * (The partial result is located in a thread-specific buffer, but the 00570 * full output array is shared across threads.) 00571 * While it's being accumulated, other threads must not alter the 00572 * output array. 00573 * 00574 * We pay a penalty for separating the input into chunks, because 00575 * the corresponding output chunks are enlarged. Smaller chunks 00576 * means more granularity in the threading, so all threads are always 00577 * working (good), but too small means that we spend too much time 00578 * accumulating partial results into a full output (bad). In general, 00579 * each output array location will be accumulated-into 00580 * 00581 * 1 + Kswath/Rwid times, 00582 * 00583 * because in total, if there are k strips, there will be k*Rwid output 00584 * rows, but k*(Rwid+Kswath) result-rows found. Therefore each output 00585 * row will be written (Rwid+Kswath)/Rwid = 1 + Kswath/Rwid times. 00586 * For HMI, Kswath = 200 00587 * 00588 * We put most of the logic for partitioning into the individual threads. 00589 * We maintain an index of the largest row still remaining unfinished, and 00590 * let each thread check that index to determine whether to carve off 00591 * another rectangle. 00592 * 00593 ******************************************************************/ 00594 00595 /* 00596 * these structures are needed because the pthread interface passes 00597 * only one item to the thread as an argument, so we put all the 00598 * convolution arguments in that one argument 00599 */ 00600 00601 // one instance of this flavor of structure holds common info 00602 struct thread_info { 00603 // unchanging info, like geometry and overall parameters 00604 // (see smooth_rect for descriptions) 00605 // sizes 00606 int m; // m = y 00607 int n; // n = x 00608 // strides 00609 int strIx; 00610 int strIy; 00611 // disk geometry 00612 double xcen; 00613 double ycen; 00614 double R; 00615 double cos_a; 00616 double sin_a; 00617 // input matrices 00618 double *a; 00619 double *s; 00620 // output matrix 00621 double *b; 00622 // kernel 00623 double *kernel; 00624 int Klen; 00625 double Ktop; 00626 int Kswath; 00627 double *Kwt; 00628 // block structure of input 00629 blocknum_t *blocks; 00630 blockstat_t *stats; 00631 // swath rectangle sizes 00632 int xwid; // max size along x direction 00633 int ywid; // max size along y direction 00634 int Rwid; // target size for each thread's rectangle 00635 // next row to be assigned to a thread 00636 int smooth_bb_top; 00637 }; 00638 00639 // MAX_THREAD instances of this flavor of structure holds per-thread info 00640 struct thread_args { 00641 // per-thread info 00642 int tnum; // my own thread number, just informational 00643 // output error status (zero is OK, nonzero is not) 00644 int status; 00645 }; 00646 00647 /* 00648 * thread info, declared as local to this file for simplicity 00649 * (in particular, SA could be local to a function, and it could contain 00650 * pointers to the mutex and to SI, but this way keeps the declaration 00651 * close to the structure definition) 00652 */ 00653 // this is a hard max, but it can be lowered by an environment variable 00654 #define MAX_THREAD 8 00655 static struct thread_info SI; // one slot for common info 00656 static struct thread_args SA[MAX_THREAD]; // slots for per-thread info 00657 static pthread_mutex_t blockstat_mutex; 00658 static pthread_mutex_t accum_mutex; 00659 static pthread_mutex_t smooth_bb_top_mutex; 00660 00661 /* 00662 * get the requested number of threads, either from environment 00663 * (MXT_NUM_THREAD) or from the compiled-in default 00664 */ 00665 static 00666 int 00667 get_thread_num() 00668 { 00669 int tn, tn1; 00670 char *thread_env; 00671 00672 if ((thread_env = getenv("MXT_NUM_THREAD")) != NULL) { 00673 // get from environment 00674 tn = strtol(thread_env, NULL, 10); 00675 if (tn == 0) 00676 tn = MAX_THREAD; // error in conversion 00677 } else { 00678 // use compiled-in default 00679 tn = MAX_THREAD; 00680 } 00681 // clip to 1..thread_max 00682 if (tn > MAX_THREAD) 00683 tn = MAX_THREAD; 00684 else if (tn <= 0) 00685 tn = 1; 00686 return tn; 00687 } 00688 00689 /******************************************************************************** 00690 * 00691 * Spherical convolution utility 00692 * 00693 ********************************************************************************/ 00694 00695 /* swath_extent: establish loop boundaries 00696 * 00697 * move Kswath in each direction away from (x0,y0), 00698 * adjusting for both rotation angle (cos_a, sin_a) and 00699 * distance weighting (Kwt), to determine the four corners 00700 * of a box containing the region to include in the 00701 * convolution. Returns the largest and smallest x and y 00702 * indices which contain the whole box. 00703 */ 00704 static 00705 void 00706 swath_extent(int *xlo_out, int *xhi_out, 00707 int *ylo_out, int *yhi_out, 00708 double x0, double y0, 00709 double xcen, double ycen, double R, 00710 double cos_a, double sin_a, 00711 double *Kwt, int Kswath, int m, int n) 00712 { 00713 int xB, yB; 00714 double x1, y1; 00715 double xp1, yp1; 00716 double xlo, ylo, xhi, yhi; 00717 00718 // high and low markers 00719 xlo = ylo = m+n; // higher than either alone 00720 xhi = yhi = 0; // lower than either 00721 // loop over 4 corners 00722 for (xB = -1; xB <= 1; xB += 2) 00723 for (yB = -1; yB <= 1; yB += 2) { 00724 // scaled point 00725 x1 = xB * Kswath / sqrt(Kwt[0]); 00726 y1 = yB * Kswath / sqrt(Kwt[1]); 00727 // rotate by p-angle, place in image coords 00728 xp1 = x0 + x1 * cos_a + y1 * sin_a; 00729 yp1 = y0 + -x1 * sin_a + y1 * cos_a; 00730 // save low and high 00731 if (xp1 < xlo) xlo = xp1; 00732 if (xp1 > xhi) xhi = xp1; 00733 if (yp1 < ylo) ylo = yp1; 00734 if (yp1 > yhi) yhi = yp1; 00735 } 00736 // clip to Rsun 00737 if (xlo < xcen-R) xlo = xcen-R; 00738 if (xhi > xcen+R) xhi = xcen+R; 00739 if (ylo < ycen-R) ylo = ycen-R; 00740 if (yhi > ycen+R) yhi = ycen+R; 00741 // round outward to integers 00742 xlo = floor(xlo); ylo = floor(ylo); 00743 xhi = ceil (xhi); yhi = ceil (yhi); 00744 // clip to image 00745 if (xlo < 0) xlo = 0; 00746 if (xhi > n-1) xhi = n-1; 00747 if (ylo < 0) ylo = 0; 00748 if (yhi > m-1) yhi = m-1; 00749 // make output 00750 *xlo_out = (int) xlo; 00751 *xhi_out = (int) xhi; 00752 *ylo_out = (int) ylo; 00753 *yhi_out = (int) yhi; 00754 } 00755 00756 /* 00757 * 00758 * swath_max_size: determine an upper size limit on the footprint 00759 * of the kernel. In other words, this returns an upper bound on the 00760 * length of the x and y dimensions, generated by swath_extent above, 00761 * needed to hold a convolution result array. We want to be able to 00762 * alloc xwid*ywid doubles to hold the result from convolving one input pixel. 00763 * 00764 * This routine assumes that the size of the swath can be upper-bounded 00765 * independent of position (x0,y0). Currently, the only need for an 00766 * upper bound (rather than an exact figure) is that differences in 00767 * rounding can cause +/-1 changes in size at each end of the convolution 00768 * footprint. Gross differences are not possible. 00769 */ 00770 static 00771 void 00772 swath_max_size(int *xwid, 00773 int *ywid, 00774 double xcen, double ycen, double R, 00775 double cos_a, double sin_a, 00776 double *Kwt, int Kswath, int m, int n) 00777 { 00778 int xlo, xhi, ylo, yhi; 00779 00780 swath_extent(&xlo, &xhi, &ylo, &yhi, 00781 xcen, ycen, // use image center as the center of the swath 00782 xcen, ycen, R, 00783 cos_a, sin_a, 00784 Kwt, Kswath, m, n); 00785 // first, +1 because a range of lo = hi still includes 1 pixel 00786 // then, +2 because an unpredictable roundoff at either the lower 00787 // or upper endpoint, or both, might result in a larger range. 00788 *xwid = (xhi - xlo + 1) + 2; 00789 *ywid = (yhi - ylo + 1) + 2; 00790 } 00791 00792 /******************************************************************************** 00793 * 00794 * Spherical convolution core 00795 * 00796 ********************************************************************************/ 00797 00798 /* 00799 * Background on the convolution 00800 * 00801 * There are two aspects of this code that are different from standard 00802 * convolution (say, in 1 or 2 dimensions, in a periodic setting): 00803 * a - the kernel can be viewed as spatially varying 00804 * b - the discretization is not uniform 00805 * 00806 * Point (a) is debatable. If we imagine the functions x, y, k defined 00807 * on the sphere, then k can be expressed as depending only on s-t where 00808 * we are convolving x(t) and k(s). That is, we can write 00809 * y(s) = int_S x(t) k(s-t) dt 00810 * where t, s are 3-vectors of unit norm (on the sphere). In this case, 00811 * ||s - t||^2 = (s-t)'(s-t) = 2(1-t's), or 00812 * t's = 1 - 0.5 ||s-t||^2 00813 * and the dot product is a function of only s-t. 00814 * In this interpretation, the kernel is not spatially varying. The 00815 * convolution is computed on the sphere S and then, in a separate 00816 * operation, projected down to the unit disk. 00817 * 00818 * As an aside, if an exponential kernel k is used as input this function, 00819 * k(dot) = const * exp(lambda * dot), where dot = t's, 00820 * then this is equivalent to a gaussian function of the difference 00821 * delta = s - t 00822 * since 00823 * k(dot) = k(t's) = const * exp(lambda * 0.5 * ||s - t||^2) 00824 * This is a pretty good rationale for using a kernel that is of exponential 00825 * form, i.e. k = const1 * exp(const2 * linspace(-1, 0, kparam(2))), where 00826 * const2 sets the fwhm and const1 is chosen so k is of unit norm. 00827 * 00828 * Point (b) is tricky. It arises because one pixel at disk-center 00829 * corresponds to seeing a region of (say) "one unit", but at the limb, 00830 * the pixel could correspond to seeing many units. The conversion factor 00831 * is 1/z, where z is the distance from the x-y (image) plane outward to 00832 * the sphere boundary (z = sqrt(1-x^2-y^2)). The limb pixels carry 00833 * information about more area, so they will influence other pixels 00834 * proportionally more strongly. We just need to quantify the effect. 00835 * 00836 * Before we go on to do that, a word about the (1/z) factor, which is 00837 * the area of a pixel. As we go closer to the limb, z goes (slowly) 00838 * to zero, and 1/z, which approximates the area, goes (slowly) to 00839 * infinity. Especially for high-pixel-density images, this will 00840 * make the integral blow up. The area ~= 1/z approximation thus 00841 * breaks down there. One way out is to say that a pixel with a center 00842 * precisely at the limb (z=0) may as well be regarded as having area 00843 * equal to that of its maximally-on-disk part, which is half a pixel 00844 * farther onto the disk. Drawing the right triangle, we see that the 00845 * z of a point 0.5 pixel from the limb is sqrt(R) (where the max z, at 00846 * disc center, is R). To tame the normalization factor, we take 00847 * sqrt(R) as the lowest allowed z for the area computation. This 00848 * typically does not arise in practice -- even for HMI, at 3 pixels in, 00849 * z = sqrt(6*R) ~ 100, the min-z = sqrt(R) ~ 44. 00850 * 00851 * It's natural to compute the convolution as a discrete approximation 00852 * to a continuous convolution: 00853 * 00854 * I-1: y(s) = \int_S x(t) k(s,t) dt (using k(s,t) for k(s-t)) 00855 * y(i) = \sum_j x(j) k(i,j) |S_j| [*] 00856 * 00857 * where the whole sphere S is partitioned into {S_j}, and we assume that 00858 * to first order, the function x(t) equals a number x(j) on the patch S_j, 00859 * which has volume |S_j|. 00860 * The problem is that one can expand the integral the other way: 00861 * 00862 * I-2: y(s) = \int_S x(s-t) k(t,0) dt (using k(t,0) for k(t-0)) 00863 * y(i) = \sum_j x(i-j) k(j,0) |S_j| 00864 * 00865 * and the discretized version weights x(i-j) differently. There are some 00866 * subtleties here, because I-2 assumes the kernel is translation-invariant. 00867 * But, in any case, the two forms are not symmetric. And one then 00868 * wonders about the exact interpretation of the terms within [*]. 00869 * (E.g., does k(i,j) have to be k(s,t) weighted by |S_i| or |S_j|?) 00870 * 00871 * These questions become focused when you wonder how to normalize k() so 00872 * that the convolution integral has unit norm. 00873 * 00874 * It's possible to clarify these questions, which have to do with 00875 * weighting of nonnegative functions x and y, by analogy with 00876 * probability. The analogy is that x(t) and y(s) are both 00877 * densities with respect to a reference measure A(.), and k(s,t) 00878 * is a conditional density tying them together: 00879 * 00880 * x(t): pdf of an r.v. X relative to measure A(t) 00881 * y(s): pdf of an r.v. Y " " " " 00882 * k(s,t): conditional density p(Y=s|X=t) " " 00883 * 00884 * If the density x(t) is relatively smooth over patches S_j, we can 00885 * discretize everything, where |S_j| = A(S_j), the area of the patch: 00886 * 00887 * x_j: pmf of the r.v. X. Thus Pr(X in S_j) = x_j = x(t) |S_j| 00888 * for any t in S_j 00889 * y_i: pmf of the r.v. Y. Thus Pr(Y in S_i) = y_i = y(s) |S_i| 00890 * for any s in S_i 00891 * k_(i,j): conditional density p(Y|X), so k_(i,j) = k(s,t) |S_i| 00892 * (t and s as above). This is so because we are *given* 00893 * the value of X, so the average is over Y (S_i). 00894 * 00895 * To relate this back to the original problem, at large (limb) patches, 00896 * this says we are equally sure, throughout the patch, that the 00897 * patch has a given label (x(t) = const on the patch). But, over 00898 * the whole patch, there is a greater mass than the at disk center 00899 * (since the |S_j| term is larger). So, the limb patch as a whole 00900 * will have greater influence than disk-center patches as a whole. 00901 * 00902 * In short: the x(t), y(s) are the observed and smoothed values. 00903 * The x_j, y_i capture the discretization effects that make some 00904 * pixels "more strong" than others. 00905 * 00906 * To relate Y to X, note 00907 * 00908 * Pr(Y in S_i) = \sum_j Pr(X in S_j) * Pr(Y in S_i | X in S_j) 00909 * = \sum_j [x(t) |S_j|] * [k(s,t) |S_i|] 00910 * 00911 * and, noting Pr(Y in S_i) = y_i = y(s) |S_i|, 00912 * 00913 * y(s) = \sum_j x(t) k(s,t) |S_j| [t from S_j & s from S_i always] 00914 * 00915 * This recovers the formula [*]. 00916 * 00917 * It also tells us how to interpret "unit norm". For y(s) and x(t), 00918 * it means integrating to one against the area measure A(.). This is 00919 * identical to the discretized y_i and x_j summing to one against 00920 * |S_i|. In short: the discretized versions sum to unity, not the 00921 * continuous ones. In matlab, this would mean: 00922 * >> z = sqrt(1 - x^2 - y^2) or 00923 * >> z = rsun - ringmatrix([m n], [], [], center) 00924 * >> z(z < 0) = Inf; % make them drop out 00925 * and then, since the patch areas |S_j| are proportional to 1/z: 00926 * >> sum(sum(x .* (1./z))) = 1 00927 * >> sum(sum(y .* (1./z))) = 1 00928 * 00929 * More important, for k(s,t), it means 00930 * 00931 * (forall t) \int_S k(s,t) dA(s) = 1 00932 * 00933 * Because k(s,t) depends only on the radial distance between 00934 * s and t, we can take t = (0 0 1), and then s runs from 0..1 with 00935 * dot = s't = sqrt(1 - s^2). However, this simplifies considerably. 00936 * As we have parameterized it, k(s,t) = k(dot), which is given 00937 * explicitly in the input k(.). We need only that 00938 * 00939 * \int_{dot=0..1} k(dot) dA(dot) = 1 00940 * 00941 * In fact, the minimum value of dot is kparam(1). The area element 00942 * corresponding to s't in [dot, dot+d_dot] is just 2 pi d_dot. Thus 00943 * we want 00944 * 00945 * 2 pi \int_{dot=kparam(1)..1} k(dot) d_dot = 1 00946 * 00947 * The integral, when discretized, needs only the width of a length 00948 * element, which is delta_dot (roughly corresponding to d_dot above): 00949 * 00950 * >> delta_dot = (1 - kparam(1))/length(k); 00951 * >> normer = 2*pi*delta_dot*sum(k); 00952 * 00953 * Letting k = k/normer makes k have unit norm. 00954 */ 00955 00956 // error codes for smooth(), must be < 0 00957 #define SMOOTH_NOMEM -1 00958 #define SMOOTH_INTERNAL -2 00959 #define SMOOTH_NOTHREAD -3 00960 00961 /* 00962 * smooth_rect: convolve input a with kernel and put into output b 00963 * 00964 * Does convolution only in an input rectangle given by {x,y}R{lo,hi}. 00965 * The output will be written into a larger range, depending on the kernel 00966 * width Kswath, the kernel weights Kwt, and the tilt angle (cos_a, sin_a). 00967 * There are separate parameters for input and output strides, so the two 00968 * can be differently-sized. 00969 * This routine operates in a thread, but is not highly aware that it's 00970 * threaded (we need one mutex reference). 00971 * 00972 * returns 1 if error, 0 otherwise 00973 * (the only error condition is an internal logic error) 00974 */ 00975 static 00976 int 00977 smooth_rect(int m, // input image size (m = y) 00978 int n, // (n = x) 00979 // rectangle and strides 00980 int xRlo, // x rect range lo/hi (input images) 00981 int xRhi, 00982 int yRlo, // y rect range lo/hi (input images) 00983 int yRhi, 00984 int xROlo, // x rect offset (output image offset from input, typ. >= 0) 00985 int yROlo, // y rect offset (output image offset from input, typ. >= 0) 00986 int strIx, // stride in x, input (a, s arrays) 00987 int strIy, // stride in y, input 00988 int strOx, // stride in x, output (b array) 00989 int strOy, // stride in y, output 00990 // disk geometry 00991 double xcen, // center, n or x, in C coords, origin at 0 00992 double ycen, // center, m or y, in C coords, origin at 0 00993 double R, // radius, in pixels 00994 double cos_a, // cos(alpha) 00995 double sin_a, // sin(alpha) 00996 // input + output matrices 00997 double *a, // input is mXn 00998 double *s, // mXn (z coordinate, already found) 00999 double *b, // output (convolved result) 01000 // kernel 01001 double *kernel, // kernel lookup table 01002 int Klen, // length of the table 01003 double Ktop, // upper limit of the table (0..Ktop) 01004 int Kswath, // box enclosing kernel 01005 double *Kwt, // weights on x, y, z 01006 // block structure of input 01007 blocknum_t *blocks, 01008 blockstat_t *stats) 01009 { 01010 const double Kstep = (Klen-1)/Ktop; // map [0,Ktop] -> [0,K-1] 01011 const double min_Zwt = sqrt(R); // sanity bound on area-weighting, see above 01012 int x1, y1; // p1 coords in 2D, bounded by xRlo, xRhi, etc. 01013 int x2, y2; // p2 coords in 2D, bounded by xlo, xhi, etc. 01014 int xlo, xhi, ylo, yhi; // inner loop boundaries 01015 double p1x, p1y, p1z; // point P1 coords in 3D 01016 double p2x, p2y, p2z; // point P2 coords in 3D 01017 int p1_off; // array offset into a,s for the point (x1,y1) 01018 int x2_off, y2_off; // array offsets into a,s for the point (x2,y2) 01019 int xO_off, yO_off; // array offsets into b for the point (x2,y2) 01020 blocknum_t bn; // block number of (x1,y1) 01021 double dist; // distance from p1 to p2 01022 double contribution; // point (x1,y1)'s contribution to integral 01023 double scale; // scale factor for convolution integral 01024 double xx_term; 01025 double wt_xx, wt_yy, wt_zz, wt_xy; 01026 double inxC; // real-valued index into kernel table 01027 int inxD; // integer index into kernel table 01028 01029 /* set up weights */ 01030 wt_xx = Kwt[0] * cos_a * cos_a + Kwt[1] * sin_a * sin_a; 01031 wt_yy = Kwt[0] * sin_a * sin_a + Kwt[1] * cos_a * cos_a; 01032 wt_xy = 2.0 * (Kwt[0] - Kwt[1]) * cos_a * sin_a; 01033 wt_zz = Kwt[2]; 01034 /* include 1/(R^2) factor in each */ 01035 wt_xx /= R*R; 01036 wt_yy /= R*R; 01037 wt_zz /= R*R; 01038 wt_xy /= R*R; 01039 01040 /* loop over image pixels */ 01041 for (x1 = xRlo; x1 <= xRhi; x1++) 01042 for (y1 = yRlo; y1 <= yRhi; y1++) { 01043 p1_off = x1*strIx + y1*strIy; 01044 bn = blocks[p1_off]; // block number 01045 if (bn == Block_Clear) 01046 return 1; // should never happen 01047 if (bn == Block_Offdisk || bn == Block_Skip) 01048 continue; // off-limb or zero weight: no action needed 01049 if (bn == Block_Singleton) { 01050 // singleton 01051 /* find original vector coordinates P1 = (px1, py1, pz1) */ 01052 p1x = x1; 01053 p1y = y1; 01054 p1z = s[p1_off]; 01055 contribution = a[p1_off]; 01056 } else { 01057 // look up coordinates and function value via per-block statistics 01058 p1x = stats[bn].x; // un-centered 01059 p1y = stats[bn].y; // un-centered 01060 p1z = stats[bn].z; 01061 // skip the mutex in the case where the block was already counted 01062 if (stats[bn].f == 0) continue; 01063 // ensure only this thread is convolving with this block 01064 pthread_mutex_lock(&blockstat_mutex); 01065 contribution = stats[bn].f; 01066 stats[bn].f = 0; // indicate this thread has put it in the integral 01067 pthread_mutex_unlock(&blockstat_mutex); 01068 } 01069 // contrib == 0 when the block has been included once already, 01070 // in this thread or in another thread 01071 if (contribution == 0) 01072 continue; 01073 /* establish loop boundaries */ 01074 swath_extent(&xlo, &xhi, &ylo, &yhi, p1x, p1y, 01075 xcen, ycen, R, cos_a, sin_a, Kwt, Kswath, m, n); 01076 if (0) 01077 printf("(%.1f,%.1f) loop [%d,%d]x[%d,%d] (%dx%d)\n", 01078 p1x, p1y, xlo, xhi, ylo, yhi, xhi-xlo+1, yhi-ylo+1); 01079 // scale factor is the target of the convolution divided by an 01080 // area-based weight; the lowest weight allowed is min_Zwt 01081 scale = contribution / (((p1z < min_Zwt) ? min_Zwt : p1z) * R); 01082 /* loop over kernel */ 01083 for (x2 = xlo, 01084 x2_off = xlo *strIx, 01085 xO_off = (xlo - xROlo)*strOx; 01086 x2 <= xhi; 01087 x2++, 01088 x2_off += strIx, 01089 xO_off += strOx) { 01090 // optimizations for xx_term, offsets help a little (~5%): Apr 2010 01091 xx_term = (p1x - x2)*(p1x - x2)*wt_xx; 01092 for (y2 = ylo, 01093 y2_off = ylo *strIy, 01094 yO_off = (ylo - yROlo)*strOy; 01095 y2 <= yhi; 01096 y2++, 01097 y2_off += strIy, 01098 yO_off += strOy) { 01099 if (blocks[x2_off+y2_off] == Block_Offdisk) continue; 01100 p2z = s[x2_off+y2_off]; 01101 dist = xx_term + 01102 (p1y - y2) * (p1y - y2) * wt_yy + 01103 (p1y - y2) * (p1x - x2) * wt_xy + 01104 (p1z - p2z) * (p1z - p2z) * wt_zz; // distance ||P1-P2|| 01105 /* do kernel lookup on distance */ 01106 if (dist >= Ktop) continue; // dist >= Ktop: no contribution to b 01107 inxC = dist * Kstep; 01108 inxD = floor(inxC); 01109 if (inxD >= Klen-1) continue; // rounding puts dist on cusp: skip 01110 // note, now 0 <= inxD < Klen-1 01111 dist = inxC - inxD; /* in [0,1), smaller means closer to inxD */ 01112 /* 01113 printf(" adding %g->%d, wt %g, interp %g\n", inxC, inxD, kernel[inxD], dist); 01114 */ 01115 b[xO_off+yO_off] += 01116 (kernel[inxD]*(1.0-dist) + kernel[inxD+1]*dist) * scale; 01117 } // for y2 01118 } // for x2 01119 } // for (x1,y1) 01120 return 0; 01121 } 01122 01123 /* 01124 * smooth_thread: shell for Pthread invocation of the smoothing operation 01125 * 01126 * Each thread operates largely independently, selecting a rectangular region 01127 * of the input a, dividing along x and taking all of y, and sending that to 01128 * the core smoother. Once the smoothed rectangle comes back, it is accumulated 01129 * into the overall output matrix. 01130 * The thread control at this level needs two mutexes. One to control the 01131 * division of the input among the threads (smooth_bb_top, which indicates 01132 * the next un-smoothed row in the input). Another to control accumulation 01133 * into the result. We make it simple, so that only one thread can touch the 01134 * output at a time. By looking at timings, there seems to be little contention 01135 * at this point. 01136 * 01137 */ 01138 static 01139 void * 01140 smooth_thread(void *arg) 01141 { 01142 const int verbose = 0; 01143 const int bb_max = SI.n-1; // the maximum value the rectangle needs to encompass 01144 double * const b = SI.b; // points to output matrix 01145 double *b1; // individual output matrix, lumped into b 01146 int rc; 01147 struct thread_args *sa; 01148 struct timeval tv1, tv2; 01149 int bb_top, bb_low, bb_hi; // bounding box boundary bookkeeping 01150 int bb_total, bb_count; // internal counts for informational printouts 01151 int b_zero; // offset to y origin of bb passed to smooth_rect 01152 int xsize, ysize; 01153 int x, y; 01154 int p1_off, p2_off; 01155 01156 // arguments are taken from global arg list SI, and per-thread arg list *sa 01157 gettimeofday(&tv1, NULL); 01158 sa = (struct thread_args *) arg; 01159 // split the input along x (0...n-1) 01160 bb_total = bb_count = 0; 01161 while (1) { 01162 // carve off a new range of rows 01163 pthread_mutex_lock(&smooth_bb_top_mutex); 01164 bb_top = SI.smooth_bb_top; // NB, bb_top row itself is NOT assigned to a thread 01165 if (bb_top <= bb_max) { 01166 // more rows to do: find new range 01167 bb_low = bb_top; 01168 bb_hi = bb_top + SI.Rwid - 1; // bb_hi is assigned to this thread 01169 if (bb_hi > bb_max) bb_hi = bb_max; // cap at the max x 01170 // install new top 01171 SI.smooth_bb_top = bb_hi + 1; 01172 } 01173 pthread_mutex_unlock(&smooth_bb_top_mutex); 01174 if (bb_top > bb_max) 01175 break; // there were no more rows to do 01176 bb_count++; 01177 bb_total += bb_hi - bb_low + 1; 01178 // new range of rows is [bb_low,bb_hi], inclusive 01179 // calloc a rectangle big enough to hold the convolved rectangle 01180 xsize = (bb_hi - bb_low + 1) + SI.xwid; 01181 b_zero = bb_low - SI.xwid/2; // b1(y==b_zero) will go into b(y==bb_low) 01182 ysize = SI.m; 01183 // fresh calloc each iteration is faster than clearing b1 with memset() 01184 b1 = calloc(xsize*ysize, sizeof(*b1)); 01185 rc = smooth_rect(SI.m, SI.n, 01186 // bounding rectangle (input images) 01187 bb_low, bb_hi, 01188 0, SI.m-1, 01189 b_zero, 0, 01190 // strides (input images) 01191 SI.strIx, SI.strIy, 01192 // strides (output image) 01193 ysize, 1, 01194 // disk geometry 01195 SI.xcen, SI.ycen, SI.R, SI.cos_a, SI.sin_a, 01196 // input matrices 01197 SI.a, SI.s, 01198 // output matrix 01199 b1, 01200 // kernel 01201 SI.kernel, SI.Klen, SI.Ktop, SI.Kswath, SI.Kwt, 01202 // block structure of input 01203 SI.blocks, SI.stats); 01204 sa->status = rc; // exit status, 0 is OK 01205 if (rc != 0) { 01206 free(b1); 01207 pthread_exit(NULL); // trouble 01208 } 01209 //printf(" thread #%d: handled [%d,%d] * %d @ %d\n", sa->tnum, 01210 // bb_low, bb_hi, SI.xwid, b_zero); 01211 // accumulate into the final result 01212 pthread_mutex_lock(&accum_mutex); 01213 // x loop is over all of b1, some of which may be aligned with 01214 // an off-image part of the output b 01215 for (x = 0; x < xsize; x++) 01216 for (y = 0; y < ysize; y++) { 01217 // off-image check: is (x,y) on-image for b? 01218 if ((x + b_zero < 0) || (x + b_zero >= SI.n)) continue; 01219 p1_off = x*ysize + y*1; // offset into b1 01220 p2_off = (x+b_zero)*SI.strIx + y*SI.strIy; // offset into b 01221 b[p2_off] += b1[p1_off]; 01222 } 01223 pthread_mutex_unlock(&accum_mutex); 01224 free(b1); 01225 } 01226 // thread is done 01227 gettimeofday(&tv2, NULL); 01228 int dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 01229 (((int) tv2.tv_usec) - ((int) tv1.tv_usec)); 01230 if (verbose) 01231 mexPrintf(" ST: exiting thread #%d, did %d/%d, dt = %.1f ms\n", 01232 sa->tnum, bb_count, bb_total, dt/1000.0); 01233 pthread_exit(NULL); 01234 } 01235 01236 01237 /* control: coordinates the smoothing operation 01238 * 01239 * sets up blocking data structures, 01240 * launches threads, combines results 01241 */ 01242 static 01243 int 01244 control(int m, // image size 01245 int n, 01246 double *a, // input is mXn 01247 double *b, // output is mXn (convolved result) 01248 double *s, // output is mXn (z coordinate) 01249 double *p, // output, mXn (patch indexes) 01250 double xcen, // center, n or x, in C coords, origin at 0 01251 double ycen, // center, m or y, in C coords, origin at 0 01252 double R, // radius, in pixels 01253 double p0, // tilt angle 01254 double *kernel, // kernel lookup table 01255 int Klen, // length of the table 01256 double Ktop, // upper limit of the table (0..Ktop) 01257 int Kswath, // box enclosing kernel 01258 double *Kwt, // weights on x, y, z; angle 01259 double *bws_in, // block lookup table (length x 2) 01260 int bwlen) // its length 01261 01262 { 01263 const int verbose = 0; 01264 const int mn = m*n; 01265 const double cos_a = cos(p0); // cos(alpha) 01266 const double sin_a = sin(p0); // sin(alpha) 01267 const double nan = mxt_getnand(); // cache nan 01268 int s_alloc_local; // did we alloc s here? 01269 blocknum_t blockNum, bn; 01270 blocknum_t *blocks; 01271 blockstat_t *stats; 01272 double tau1, *taus; // z-thresholds for blocking 01273 int *bws; // block widths (int's) 01274 int x1, y1, off; // array position and offset 01275 double p1x, p1y; // image position 01276 double temp; 01277 int xwid, ywid, Rwid; // prototype rectangle widths 01278 int t, NT; // thread counts 01279 pthread_t threads[MAX_THREAD];// thread info 01280 struct timeval tv1, tv2; 01281 int dt; 01282 01283 gettimeofday(&tv1, NULL); 01284 /* 01285 * 1: Allocations 01286 */ 01287 // s, if necessary 01288 if (s_alloc_local = (s == NULL)) 01289 s = calloc(mn, sizeof(double)); 01290 if (!s) return SMOOTH_NOMEM; 01291 // block-number array 01292 blocks = calloc(mn, sizeof(blocknum_t)); /* must be cleared */ 01293 if (!blocks) return SMOOTH_NOMEM; 01294 01295 /* 01296 * 2: Find z 01297 */ 01298 // loop over image pixels 01299 off = 0; // in general, off = x1*m + y1 01300 for (x1 = 0; x1 < n; x1++) 01301 for (y1 = 0; y1 < m; y1++) { 01302 // find original vector coordinates P1 = (px1, py1, pz1) 01303 p1x = x1 - xcen; 01304 p1y = y1 - ycen; 01305 temp = (R + p1x) * (R - p1x) - p1y*p1y; /* R*R-x*x = (R-x)*(R+x) */ 01306 if (temp < 0) 01307 b[off] = s[off] = nan; // off-disk 01308 else 01309 s[off] = sqrt(temp); // +sqrt: visible half-sphere 01310 off++; 01311 } 01312 gettimeofday(&tv2, NULL); 01313 dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 01314 (((int) tv2.tv_usec) - ((int) tv1.tv_usec)); 01315 if (verbose) 01316 mexPrintf(" do_SSa: dt = %.1f ms\n", dt/1000.0); 01317 01318 /* 01319 * 3: Set up blocking info 01320 */ 01321 // juggle the block table we were given 01322 taus = calloc(bwlen, sizeof(*taus)); 01323 bws = calloc(bwlen, sizeof(*bws)); 01324 if (!taus || !bws) return SMOOTH_NOMEM; 01325 block_lut_reformat(&tau1, taus, bws, bws_in, bwlen, R); 01326 // sets up blocks, finds number of blocks 01327 blockNum = block_create(blocks, s, a, tau1, taus, bws, m, n); 01328 if (blockNum == Block_Min) 01329 return SMOOTH_INTERNAL; // logic error in code 01330 // don't need these any more 01331 free(taus); free(bws); taus = NULL; bws = NULL; 01332 // printf("Using %d blocks\n", blockNum); 01333 // compute aggregate block statistics 01334 stats = calloc(blockNum+1, sizeof(*stats)); 01335 if (!stats) return SMOOTH_NOMEM; 01336 if (!block_stats(stats, blocks, s, a, m, n, blockNum)) 01337 return SMOOTH_INTERNAL; // out-of-range block number 01338 gettimeofday(&tv2, NULL); 01339 dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 01340 (((int) tv2.tv_usec) - ((int) tv1.tv_usec)); 01341 if (verbose) 01342 mexPrintf(" do_SSb: dt = %.1f ms\n", dt/1000.0); 01343 01344 /* 01345 * 4: Set up thread info 01346 */ 01347 NT = get_thread_num(); // check env for advice 01348 // determine maximum rectangle width 01349 swath_max_size(&xwid, &ywid, xcen, ycen, R, cos_a, sin_a, Kwt, Kswath, m, n); 01350 // could make Rwid dependent on Kswath... 01351 // n=4096, NT=8, Kswath = 200 => Rwid = 128, 1+Ks/Rwid = 2.5 01352 Rwid = n / (NT * 4); 01353 if (Rwid < 4) Rwid = 4; // best not to go too small, certainly need > 0 01354 if (NT == 1) Rwid = n; // if one thread, do it all at once 01355 // record rectangle extent, common across threads 01356 SI.xwid = xwid; // max size along x direction 01357 SI.ywid = ywid; // max size along y 01358 SI.Rwid = Rwid; // target size for each thread's rectangle 01359 // set up geometry and param info valid for all calls to the smoother 01360 SI.m = m; SI.n = n; 01361 SI.strIx = m; SI.strIy = 1; // could generalize these strides 01362 SI.xcen = xcen; SI.ycen = ycen; 01363 SI.R = R; SI.cos_a = cos_a, SI.sin_a = sin_a; 01364 // input arrays (image size) 01365 SI.a = a; SI.s = s; 01366 // output array (image size) 01367 SI.b = b; 01368 // kernel 01369 SI.kernel = kernel; SI.Klen = Klen; SI.Ktop = Ktop; 01370 SI.Kswath = Kswath; SI.Kwt = Kwt; 01371 SI.blocks = blocks; SI.stats = stats; 01372 01373 /* 01374 * 5: Do the smoothing in threads 01375 */ 01376 pthread_mutex_init(&blockstat_mutex, NULL); 01377 pthread_mutex_init(&accum_mutex, NULL); 01378 pthread_mutex_init(&smooth_bb_top_mutex, NULL); 01379 // shared across threads, this marks our current position in filling 01380 // in the output -- it indicates the "next row to convolve" 01381 SI.smooth_bb_top = 0; 01382 // initiate NT threads 01383 for (t = 0; t < NT; t++) { 01384 // pack changing args 01385 SA[t].tnum = t; // informational for the thread 01386 SA[t].status = 0; // status output 01387 // start the thread 01388 if (pthread_create(threads+t, NULL, smooth_thread, (void *) &SA[t]) != 0) 01389 return SMOOTH_NOTHREAD; 01390 } 01391 // join all threads 01392 for (t = 0; t < NT; t++) { 01393 if (pthread_join(threads[t], NULL) != 0) 01394 return SMOOTH_NOTHREAD; 01395 if (SA[t].status != 0) 01396 return SMOOTH_INTERNAL; // internal error in the rectangle smoother 01397 } 01398 pthread_mutex_destroy(&blockstat_mutex); 01399 pthread_mutex_destroy(&accum_mutex); 01400 pthread_mutex_destroy(&smooth_bb_top_mutex); 01401 gettimeofday(&tv2, NULL); 01402 dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 01403 (((int) tv2.tv_usec) - ((int) tv1.tv_usec)); 01404 if (verbose) 01405 mexPrintf(" do_SSc: dt = %.1f ms\n", dt/1000.0); 01406 01407 /* 01408 * 6: Clean up and exit 01409 */ 01410 if (block_check(stats, blockNum) != 0) { 01411 mexWarnMsgTxt("Error: did not integrate with all blocks\n"); 01412 return SMOOTH_INTERNAL; 01413 } 01414 // optionally copy blocks into p 01415 if (p) 01416 block_export(p, blocks, m, n); 01417 free(blocks); 01418 free(stats); 01419 if (s_alloc_local) 01420 free(s); 01421 return blockNum; // always >= 0 01422 } 01423 01424 #ifdef StaticP /* undefined under mex */ 01425 StaticP 01426 #endif 01427 void 01428 mexFunction(int nlhs, 01429 mxArray *plhs[], 01430 int nrhs, 01431 const mxArray *prhs[]) 01432 { 01433 const int verbose = 0; 01434 char errstr[160]; 01435 char *msg; 01436 double datamin, datamax; 01437 double *s_data; // place to put s in case we don't return it 01438 double *p_data; // place to put p in case we don't return it 01439 mxArray *wt_holder; // place to put KWt if it wasn't supplied 01440 int m, n; // image size 01441 double xcen, ycen, rsun, p0;// disk params 01442 double *Klut; // kernel lookup table 01443 int Nker; // and its length 01444 double Ktop; // lut spans [0,Ktop] 01445 double KswathR; int KswathI;// box enclosing kernel 01446 double *bws; int bwlen; // block width lookup table 01447 int ok; 01448 struct timeval tv1, tv2; 01449 01450 gettimeofday(&tv1, NULL); 01451 /* Hook for introspection (function signature, docstring) */ 01452 if (nrhs < 0) { 01453 plhs[0] = mxt_PackSignature((mxt_Signature) (-nrhs), 01454 NARGIN_MIN, NARGIN_MAX, 01455 NARGOUT_MIN, NARGOUT_MAX, 01456 in_names, in_specs, out_names, docstring); 01457 return; 01458 } 01459 /* check number of arguments */ 01460 if ((nrhs < NARGIN_MIN) || (nrhs > NARGIN_MAX)) 01461 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01462 "%s: Expect %d <= input args <= %d", 01463 progname, NARGIN_MIN, NARGIN_MAX), errstr)); 01464 if ((nlhs < NARGOUT_MIN) || (nlhs > NARGOUT_MAX)) 01465 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01466 "%s: Expect %d <= output args <= %d", 01467 progname, NARGOUT_MIN, NARGOUT_MAX), errstr)); 01468 mexargparse(nrhs, prhs, in_names, in_specs, NULL, progname); 01469 01470 /* get size of input image x */ 01471 m = (int) mxGetM(prhs[ARG_X]); // m = y 01472 n = (int) mxGetN(prhs[ARG_X]); // n = x 01473 /* get disk params */ 01474 xcen = mxGetPr(prhs[ARG_GEOM])[0] - 1; // arg #1 = x0 (convert to C origin) 01475 ycen = mxGetPr(prhs[ARG_GEOM])[1] - 1; // arg #2 = y0 (convert to C origin) 01476 rsun = mxGetPr(prhs[ARG_GEOM])[2]; // arg #3 = radius 01477 // (do not use beta angle now) 01478 p0 = mxGetPr(prhs[ARG_GEOM])[4]; // p-angle 01479 p0 *= M_PI/180.0; // to radians 01480 // kernel params 01481 Ktop = mxGetPr(prhs[ARG_KParam])[0]; 01482 KswathR= mxGetPr(prhs[ARG_KParam])[1]; 01483 KswathI= (int) KswathR; 01484 if (Ktop <= 0 || Ktop >= 1) 01485 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01486 "%s: Expect %g <= Kpar(1) < %g, got %g", 01487 progname, 0.0, 1.0, Ktop), errstr)); 01488 if (KswathI < 1 || KswathI != KswathR) 01489 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01490 "%s: Expect integer Kpar(2) >= %d, got %g", 01491 progname, -1, KswathR), errstr)); 01492 // kernel LUT - make or use supplied one 01493 if (mxGetNumberOfElements(prhs[ARG_K]) == 1) { 01494 // a Gaussian kernel width was given 01495 Nker = 256; // this is a reasonable default 01496 Klut = calloc(Nker, sizeof(*Klut)); 01497 if (!Klut) 01498 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01499 "%s: Could not allocate kernel LUT (length %d)", 01500 progname, Nker), errstr)); 01501 // make_lut checks for negatives, NaNs 01502 if (!make_kernel_lut(Klut, Nker, Ktop, mxGetScalar(prhs[ARG_K]))) 01503 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01504 "%s: Illegal spec for kernel LUT (check kparam, k(1))", 01505 progname), errstr)); 01506 } else { 01507 // explicit kernel given 01508 Klut = mxGetPr(prhs[ARG_K]); 01509 Nker = mxGetNumberOfElements(prhs[ARG_K]); 01510 } 01511 if (0) 01512 for (ok = 0; ok < Nker; ok++) 01513 printf("k[% 3d] = %.6f\n", ok, Klut[ok]); 01514 01515 // KWt 01516 // if there's no KWt arg, make up one to fill in later 01517 if (nrhs > ARG_KWt) 01518 wt_holder = (mxArray *) prhs[ARG_KWt]; /* if uncast, the RHS is const */ 01519 else 01520 wt_holder = mxCreateDoubleMatrix(0,0,mxREAL); 01521 01522 // bin params 01523 if (nrhs > ARG_BWs) { 01524 bws = mxGetPr(prhs[ARG_BWs]); 01525 bwlen = mxGetM(prhs[ARG_BWs]); 01526 if (bwlen > 0 && mxGetN(prhs[ARG_BWs]) != 2) 01527 // (empty is OK) 01528 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01529 "%s: bws LUT should have 2 columns", 01530 progname), errstr)); 01531 } else { 01532 if (m < 2048) { 01533 // "small" image case 01534 bws = Default_BWs_small; // just a list of numbers 01535 bwlen = Default_BWs_count_small/2; // always 2 columns 01536 } else { 01537 // "large" image case 01538 bws = Default_BWs_large; // just a list of numbers 01539 bwlen = Default_BWs_count_large/2; // always 2 columns 01540 } 01541 } 01542 if ((msg = block_lut_validate(bws, bwlen)) != NULL) 01543 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01544 "%s: bws LUT format error: %s", 01545 progname, msg), errstr)); 01546 01547 // output y 01548 plhs[ARG_Y] = mxCreateDoubleMatrix(m, n, mxREAL); // always returned 01549 if (!plhs[ARG_Y]) 01550 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01551 "%s: Could not allocate y output", 01552 progname), errstr)); 01553 // s, if we want it 01554 if (nlhs > ARG_S) { 01555 plhs[ARG_S] = mxCreateDoubleMatrix(m, n, mxREAL); // optionally returned 01556 if (!plhs[ARG_S]) 01557 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01558 "%s: Could not allocate s output", 01559 progname), errstr)); 01560 s_data = mxGetPr(plhs[ARG_S]); 01561 } else { 01562 s_data = NULL; 01563 } 01564 // p, if we want it 01565 if (nlhs > ARG_P) { 01566 plhs[ARG_P] = mxCreateDoubleMatrix(m, n, mxREAL); // optionally returned 01567 if (!plhs[ARG_P]) 01568 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01569 "%s: Could not allocate p output", 01570 progname), errstr)); 01571 p_data = mxGetPr(plhs[ARG_P]); 01572 } else { 01573 p_data = NULL; 01574 } 01575 01576 /* call the function */ 01577 ok = control(m, n, // sizes 01578 mxGetPr(prhs[ARG_X]), // input is mXn 01579 mxGetPr(plhs[ARG_Y]), // convolved result is mXn 01580 s_data, // z coord, mXn (opt. returned if non-null) 01581 p_data, // patch indexes, mXn (opt. returned if non-null) 01582 xcen, // center, n or x, in C coords, origin at 0 01583 ycen, // center, m or y, in C coords, origin at 0 01584 rsun, // radius, in pixels 01585 p0, // p-angle, in radians 01586 Klut, // kernel LUT 01587 Nker, // and its size 01588 Ktop, // lower range of the table 01589 KswathI, // box enclosing kernel 01590 mxt_make_vector(wt_holder, KWt_count,Default_KWt, Default_KWt_count), 01591 bws, 01592 bwlen); 01593 // ok holds number of patches, or negative if trouble 01594 if (ok < 0) 01595 mexErrMsgTxt((snprintf(errstr, sizeof(errstr), 01596 "%s: Failed calloc, thread, or internal error in control(), code = %d", 01597 progname, ok), errstr)); 01598 01599 /* set up range of first output */ 01600 // getrange(prhs[ARG_X], &datamin, &datamax); 01601 datamin = 0; datamax = 2; /* input is treated as an indicator */ 01602 setrange(plhs[ARG_Y], datamin, datamax); 01603 /* optional "s" output */ 01604 if (nlhs > ARG_S) { 01605 setrange(plhs[ARG_S], 0.0, rsun); /* "z" index in pixels */ 01606 } 01607 /* optional "p" output */ 01608 if (nlhs > ARG_P) { 01609 // ok is number of patches 01610 setrange(plhs[ARG_P], (double) Block_Min, (double) ok); 01611 } 01612 // free up kernel lut if we generated it internally 01613 if (mxGetNumberOfElements(prhs[ARG_K]) == 1) 01614 free(Klut); 01615 // free up weight matrix placeholder if it was used 01616 if (nrhs <= ARG_KWt) mxDestroyArray(wt_holder); 01617 gettimeofday(&tv2, NULL); 01618 int dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 01619 (((int) tv2.tv_usec) - ((int) tv1.tv_usec)); 01620 if (verbose) 01621 mexPrintf("SS: dt = %.1f ms\n", dt/1000.0); 01622 } 01623 01624 01625 01626 /* Hook for generic tail matter */ 01627 #ifdef MEX2C_TAIL_HOOK 01628 #include "mex2c_tail.h" 01629 #endif 01630