JSOC_Documentation: proj/mag/harp/libs/matlab/mfile-mex/hmi-mask-patch/smoothsphere.c Source File

00001 #include "mex.h"  /* must appear first */
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <math.h>
00005 #include <limits.h>
00006 #include <pthread.h>
00007 #include <sys/time.h> // for timing
00008 #include "mexhead.h"
00009 #include "Doc/smoothsphere_docstring.h"  /* autogenerated from this file */
00010 
00011 /**************************************************************
00012 
00013 %smoothsphere: smooth a projected sphere with a kernel
00014 % 
00015 % [y,s,p]=smoothsphere(x,geom,k,kparam,kwt,bws)
00016 % * Given a solar image of location given by `geom', smooth it 
00017 % using the radially-symmetric kernel `k'.
00018 % * This version is threaded, using 8 threads by default.
00019 % Set MXT_NUM_THREADS in the environment to lower this number.
00020 % * Off-disk values are indicated by NaN in the output.
00021 % * The kernel used is dependent only on the weighted distance 
00022 % between two positions, say P1 and P2, in three-dimensional 
00023 % coordinates, normalized to live on the unit sphere:
00024 %   d = P1 - P2
00025 %   dist = d' W d  (>= 0)
00026 % for a diagonal weight matrix W.
00027 % The kernel values in `k' are specified for values of `dist'
00028 % in a linear range from 0 to kparam(1), typically 0.015.  
00029 % Given a certain value of `dist' found between an image pixel 
00030 % and the kernel center, the associated weight is interpolated 
00031 % linearly using the table.  If dot > kparam(1), a zero weight 
00032 % is used.
00033 % * We have typically let k be a gaussian kernel of width
00034 % (standard deviation) = 0.0325, i.e., 
00035 %   k(dist) = C * exp[ -0.5 * dist / sqr(0.0325) ], 
00036 % since dist is a squared quantity already, and C is a constant
00037 % chosen to make k have unit norm as a spherical kernel.
00038 % * As a shortcut, if k is a scalar, the kernel is taken to 
00039 % be the above function, but with width 0.0325 multiplied by
00040 % k(1).  In this case, the LUT consists of 256 points evenly
00041 % spaced over [0,kparam(1)].
00042 % * The diagonal portion of the distance weight matrix W may be 
00043 % supplied as a triple kwt.  This weights distances between 
00044 % P1 and P2 in the x, y, and z directions respectively.   The
00045 % P-angle (rotation in the x-y plane) is taken into account in
00046 % this weighting, so that y is cross-track, and x and z are
00047 % always along-track.
00048 % * Typically the kernel falls to zero rather quickly.  As a
00049 % computational shortcut, it is assumed that the kernel extends 
00050 % only kparam(2) pixels on each side of its center.  For W = I,
00051 % this implies that an "on" pixel in x extends to influence at 
00052 % most a "swath" 2*kparam(2)+1 pixels on a side.  Both the P-angle 
00053 % and the W matrix are taken into account in finding the swath.
00054 % For instance, a W_y > 1 will cause the y portion of the swath
00055 % to shrink, because distance decreases faster.  The swath is 
00056 % always parallel to the (i,j) image axes, so P-angles not a 
00057 % multiple of 90 degrees will cause the swath to be enlarged
00058 % to contain a rotated rectangle.
00059 % * To decrease computational load, nearby pixels may be grouped
00060 % into meta-pixels called blocks.  This is especially trouble-free
00061 % away from the limb, where local geometry is nearly planar, so
00062 % the blocking is given as a table that depends on z (in [0,1]).
00063 % A row in the table of (z,bw) means: above the value z, use
00064 % a blocking of bw (>1).  For z=0 to bws(1,1), no blocking is used.
00065 % * For example, the small-image default means to use single pixels 
00066 % below 0.4, 2x2 blocks above 0.4, and 4x4 above 0.6.  For typical 
00067 % MDI images, this is 16%, 20%, and 64% of on-disk pixels, respectively.
00068 % Using block sizes that do not pack together is legal, but causes 
00069 % inefficiency due to poor fits between abutting blocks; diagnose
00070 % bad packing with the p output.
00071 % * Supply bws=[] to treat all pixels singly (turn off blocking).
00072 % * The optional output s is the z-coordinate (in [0,R]).
00073 % The optional p output is the block (patch) number of each
00074 % pixel.
00075 % 
00076 % Inputs:
00077 %   real x[m,n];
00078 %   real geom[5];  -- [x0 y0 rsun b0 p0]
00079 %   real k[p] or k[1];
00080 %   real kparam[2];  -- [top window]
00081 %   opt real kwt = [1 1 1];
00082 %   opt real bws[bwnum,2] = [0.4 2;0.6 4];         -- m <  2048
00083 %                         = [0.3 2;0.5 4; 0.7 8];  -- m >= 2048
00084 % 
00085 % Outputs:
00086 %   real y[m,n];
00087 %   opt real s[m,n];
00088 %   opt int  p[m,n];
00089 % 
00090 % See also: 
00091 
00092 % implemented as a mex file
00093 
00094 ****************************************************************/
00095 
00096 /****************************************************************
00097  Notes October 2010:
00098 
00099  * Pixel ordering.  I have started using a "mode" input that can be given
00100  as SESW for HMI and SENE for the old, transposed-MDI image ordering.  See
00101  roi_stats_mag for how this works.  Doing this allows this one code to be
00102  easily used for both cases.  It is implemented by adding a stride
00103  parameter for both x and y to the underlying computational routines.  The
00104  stride, and the x/y loop boundaries, are switched depending on "mode".
00105  For now, it's easier to just leave it alone; the user has to monkey with
00106  x0 and y0 to get the right behavior.
00107 
00108  * Beta.  I pass in beta-angle, but I don't use it.  I should.
00109  Perceived computational burden, as well as lack of technical clarity,
00110  has been the issue.  
00111  Technical clarity can come from the following observation.  
00112  Consider two points, P1 (the center point of the convolution) and P2 
00113  (the point where the kernel is evaluated).  Now, the distance is a 
00114  function of:
00115 
00116    D = P1 - P2 (in 3d coordinates: x, y, and z)
00117 
00118  In 3D coordinates, it is hard to account for beta, while still performing
00119  the right weighting according to kwt.  One way out is to realize there are
00120  two angles (distances along the sphere) that are important: longitudinal
00121  and latitudinal.  You can always talk about the offset of P2 from P1 in
00122  terms of a delta-lat and a delta-lon.  The kwt penalty should have two
00123  terms in this case, the penalty for a given delta-lat and that for a given
00124  delta-lon.  (We don't measure delta-lat and delta-lon in degrees, because
00125  the polar singularity would cause delta-lon to be unstable.)
00126 
00127  One way to find delta-lat and delta-lon is to inner-product the 
00128  difference D with an along-track unit vector and a cross-track unit
00129  vector.  (These are per-pixel triplets, found once at P1 and used for all
00130  P2.)  It might be best to just use a cross-track vector, and then 
00131  dump the entire residual into the along-track vector, so that the entire
00132  difference D is penalized using either the along-track or the cross-track
00133  weighting.  (I.e., D will not lie in the space spanned by just the
00134  along-track and cross-track vectors; there will be a third degree of 
00135  freedom because the planar approximation is not exact.)
00136 
00137  Depending on how it was done, this could correspond to a chord 
00138  length or an arc length.  It probably doesn't much matter which, because
00139  the kernel is a merely a computational device to obtain smoothing, and
00140  a simple scaling rule relates the two.
00141 
00142  This does not seem that burdensome computationally; it's about the same
00143  amount of computation in the inner loop as now, just a handful of multiplies
00144  and adds.  It should be prototyped to see how the results look for high beta.
00145 
00146 ****************************************************************/
00147 
00148 /* constants and globals used for error checking */
00149 
00150 #define NARGIN_MIN  4      /* min number of inputs */
00151 #define NARGIN_MAX  6      /* max number of inputs */
00152 #define NARGOUT_MIN 0      /* min number of outputs */
00153 #define NARGOUT_MAX 3      /* max number of outputs */
00154 
00155 #define ARG_X      0
00156 #define ARG_GEOM   1
00157 #define ARG_K      2
00158 #define ARG_KParam 3
00159 #define ARG_KWt    4
00160 #define ARG_BWs    5
00161 
00162 #define ARG_Y      0
00163 #define ARG_S      1
00164 #define ARG_P      2
00165 
00166 static const char *progname = "smoothsphere";
00167 #define PROGNAME smoothsphere
00168 static const char *in_specs[NARGIN_MAX] = {
00169   "RM",
00170   "RV(5)",
00171   "RV",
00172   "RV(2)",
00173   "RV(0)|RV(3)",
00174   "RM"};
00175 static const char *in_names[NARGIN_MAX] = {
00176   "x",
00177   "geom",
00178   "k",
00179   "kparam",
00180   "kwt",
00181   "bws"};
00182 static const char *out_names[NARGOUT_MAX] = {
00183   "y", "s", "p"};
00184 
00185 
00186 // short form of PROGNAME for #include identifiers
00187 #define SHORTNAME ssp
00188 
00189 #define KWt_count 3
00190 #define Default_KWt_count 3
00191 static double Default_KWt[Default_KWt_count] = { 1.0, 1.0, 1.0 };
00192 #define Default_BWs_count_small 4
00193 static double Default_BWs_small[Default_BWs_count_small] = { 0.4, 0.6, 2.0, 4.0 };
00194 #define Default_BWs_count_large 6
00195 static double Default_BWs_large[Default_BWs_count_large] = { 0.3, 0.5, 0.7, 2.0, 4.0, 8.0 };
00196 
00197 /********************************************************************************
00198  *
00199  * Utility for making kernels
00200  *
00201  ********************************************************************************/
00202 
00203 /*
00204  * make_kernel_lut: make a lookup table for a gaussian kernel
00205  * lut has Nker points, from 0..hi_ker (hi_ker typically = 0.015)
00206  * width of the kernel is kwid, which is given in scaled units,
00207  * so that kwid is order unity.
00208  * The lut, of length Nker, must be already allocated.
00209  */
00210 static
00211 int
00212 make_kernel_lut(double *ker, int Nker, double hi_ker, double kwid)
00213 {
00214   const double del = hi_ker/(Nker-1); // Nker fenceposts, Nker-1 gaps
00215   double x;
00216   double scalefac;
00217   double sum;
00218   double norm;
00219   int i;
00220 
00221   if (kwid <= 0   || isnan(kwid) || 
00222       hi_ker <= 0 || isnan(hi_ker) ||
00223       Nker <= 1   || ker == NULL)
00224     return 0;
00225   kwid *= 0.0325; // = the rescaled kernel width
00226   scalefac = 1/(2*kwid*kwid); // = the multiplier in the exp() call
00227   // fill in exp(- x / (2*kwid*kwid)) = exp(- x * scalefac)
00228   for (sum = i = 0; i < Nker; i++) {
00229     x = i*del;
00230     ker[i] = exp(- x * scalefac);
00231     sum += ker[i];
00232   }
00233   // find the normalization factor for unit mass
00234   // this should perhaps use del, but we use hi_ker/Nker because
00235   // that is what we have historically used.
00236   norm = M_PI * (hi_ker/Nker) * sum;
00237   scalefac = 1/norm;
00238   // re-normalize ker
00239   for (i = 0; i < Nker; i++)
00240     ker[i] *= scalefac;
00241   return 1;
00242 }
00243 
00244 
00245 /********************************************************************************
00246  *
00247  * Utilities for blocked convolution
00248  *
00249  ********************************************************************************/
00250 
00251 /*
00252  * Idea: Divide source pixels into classes.  The class is a blocknum_t (like an
00253  * int) and it is a tag for a "meta-pixel" that subsumes a whole block of 
00254  * source-image pixels.  Each block is summarized by a blockstats_t, and is 
00255  * convolved all at once.
00256  *
00257  * Blocking is done because the main source of computational cost in this 2d convolution
00258  * is large AR's.  Formerly, each pixel in a large AR was mapped individually to the
00259  * output.  If the kernel width is KW, this one pixel results in KW^2 updates to the
00260  * output, each of which requires many flops for computation of a distance, and 
00261  * kernel lookup of the distance.  (The issue is *not* tiny one or two-pixel AR's; this
00262  * has been checked on real data; the typical AR pixel has 5-6 neighbors.)  Much
00263  * of this computation was wasted, because the smoothed activity is thresholded.
00264  *
00265  * To cut this time, we collect AR pixels that are relatively far from the limb into
00266  * blocks of size BWxBW, which are handled together.  The scheme below is more flexible
00267  * than this; it allows any blocking scheme (not just uniform square blocks).  Any mapping
00268  * of pixels to blocks is OK.  To change it, see block_create.
00269  *
00270  * The scheme assigns each pixel a blocknum_t.  If this is <0, it indicates the
00271  * pixel is handled individually.  If >0, it is a block number; all pixels with
00272  * that block number are handled in a single convolution inner loop over KW*KW
00273  * pixels.  The blocks are assigned by one routine, and then another loop over
00274  * the image computes the overall weight (sum of the weights in the block), and 
00275  * the net position (weighted by the convolved input).  The summary of each block
00276  * is put into a blockstat_t structure.  It would even be possible to combine 
00277  * blocks by manipulating just the blockstat_t summaries.
00278  *
00279  * Fundamentally, because considerable processing is devoted to each nonzero, ondisk
00280  * pixel, considerable intelligence can be used to optimize blocks.  Note that no
00281  * blocks are created for singletons (they are handled individually), and that
00282  * each block must contain some activity; the whole away-from-limb disk is not blocked
00283  * in its entirety.  It's typical to have a few thousand blocks (1024^2 image).
00284  * The current scheme uses blocks of width BW that are aligned on multiples of BW
00285  * in the input image, this is more efficient than fully ad-hoc block placement,
00286  * which tends to cause interference.
00287  *
00288  * We tried simpler combination schemes, but it turns out to be hard that every 
00289  * value in the source is being used exactly once, because of the interaction of
00290  * the blocks, the limb, and the offdisk stuff.
00291  *
00292  * In the final convolution loop, we look at the blocknum_t of each pixel.
00293  * If it is a block (and not a singleton), we look up its block in the list,
00294  * and use its average position and cumulative weight as a "metapixel".  We
00295  * also mark the block as used.  When we come to next pixel that is a member 
00296  * of that block, we do not count it again.
00297  */
00298 
00299 // legal block values: 
00300 // offdisk, skip, singleton, clear; or anything from 1 to #blocks
00301 // (thus, any negative values are OK for the explicit block constants)
00302 //  offdisk = outside visible disk
00303 //  skip = visible, but zero weight.  skip during convolution
00304 //  singleton = visible, nonzero weight, treated singly during the convolution
00305 //  clear = un-initialized.  after block_create runs, *all* clears are gone
00306 //  first, first+1, ... = part of the given block
00307 // Thus, block 0 is un-used.  The valid blocks are 1..BlockNum, inclusive
00308 typedef enum {
00309   Block_Min       = -3,      // always holds the minimum value, for external saves
00310   Block_Offdisk   = -3,      // not on the visible disk
00311   Block_Skip      = -2,      // on-disk, zero weight (skippable)
00312   Block_Singleton = -1,      // on-disk, nonzero weight, treated individually
00313   Block_Clear     = 0,       // un-initialized
00314   Block_First     = 1,       // this and following numbers: members of a given block
00315   Block_MAX       = INT_MAX  // ensures it's as big as an int
00316 } blocknum_t;
00317 // is the given block number part of a group
00318 #define BLOCK_GROUPED(b) (b>=Block_First)
00319 
00320 /*
00321  * statistics summarizing a block: its mass-weighted location (x,y,z),
00322  * its weight (wt>0), and its function value (f).  If the function
00323  * value is zero, the block is effectively turned off.  This is done
00324  * during the iteration once the block has been smoothed into the output.
00325  */
00326 typedef struct {
00327   double x;  // location
00328   double y;
00329   double z;
00330   double wt; // weight (sum of abs(f))
00331   double f;  // function value
00332 } blockstat_t;
00333 
00334 
00335 /*
00336  * block_lut_validate: determine if a bwlen x 2 lookup table
00337  * supplied as an input argument is valid.  returns an
00338  * error message.  LUT order is:
00339  *
00340  *    tau1 block1
00341  *    tau2 block2
00342  *    tau3 block3
00343  *
00344  * There is an implicit 0 above the tau's, and an implicit 1 below
00345  * them.  For 0..tau1, no blocking.  For tau1..tau2, block1.  Etc.
00346  * Above tau3 and up to 1, block3.
00347  */
00348 static
00349 char *
00350 block_lut_validate(double *bws, int bwlen)
00351 {
00352   int i;
00353 
00354   for (i = 0; i < bwlen; i++) {
00355     // bin boundaries
00356     if (bws[i] < 0 || bws[i] > 1)
00357       return "bin boundaries must be in [0,1]";
00358     if (i > 0 && (bws[i-1] >= bws[i]))
00359       return "bin boundaries must be monotone increasing";
00360     // block sizes
00361     if (bws[bwlen+i] < 2)
00362       return "block sizes must be at least 2";
00363     if (((int) bws[bwlen+i]) != bws[bwlen+i])
00364       return "block sizes must be integers";
00365   }
00366   return NULL;
00367 }
00368 
00369 /*
00370  * block_lut_reformat: alter the lut as supplied slightly
00371  * so we can more readily use it.
00372  */
00373 static
00374 void
00375 block_lut_reformat(double *tau1, double *taus, int *bws, 
00376            double *bws_in, int bwlen, double R)
00377 {
00378   int i;
00379 
00380   // empty case
00381   if (bwlen == 0) {
00382     *tau1 = R + 1; // always use singleton rule
00383     return;
00384   }
00385   // non-empty
00386   *tau1 = bws_in[0] * R;
00387   // bin boundaries
00388   for (i = 1; i < bwlen; i++)
00389     taus[i-1] = bws_in[i] * R;
00390   taus[bwlen-1] = R + 1; // last bin > largest z
00391   // block sizes
00392   for (i = 0; i < bwlen; i++)
00393     bws[i] = (int) bws_in[bwlen+i];
00394 }
00395 
00396 /* 
00397  * block_create: make matrix of block-tags from (z,tau), the input map,
00398  * and a blockwidth parameter.  
00399  * Returns the number of distinct tags needed, or -1
00400  * for error (should never happen).
00401  */
00402 static
00403 blocknum_t
00404 block_create(blocknum_t *blocks, double *s, double *a, 
00405          double tau1, double *taus, int *bws, int m, int n)
00406 {
00407   int k;
00408   int x1, y1;
00409   int x2, y2;
00410   int xlo, xhi, ylo, yhi;     // loop bounds
00411   blocknum_t bn = Block_First;
00412   double s1;
00413   int bw;
00414 
00415   /* loop over image pixels */
00416   for (x1 = 0; x1 < n; x1++)
00417     for (y1 = 0; y1 < m; y1++) {
00418       // ensure blocks(x1,y1) is assigned a block number
00419       s1 = s[m*x1+y1];
00420       if (isnan(s1)) {
00421     blocks[m*x1+y1] = Block_Offdisk;
00422       } else if (a[m*x1+y1] == 0) {
00423     blocks[m*x1+y1] = Block_Skip; // zero contribution
00424       } else if (s1 < tau1) {
00425     // note, strict < allows tau1 = 0 to suppress all singletons
00426     blocks[m*x1+y1] = Block_Singleton; // singleton
00427       } else if (blocks[m*x1+y1] == Block_Clear) {
00428     // This code block fully controls the discretization
00429     // scheme.  Another scheme would just fill in the
00430     // block number "bn" in a different pattern, being sure
00431     // to not over-write blocks that are already assigned.
00432     //
00433     // determine the block width using s1 and the LUT
00434     // - if the LUT was empty, tau1 will be > R, and
00435     // this case will not be reached.
00436     // - otherwise, the last tau[] will be > R, and will 
00437     // cause exit of the loop
00438     for (k = 0; s1 > taus[k]; k++)
00439       ;
00440     bw = bws[k];
00441     // make a new block numbered bn
00442     xlo = (x1/bw)*bw;  // round down
00443     xhi = xlo + bw;
00444     if (xhi > n) xhi = n;
00445     ylo = (y1/bw)*bw;
00446     yhi = ylo + bw;
00447     if (yhi > m) yhi = m;
00448     for (x2 = xlo; x2 < xhi; x2++)
00449       for (y2 = ylo; y2 < yhi; y2++)
00450         if (blocks[m*x2+y2] == Block_Clear)
00451           blocks[m*x2+y2] = bn;
00452     bn++;
00453       } else if (BLOCK_GROUPED(blocks[m*x1+y1])) {
00454     continue; // already in a block: this is OK
00455       } else {
00456     // should not happen
00457     mexWarnMsgTxt("create: bad blocknum.\n");
00458     return Block_Min; // anything < 0
00459       }
00460     }
00461   return (blocknum_t) ((int) bn - 1); // the last valid block number
00462 }
00463 
00464 /*
00465  * block_stats: compute statistics over all blocks
00466  * Valid blocks run from Block_First (=1) to btot, inclusive of both endpoints.
00467  * Returns 0 for out-of-range block number.
00468  */
00469 static
00470 int
00471 block_stats(blockstat_t *stats, 
00472         blocknum_t *blocks, 
00473         double *z,             // z-coord
00474         double *a,             // the image to be convolved
00475         int m, 
00476         int n, 
00477         blocknum_t btot)
00478 {
00479   int x1, y1;
00480   blocknum_t bn = Block_First;
00481   double wt, norm;
00482 
00483   // stats[].* is already zeroed out
00484   /* loop over image pixels, accumulating statistics */
00485   for (x1 = 0; x1 < n; x1++)
00486     for (y1 = 0; y1 < m; y1++) {
00487       bn = blocks[m*x1+y1];
00488       if (BLOCK_GROUPED(bn)) {
00489     if (bn > btot)
00490       return 0; // error: out-of-range block number
00491     wt = fabs(a[m*x1+y1]);
00492     stats[bn].x  += x1 * wt; // un-centered
00493     stats[bn].y  += y1 * wt; // un-centered
00494     stats[bn].z  += z[m*x1+y1] * wt;
00495     stats[bn].wt += wt;
00496     stats[bn].f  += a[m*x1+y1];
00497       }
00498     } // end for x1, y1
00499   /* loop over blocks, computing average */
00500   for (bn = Block_First; bn <= btot; bn++) {
00501     if (stats[bn].wt == 0)
00502       return 0; // error: empty block
00503     norm = 1.0 / stats[bn].wt;
00504     stats[bn].x *= norm;
00505     stats[bn].y *= norm;
00506     stats[bn].z *= norm;
00507   }
00508   return 1;
00509 }
00510 
00511 /*
00512  * block_check: check that blocks have been cleared
00513  */
00514 static
00515 int
00516 block_check(blockstat_t *stats, int btot)
00517 {
00518   int bn;
00519   int bn_nc = 0; // count blocks not cleared
00520 
00521   /* loop over blocks, checking for uncleared block */
00522   for (bn = Block_First; bn <= btot; bn++)
00523     if (stats[bn].f != 0)
00524       bn_nc++;
00525   return bn_nc;
00526 }
00527 
00528 /*
00529  * convert blocks for output to matlab:  int-to-double
00530  */
00531 static
00532 void
00533 block_export(double *p, int *blocks, int m, int n)
00534 {
00535   int inx;
00536 
00537   for (inx = 0; inx < m*n; inx++)
00538     p[inx] = (double) blocks[inx];
00539 }
00540 
00541 
00542 /******************************************************************
00543  *
00544  * Threading Support
00545  *
00546  * Structures to hold state passed down to computational threads,
00547  * a routine for thread-number selection
00548  *
00549  ******************************************************************
00550  *
00551  * Note on threading strategy:
00552  * The key problem with threading this, is that the convolution is
00553  * pretty wide (typically 200 pixels wide) and so the footprint of
00554  * pixels in the output `b' that are modified by looping over a 
00555  * rectangle of pixels in the input `a' is pretty large.
00556  *
00557  * We do the convolution by splitting the input into strips of size
00558  * Rwid along the x coordinate, each thread smoothing one strip at a 
00559  * time.  These partial results, which are wider than Rwid, are 
00560  * accumulated into the final output array.  E.g., consider:
00561  *
00562  *   Input: X = [x0|x1|x2|x3|x4|x5]
00563  *
00564  * The thread corresponding to x2 = X(:,Rwid*2+[1:Rwid]), say, computes 
00565  * a result y2 which is roughly of width Rwid + Kswath, where Kswath is
00566  * the full width of the kernel.  This partial result must be summed in
00567  * to the full output starting at the correct offset.  It will extend
00568  * Kswath/2 rows "below" Rwid*2, and Kswath/2 rows "above" Rwid*3.
00569  * (The partial result is located in a thread-specific buffer, but the 
00570  * full output array is shared across threads.)
00571  * While it's being accumulated, other threads must not alter the 
00572  * output array.
00573  *
00574  * We pay a penalty for separating the input into chunks, because 
00575  * the corresponding output chunks are enlarged.  Smaller chunks
00576  * means more granularity in the threading, so all threads are always 
00577  * working (good), but too small means that we spend too much time 
00578  * accumulating partial results into a full output (bad).  In general,
00579  * each output array location will be accumulated-into 
00580  * 
00581  *    1 + Kswath/Rwid times,
00582  *
00583  * because in total, if there are k strips, there will be k*Rwid output
00584  * rows, but k*(Rwid+Kswath) result-rows found.  Therefore each output
00585  * row will be written (Rwid+Kswath)/Rwid = 1 + Kswath/Rwid times.
00586  * For HMI, Kswath = 200
00587  *
00588  * We put most of the logic for partitioning into the individual threads. 
00589  * We maintain an index of the largest row still remaining unfinished, and
00590  * let each thread check that index to determine whether to carve off 
00591  * another rectangle.
00592  * 
00593  ******************************************************************/
00594 
00595 /*
00596  * these structures are needed because the pthread interface passes
00597  * only one item to the thread as an argument, so we put all the
00598  * convolution arguments in that one argument
00599  */
00600 
00601 // one instance of this flavor of structure holds common info
00602 struct thread_info {
00603   // unchanging info, like geometry and overall parameters
00604   // (see smooth_rect for descriptions)
00605   // sizes
00606   int m; // m = y
00607   int n; // n = x
00608   // strides
00609   int strIx;
00610   int strIy;
00611   // disk geometry
00612   double xcen;
00613   double ycen;
00614   double R;
00615   double cos_a;
00616   double sin_a;
00617   // input matrices
00618   double *a;
00619   double *s;
00620   // output matrix
00621   double *b;
00622   // kernel
00623   double *kernel;
00624   int Klen;
00625   double Ktop;
00626   int Kswath;
00627   double *Kwt;
00628   // block structure of input
00629   blocknum_t *blocks;
00630   blockstat_t *stats;
00631   // swath rectangle sizes
00632   int xwid; // max size along x direction
00633   int ywid; // max size along y direction
00634   int Rwid; // target size for each thread's rectangle
00635   // next row to be assigned to a thread
00636   int smooth_bb_top;
00637 };
00638 
00639 // MAX_THREAD instances of this flavor of structure holds per-thread info
00640 struct thread_args {
00641   // per-thread info
00642   int tnum;  // my own thread number, just informational
00643   // output error status (zero is OK, nonzero is not)
00644   int status;
00645 };
00646 
00647 /*
00648  * thread info, declared as local to this file for simplicity
00649  * (in particular, SA could be local to a function, and it could contain
00650  * pointers to the mutex and to SI, but this way keeps the declaration
00651  * close to the structure definition)
00652  */
00653 // this is a hard max, but it can be lowered by an environment variable
00654 #define MAX_THREAD 8
00655 static struct thread_info SI; // one slot for common info
00656 static struct thread_args SA[MAX_THREAD]; // slots for per-thread info
00657 static pthread_mutex_t blockstat_mutex;
00658 static pthread_mutex_t accum_mutex;
00659 static pthread_mutex_t smooth_bb_top_mutex;
00660 
00661 /*
00662  * get the requested number of threads, either from environment
00663  * (MXT_NUM_THREAD) or from the compiled-in default
00664  */
00665 static
00666 int
00667 get_thread_num()
00668 {
00669   int tn, tn1;
00670   char *thread_env;
00671 
00672   if ((thread_env = getenv("MXT_NUM_THREAD")) != NULL) {
00673     // get from environment
00674     tn = strtol(thread_env, NULL, 10);
00675     if (tn == 0)
00676       tn = MAX_THREAD; // error in conversion
00677   } else {
00678     // use compiled-in default
00679     tn = MAX_THREAD;
00680   }
00681   // clip to 1..thread_max
00682   if (tn > MAX_THREAD)
00683     tn = MAX_THREAD;
00684   else if (tn <= 0)
00685     tn = 1;
00686   return tn;
00687 }
00688 
00689 /********************************************************************************
00690  *
00691  * Spherical convolution utility
00692  *
00693  ********************************************************************************/
00694 
00695 /* swath_extent: establish loop boundaries
00696  * 
00697  * move Kswath in each direction away from (x0,y0), 
00698  * adjusting for both rotation angle (cos_a, sin_a) and
00699  * distance weighting (Kwt), to determine the four corners
00700  * of a box containing the region to include in the 
00701  * convolution.  Returns the largest and smallest x and y
00702  * indices which contain the whole box.
00703  */
00704 static
00705 void
00706 swath_extent(int *xlo_out, int *xhi_out, 
00707          int *ylo_out, int *yhi_out, 
00708          double x0, double y0,
00709          double xcen, double ycen, double R, 
00710          double cos_a, double sin_a, 
00711          double *Kwt, int Kswath, int m, int n)
00712 {
00713   int xB, yB;
00714   double x1, y1;
00715   double xp1, yp1;
00716   double xlo, ylo, xhi, yhi;
00717 
00718   // high and low markers
00719   xlo = ylo = m+n; // higher than either alone
00720   xhi = yhi = 0;   // lower than either
00721   // loop over 4 corners
00722   for (xB = -1; xB <= 1; xB += 2)
00723     for (yB = -1; yB <= 1; yB += 2) {
00724       // scaled point
00725       x1  = xB * Kswath / sqrt(Kwt[0]);
00726       y1  = yB * Kswath / sqrt(Kwt[1]);
00727       // rotate by p-angle, place in image coords
00728       xp1 = x0 +  x1 * cos_a + y1 * sin_a;
00729       yp1 = y0 + -x1 * sin_a + y1 * cos_a;
00730       // save low and high
00731       if (xp1 < xlo) xlo = xp1;
00732       if (xp1 > xhi) xhi = xp1;
00733       if (yp1 < ylo) ylo = yp1;
00734       if (yp1 > yhi) yhi = yp1;
00735     }
00736   // clip to Rsun
00737   if (xlo < xcen-R) xlo = xcen-R;
00738   if (xhi > xcen+R) xhi = xcen+R;
00739   if (ylo < ycen-R) ylo = ycen-R;
00740   if (yhi > ycen+R) yhi = ycen+R;
00741   // round outward to integers
00742   xlo = floor(xlo);  ylo = floor(ylo);
00743   xhi = ceil (xhi);  yhi = ceil (yhi);
00744   // clip to image
00745   if (xlo < 0)   xlo = 0;
00746   if (xhi > n-1) xhi = n-1;
00747   if (ylo < 0)   ylo = 0;
00748   if (yhi > m-1) yhi = m-1;
00749   // make output
00750   *xlo_out = (int) xlo;
00751   *xhi_out = (int) xhi;
00752   *ylo_out = (int) ylo;
00753   *yhi_out = (int) yhi;
00754 }
00755 
00756 /* 
00757  * 
00758  * swath_max_size: determine an upper size limit on the footprint
00759  * of the kernel.  In other words, this returns an upper bound on the
00760  * length of the x and y dimensions, generated by swath_extent above,
00761  * needed to hold a convolution result array.  We want to be able to 
00762  * alloc xwid*ywid doubles to hold the result from convolving one input pixel.
00763  *
00764  * This routine assumes that the size of the swath can be upper-bounded 
00765  * independent of position (x0,y0).  Currently, the only need for an 
00766  * upper bound (rather than an exact figure) is that differences in 
00767  * rounding can cause +/-1 changes in size at each end of the convolution 
00768  * footprint.  Gross differences are not possible.  
00769  */
00770 static
00771 void
00772 swath_max_size(int *xwid, 
00773            int *ywid,
00774            double xcen, double ycen, double R, 
00775            double cos_a, double sin_a, 
00776            double *Kwt, int Kswath, int m, int n)
00777 {
00778   int xlo, xhi, ylo, yhi;
00779 
00780   swath_extent(&xlo, &xhi, &ylo, &yhi, 
00781            xcen, ycen,  // use image center as the center of the swath
00782            xcen, ycen, R, 
00783            cos_a, sin_a, 
00784            Kwt, Kswath, m, n);
00785   // first, +1 because a range of lo = hi still includes 1 pixel
00786   // then,  +2 because an unpredictable roundoff at either the lower
00787   // or upper endpoint, or both, might result in a larger range.
00788   *xwid = (xhi - xlo + 1) + 2;
00789   *ywid = (yhi - ylo + 1) + 2;
00790 }
00791 
00792 /********************************************************************************
00793  *
00794  * Spherical convolution core
00795  *
00796  ********************************************************************************/
00797 
00798 /*
00799  * Background on the convolution
00800  *
00801  * There are two aspects of this code that are different from standard
00802  * convolution (say, in 1 or 2 dimensions, in a periodic setting):
00803  *  a - the kernel can be viewed as spatially varying
00804  *  b - the discretization is not uniform
00805  * 
00806  * Point (a) is debatable.  If we imagine the functions x, y, k defined
00807  * on the sphere, then k can be expressed as depending only on s-t where
00808  * we are convolving x(t) and k(s).  That is, we can write
00809  *   y(s) = int_S x(t) k(s-t) dt
00810  * where t, s are 3-vectors of unit norm (on the sphere).  In this case,
00811  *   ||s - t||^2 = (s-t)'(s-t) = 2(1-t's), or
00812  *   t's = 1 - 0.5 ||s-t||^2
00813  * and the dot product is a function of only s-t.
00814  * In this interpretation, the kernel is not spatially varying.  The 
00815  * convolution is computed on the sphere S and then, in a separate 
00816  * operation, projected down to the unit disk.
00817  * 
00818  * As an aside, if an exponential kernel k is used as input this function,
00819  *   k(dot) = const * exp(lambda * dot), where dot = t's,
00820  * then this is equivalent to a gaussian function of the difference
00821  *   delta = s - t
00822  * since
00823  *   k(dot) = k(t's) = const * exp(lambda * 0.5 * ||s - t||^2)
00824  * This is a pretty good rationale for using a kernel that is of exponential
00825  * form, i.e. k = const1 * exp(const2 * linspace(-1, 0, kparam(2))), where
00826  * const2 sets the fwhm and const1 is chosen so k is of unit norm.
00827  * 
00828  * Point (b) is tricky.  It arises because one pixel at disk-center 
00829  * corresponds to seeing a region of (say) "one unit", but at the limb,
00830  * the pixel could correspond to seeing many units.  The conversion factor
00831  * is 1/z, where z is the distance from the x-y (image) plane outward to
00832  * the sphere boundary (z = sqrt(1-x^2-y^2)).  The limb pixels carry
00833  * information about more area, so they will influence other pixels 
00834  * proportionally more strongly.  We just need to quantify the effect.
00835  *
00836  * Before we go on to do that, a word about the (1/z) factor, which is
00837  * the area of a pixel.  As we go closer to the limb, z goes (slowly)
00838  * to zero, and 1/z, which approximates the area, goes (slowly) to 
00839  * infinity.  Especially for high-pixel-density images, this will 
00840  * make the integral blow up.  The area ~= 1/z approximation thus
00841  * breaks down there.  One way out is to say that a pixel with a center
00842  * precisely at the limb (z=0) may as well be regarded as having area
00843  * equal to that of its maximally-on-disk part, which is half a pixel 
00844  * farther onto the disk.  Drawing the right triangle, we see that the 
00845  * z of a point 0.5 pixel from the limb is sqrt(R) (where the max z, at
00846  * disc center, is R).  To tame the normalization factor, we take 
00847  * sqrt(R) as the lowest allowed z for the area computation.  This
00848  * typically does not arise in practice -- even for HMI, at 3 pixels in,
00849  * z = sqrt(6*R) ~ 100, the min-z = sqrt(R) ~ 44.
00850  * 
00851  * It's natural to compute the convolution as a discrete approximation 
00852  * to a continuous convolution:
00853  * 
00854  *   I-1:  y(s) = \int_S  x(t) k(s,t) dt         (using k(s,t) for k(s-t))
00855  *         y(i) = \sum_j  x(j) k(i,j) |S_j|      [*]
00856  * 
00857  * where the whole sphere S is partitioned into {S_j}, and we assume that 
00858  * to first order, the function x(t) equals a number x(j) on the patch S_j, 
00859  * which has volume |S_j|.
00860  * The problem is that one can expand the integral the other way:
00861  * 
00862  *   I-2:  y(s) = \int_S  x(s-t) k(t,0) dt       (using k(t,0) for k(t-0))
00863  *         y(i) = \sum_j  x(i-j) k(j,0) |S_j|
00864  *
00865  * and the discretized version weights x(i-j) differently.  There are some
00866  * subtleties here, because I-2 assumes the kernel is translation-invariant.  
00867  * But, in any case, the two forms are not symmetric.  And one then 
00868  * wonders about the exact interpretation of the terms within [*].  
00869  * (E.g., does k(i,j) have to be k(s,t) weighted by |S_i| or |S_j|?)
00870  * 
00871  * These questions become focused when you wonder how to normalize k() so
00872  * that the convolution integral has unit norm.
00873  *
00874  * It's possible to clarify these questions, which have to do with 
00875  * weighting of nonnegative functions x and y, by analogy with 
00876  * probability. The analogy is that x(t) and y(s) are both 
00877  * densities with respect to a reference measure A(.), and k(s,t) 
00878  * is a conditional density tying them together:
00879  * 
00880  *    x(t): pdf of an r.v. X relative to measure A(t)
00881  *    y(s): pdf of an r.v. Y   "      "      "    "
00882  *    k(s,t): conditional density p(Y=s|X=t) "    "
00883  * 
00884  * If the density x(t) is relatively smooth over patches S_j, we can 
00885  * discretize everything, where |S_j| = A(S_j), the area of the patch:
00886  * 
00887  *    x_j: pmf of the r.v. X.  Thus Pr(X in S_j) = x_j = x(t) |S_j| 
00888  *         for any t in S_j
00889  *    y_i: pmf of the r.v. Y.  Thus Pr(Y in S_i) = y_i = y(s) |S_i| 
00890  *         for any s in S_i
00891  *    k_(i,j): conditional density p(Y|X), so k_(i,j) = k(s,t) |S_i|
00892  *            (t and s as above).  This is so because we are *given* 
00893  *            the value of X, so the average is over Y (S_i).
00894  *    
00895  * To relate this back to the original problem, at large (limb) patches,
00896  * this says we are equally sure, throughout the patch, that the
00897  * patch has a given label (x(t) = const on the patch).  But, over
00898  * the whole patch, there is a greater mass than the at disk center
00899  * (since the |S_j| term is larger).  So, the limb patch as a whole
00900  * will have greater influence than disk-center patches as a whole.
00901  *
00902  * In short: the x(t), y(s) are the observed and smoothed values.
00903  * The x_j, y_i capture the discretization effects that make some
00904  * pixels "more strong" than others.
00905  * 
00906  * To relate Y to X, note
00907  * 
00908  *   Pr(Y in S_i) = \sum_j  Pr(X in S_j) * Pr(Y in S_i | X in S_j)
00909  *                = \sum_j  [x(t) |S_j|] * [k(s,t) |S_i|]
00910  * 
00911  * and, noting Pr(Y in S_i) = y_i = y(s) |S_i|,
00912  *
00913  *   y(s) = \sum_j  x(t) k(s,t) |S_j|      [t from S_j & s from S_i always]
00914  * 
00915  * This recovers the formula [*]. 
00916  *
00917  * It also tells us how to interpret "unit norm".  For y(s) and  x(t),
00918  * it means integrating to one against the area measure A(.).  This is
00919  * identical to the discretized y_i and x_j summing to one against
00920  * |S_i|.  In short: the discretized versions sum to unity, not the
00921  * continuous ones.  In matlab, this would mean:
00922  *   >> z = sqrt(1 - x^2 - y^2)  or
00923  *   >> z = rsun - ringmatrix([m  n], [], [], center)
00924  *   >> z(z < 0) = Inf;  % make them drop out
00925  * and then, since the patch areas |S_j| are proportional to 1/z:
00926  *   >> sum(sum(x .* (1./z))) = 1
00927  *   >> sum(sum(y .* (1./z))) = 1
00928  * 
00929  * More important, for k(s,t), it means 
00930  * 
00931  *   (forall t)  \int_S  k(s,t) dA(s) = 1
00932  *
00933  * Because k(s,t) depends only on the radial distance between 
00934  * s and t, we can take t = (0 0 1), and then s runs from 0..1 with
00935  * dot = s't = sqrt(1 - s^2).  However, this simplifies considerably.
00936  * As we have parameterized it, k(s,t) = k(dot), which is given 
00937  * explicitly in the input k(.).  We need only that
00938  *
00939  *   \int_{dot=0..1}  k(dot) dA(dot) = 1
00940  *
00941  * In fact, the minimum value of dot is kparam(1).  The area element
00942  * corresponding to s't in [dot, dot+d_dot] is just 2 pi d_dot.  Thus
00943  * we want
00944  *
00945  *   2 pi \int_{dot=kparam(1)..1}  k(dot) d_dot = 1
00946  *
00947  * The integral, when discretized, needs only the width of a length
00948  * element, which is delta_dot (roughly corresponding to d_dot above):
00949  * 
00950  *   >> delta_dot = (1 - kparam(1))/length(k);
00951  *   >> normer = 2*pi*delta_dot*sum(k);
00952  *
00953  * Letting k = k/normer makes k have unit norm.
00954  */
00955 
00956 // error codes for smooth(), must be < 0
00957 #define SMOOTH_NOMEM    -1
00958 #define SMOOTH_INTERNAL -2
00959 #define SMOOTH_NOTHREAD -3
00960 
00961 /*
00962  * smooth_rect: convolve input a with kernel and put into output b
00963  *
00964  * Does convolution only in an input rectangle given by {x,y}R{lo,hi}.
00965  * The output will be written into a larger range, depending on the kernel 
00966  * width Kswath, the kernel weights Kwt, and the tilt angle (cos_a, sin_a).
00967  * There are separate parameters for input and output strides, so the two
00968  * can be differently-sized.
00969  * This routine operates in a thread, but is not highly aware that it's 
00970  * threaded (we need one mutex reference).
00971  * 
00972  * returns 1 if error, 0 otherwise
00973  * (the only error condition is an internal logic error)
00974  */
00975 static
00976 int
00977 smooth_rect(int m,          // input image size (m = y)
00978         int n,          //                  (n = x)
00979         // rectangle and strides
00980         int xRlo,       // x rect range lo/hi (input images)
00981         int xRhi,
00982         int yRlo,       // y rect range lo/hi (input images)
00983         int yRhi,
00984         int xROlo,      // x rect offset (output image offset from input, typ. >= 0)
00985         int yROlo,      // y rect offset (output image offset from input, typ. >= 0)
00986         int strIx,      // stride in x, input (a, s arrays)
00987         int strIy,      // stride in y, input
00988         int strOx,      // stride in x, output (b array)
00989         int strOy,      // stride in y, output
00990         // disk geometry
00991         double xcen,    // center, n or x, in C coords, origin at 0
00992         double ycen,    // center, m or y, in C coords, origin at 0
00993         double R,       // radius, in pixels
00994         double cos_a,   // cos(alpha)
00995         double sin_a,   // sin(alpha)
00996         // input + output matrices
00997         double *a,      // input is mXn
00998         double *s,      // mXn (z coordinate, already found)
00999         double *b,      // output (convolved result)
01000         // kernel
01001         double *kernel, // kernel lookup table
01002         int Klen,       // length of the table
01003         double Ktop,    // upper limit of the table (0..Ktop)
01004         int Kswath,     // box enclosing kernel
01005         double *Kwt,    // weights on x, y, z
01006         // block structure of input
01007         blocknum_t *blocks,
01008         blockstat_t *stats)
01009 {
01010   const double Kstep = (Klen-1)/Ktop;  // map [0,Ktop] -> [0,K-1]
01011   const double min_Zwt = sqrt(R); // sanity bound on area-weighting, see above
01012   int x1, y1;             // p1 coords in 2D, bounded by xRlo, xRhi, etc.
01013   int x2, y2;             // p2 coords in 2D, bounded by xlo, xhi, etc.
01014   int xlo, xhi, ylo, yhi; // inner loop boundaries
01015   double p1x, p1y, p1z;   // point P1 coords in 3D
01016   double p2x, p2y, p2z;   // point P2 coords in 3D
01017   int p1_off;             // array offset into a,s for the point (x1,y1)
01018   int x2_off, y2_off;     // array offsets into a,s for the point (x2,y2)
01019   int xO_off, yO_off;     // array offsets into b for the point (x2,y2)
01020   blocknum_t bn;          // block number of (x1,y1)
01021   double dist;            // distance from p1 to p2
01022   double contribution;    // point (x1,y1)'s contribution to integral
01023   double scale;           // scale factor for convolution integral
01024   double xx_term;
01025   double wt_xx, wt_yy, wt_zz, wt_xy;
01026   double inxC;            // real-valued index into kernel table
01027   int inxD;               // integer index into kernel table
01028 
01029   /* set up weights */
01030   wt_xx = Kwt[0] * cos_a * cos_a + Kwt[1] * sin_a * sin_a;
01031   wt_yy = Kwt[0] * sin_a * sin_a + Kwt[1] * cos_a * cos_a;
01032   wt_xy = 2.0 * (Kwt[0] - Kwt[1]) * cos_a * sin_a;
01033   wt_zz = Kwt[2];
01034   /* include 1/(R^2) factor in each */
01035   wt_xx /= R*R;
01036   wt_yy /= R*R;
01037   wt_zz /= R*R;
01038   wt_xy /= R*R;
01039 
01040   /* loop over image pixels */
01041   for (x1 = xRlo; x1 <= xRhi; x1++) 
01042     for (y1 = yRlo; y1 <= yRhi; y1++) {
01043       p1_off = x1*strIx + y1*strIy;
01044       bn = blocks[p1_off]; // block number
01045       if (bn == Block_Clear) 
01046     return 1; // should never happen
01047       if (bn == Block_Offdisk || bn == Block_Skip)
01048     continue; // off-limb or zero weight: no action needed
01049       if (bn == Block_Singleton) {
01050     // singleton
01051     /* find original vector coordinates P1 = (px1, py1, pz1) */
01052     p1x = x1;
01053     p1y = y1;
01054     p1z = s[p1_off];
01055     contribution = a[p1_off];
01056       } else {
01057     // look up coordinates and function value via per-block statistics
01058     p1x = stats[bn].x;  // un-centered
01059     p1y = stats[bn].y;  // un-centered
01060     p1z = stats[bn].z;
01061     // skip the mutex in the case where the block was already counted
01062     if (stats[bn].f == 0) continue; 
01063     // ensure only this thread is convolving with this block
01064     pthread_mutex_lock(&blockstat_mutex);
01065     contribution = stats[bn].f;
01066     stats[bn].f = 0; // indicate this thread has put it in the integral
01067     pthread_mutex_unlock(&blockstat_mutex);
01068       }
01069       // contrib == 0 when the block has been included once already,
01070       // in this thread or in another thread
01071       if (contribution == 0)
01072     continue; 
01073       /* establish loop boundaries */
01074       swath_extent(&xlo, &xhi, &ylo, &yhi, p1x, p1y,
01075            xcen, ycen, R, cos_a, sin_a, Kwt, Kswath, m, n);
01076       if (0)
01077     printf("(%.1f,%.1f) loop [%d,%d]x[%d,%d] (%dx%d)\n", 
01078            p1x, p1y, xlo, xhi, ylo, yhi, xhi-xlo+1, yhi-ylo+1); 
01079       // scale factor is the target of the convolution divided by an
01080       // area-based weight; the lowest weight allowed is min_Zwt
01081       scale = contribution / (((p1z < min_Zwt) ? min_Zwt : p1z) * R);
01082       /* loop over kernel */
01083       for (x2 = xlo, 
01084          x2_off =  xlo         *strIx, 
01085          xO_off = (xlo - xROlo)*strOx; 
01086        x2 <= xhi; 
01087        x2++, 
01088          x2_off += strIx,
01089          xO_off += strOx) {
01090     // optimizations for xx_term, offsets help a little (~5%): Apr 2010
01091     xx_term = (p1x - x2)*(p1x - x2)*wt_xx;
01092     for (y2 = ylo, 
01093            y2_off =  ylo         *strIy,
01094            yO_off = (ylo - yROlo)*strOy; 
01095          y2 <= yhi; 
01096          y2++, 
01097            y2_off += strIy,
01098            yO_off += strOy) {
01099       if (blocks[x2_off+y2_off] == Block_Offdisk) continue;
01100       p2z = s[x2_off+y2_off];
01101       dist = xx_term +
01102              (p1y -  y2) * (p1y -  y2) * wt_yy + 
01103              (p1y -  y2) * (p1x -  x2) * wt_xy + 
01104              (p1z - p2z) * (p1z - p2z) * wt_zz;  // distance ||P1-P2||
01105       /* do kernel lookup on distance */
01106       if (dist >= Ktop) continue; // dist >= Ktop: no contribution to b
01107       inxC = dist * Kstep;
01108       inxD = floor(inxC);
01109       if (inxD >= Klen-1) continue; // rounding puts dist on cusp: skip 
01110       // note, now 0 <= inxD < Klen-1
01111       dist = inxC - inxD;  /* in [0,1), smaller means closer to inxD */
01112       /*
01113       printf(" adding %g->%d, wt %g, interp %g\n", inxC, inxD, kernel[inxD], dist);
01114       */
01115       b[xO_off+yO_off] += 
01116         (kernel[inxD]*(1.0-dist) + kernel[inxD+1]*dist) * scale;
01117     } // for y2
01118       } // for x2
01119     } // for (x1,y1)
01120   return 0;
01121 }
01122 
01123 /*
01124  * smooth_thread: shell for Pthread invocation of the smoothing operation
01125  *
01126  * Each thread operates largely independently, selecting a rectangular region
01127  * of the input a, dividing along x and taking all of y, and sending that to
01128  * the core smoother.  Once the smoothed rectangle comes back, it is accumulated 
01129  * into the overall output matrix.
01130  * The thread control at this level needs two mutexes.  One to control the
01131  * division of the input among the threads (smooth_bb_top, which indicates
01132  * the next un-smoothed row in the input).  Another to control accumulation
01133  * into the result.  We make it simple, so that only one thread can touch the
01134  * output at a time.  By looking at timings, there seems to be little contention
01135  * at this point.
01136  *
01137  */
01138 static
01139 void *
01140 smooth_thread(void *arg)
01141 {
01142   const int verbose = 0;
01143   const int bb_max = SI.n-1; // the maximum value the rectangle needs to encompass
01144   double * const b = SI.b;   // points to output matrix
01145   double *b1;                // individual output matrix, lumped into b
01146   int rc;
01147   struct thread_args *sa;
01148   struct timeval tv1, tv2;
01149   int bb_top, bb_low, bb_hi; // bounding box boundary bookkeeping
01150   int bb_total, bb_count;    // internal counts for informational printouts
01151   int b_zero;                // offset to y origin of bb passed to smooth_rect
01152   int xsize, ysize;
01153   int x, y;
01154   int p1_off, p2_off;
01155 
01156   // arguments are taken from global arg list SI, and per-thread arg list *sa
01157   gettimeofday(&tv1, NULL);
01158   sa = (struct thread_args *) arg;
01159   // split the input along x (0...n-1)
01160   bb_total = bb_count = 0;
01161   while (1) {
01162     // carve off a new range of rows
01163     pthread_mutex_lock(&smooth_bb_top_mutex);
01164     bb_top = SI.smooth_bb_top; // NB, bb_top row itself is NOT assigned to a thread
01165     if (bb_top <= bb_max) {
01166       // more rows to do: find new range
01167       bb_low = bb_top;
01168       bb_hi  = bb_top + SI.Rwid - 1; // bb_hi is assigned to this thread
01169       if (bb_hi > bb_max) bb_hi = bb_max; // cap at the max x
01170       // install new top
01171       SI.smooth_bb_top = bb_hi + 1;
01172     }
01173     pthread_mutex_unlock(&smooth_bb_top_mutex);
01174     if (bb_top > bb_max)
01175       break; // there were no more rows to do
01176     bb_count++;
01177     bb_total += bb_hi - bb_low + 1;
01178     // new range of rows is [bb_low,bb_hi], inclusive
01179     // calloc a rectangle big enough to hold the convolved rectangle
01180     xsize = (bb_hi - bb_low + 1) + SI.xwid;
01181     b_zero = bb_low - SI.xwid/2; // b1(y==b_zero) will go into b(y==bb_low)
01182     ysize = SI.m;
01183     // fresh calloc each iteration is faster than clearing b1 with memset()
01184     b1 = calloc(xsize*ysize, sizeof(*b1)); 
01185     rc = smooth_rect(SI.m, SI.n, 
01186              // bounding rectangle (input images)
01187              bb_low, bb_hi,
01188              0, SI.m-1,
01189              b_zero, 0,
01190              // strides (input images)
01191              SI.strIx,  SI.strIy, 
01192              // strides (output image)
01193              ysize, 1,
01194              // disk geometry
01195              SI.xcen,  SI.ycen, SI.R, SI.cos_a, SI.sin_a, 
01196              // input matrices
01197              SI.a, SI.s, 
01198              // output matrix
01199              b1,
01200              // kernel
01201              SI.kernel, SI.Klen, SI.Ktop, SI.Kswath, SI.Kwt, 
01202              // block structure of input
01203              SI.blocks, SI.stats);
01204     sa->status = rc; // exit status, 0 is OK
01205     if (rc != 0) {
01206       free(b1);
01207       pthread_exit(NULL);  // trouble
01208     }
01209     //printf("    thread #%d: handled [%d,%d] * %d @ %d\n", sa->tnum, 
01210     //   bb_low, bb_hi, SI.xwid, b_zero);
01211     // accumulate into the final result
01212     pthread_mutex_lock(&accum_mutex);
01213     // x loop is over all of b1, some of which may be aligned with
01214     // an off-image part of the output b
01215     for (x = 0; x < xsize; x++) 
01216       for (y = 0; y < ysize; y++) {
01217     // off-image check: is (x,y) on-image for b?
01218     if ((x + b_zero < 0) || (x + b_zero >= SI.n)) continue;
01219     p1_off = x*ysize + y*1; // offset into b1
01220     p2_off = (x+b_zero)*SI.strIx + y*SI.strIy; // offset into b
01221     b[p2_off] += b1[p1_off];
01222       }
01223     pthread_mutex_unlock(&accum_mutex);
01224     free(b1);
01225   } 
01226   // thread is done
01227   gettimeofday(&tv2, NULL);
01228   int dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 
01229     (((int) tv2.tv_usec) - ((int) tv1.tv_usec));
01230   if (verbose)
01231     mexPrintf("    ST: exiting thread #%d, did %d/%d, dt = %.1f ms\n", 
01232           sa->tnum, bb_count, bb_total, dt/1000.0);
01233   pthread_exit(NULL);
01234 }
01235 
01236 
01237 /* control:  coordinates the smoothing operation
01238  *
01239  * sets up blocking data structures,
01240  * launches threads, combines results
01241  */
01242 static
01243 int
01244 control(int m,          // image size
01245     int n,
01246     double *a,      // input is mXn
01247     double *b,      // output is mXn (convolved result)
01248     double *s,      // output is mXn (z coordinate)
01249     double *p,      // output, mXn (patch indexes)
01250     double xcen,    // center, n or x, in C coords, origin at 0
01251     double ycen,    // center, m or y, in C coords, origin at 0
01252     double R,       // radius, in pixels
01253     double p0,      // tilt angle
01254     double *kernel, // kernel lookup table
01255     int Klen,       // length of the table
01256     double Ktop,    // upper limit of the table (0..Ktop)
01257     int Kswath,     // box enclosing kernel
01258     double *Kwt,    // weights on x, y, z; angle
01259     double *bws_in, // block lookup table (length x 2)
01260     int bwlen)      // its length
01261 
01262 {
01263   const int verbose = 0;
01264   const int mn = m*n;
01265   const double cos_a = cos(p0); // cos(alpha)
01266   const double sin_a = sin(p0); // sin(alpha)
01267   const double nan = mxt_getnand(); // cache nan
01268   int s_alloc_local;            // did we alloc s here?
01269   blocknum_t blockNum, bn;
01270   blocknum_t *blocks;
01271   blockstat_t *stats;
01272   double tau1, *taus;           // z-thresholds for blocking
01273   int *bws;                     // block widths (int's)
01274   int x1, y1, off;              // array position and offset
01275   double p1x, p1y;              // image position
01276   double temp;
01277   int xwid, ywid, Rwid;         // prototype rectangle widths
01278   int t, NT;                    // thread counts
01279   pthread_t threads[MAX_THREAD];// thread info
01280   struct timeval tv1, tv2;
01281   int dt;
01282 
01283   gettimeofday(&tv1, NULL);
01284   /*
01285    * 1: Allocations
01286    */
01287   // s, if necessary
01288   if (s_alloc_local = (s == NULL))
01289     s = calloc(mn, sizeof(double));
01290   if (!s) return SMOOTH_NOMEM;
01291   // block-number array
01292   blocks = calloc(mn, sizeof(blocknum_t)); /* must be cleared */
01293   if (!blocks) return SMOOTH_NOMEM;
01294 
01295   /*
01296    * 2: Find z
01297    */
01298   // loop over image pixels
01299   off = 0;  // in general, off = x1*m + y1
01300   for (x1 = 0; x1 < n; x1++)
01301     for (y1 = 0; y1 < m; y1++) {
01302       // find original vector coordinates P1 = (px1, py1, pz1)
01303       p1x = x1 - xcen;
01304       p1y = y1 - ycen;
01305       temp = (R + p1x) * (R - p1x) - p1y*p1y; /* R*R-x*x = (R-x)*(R+x) */
01306       if (temp < 0)
01307     b[off] = s[off] = nan;  // off-disk
01308       else
01309     s[off] = sqrt(temp);        // +sqrt: visible half-sphere
01310       off++;
01311     }
01312   gettimeofday(&tv2, NULL);
01313   dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 
01314     (((int) tv2.tv_usec) - ((int) tv1.tv_usec));
01315   if (verbose)
01316     mexPrintf("  do_SSa: dt = %.1f ms\n", dt/1000.0);
01317 
01318   /*
01319    * 3: Set up blocking info
01320    */
01321   // juggle the block table we were given
01322   taus = calloc(bwlen, sizeof(*taus));
01323   bws  = calloc(bwlen, sizeof(*bws));
01324   if (!taus || !bws) return SMOOTH_NOMEM;
01325   block_lut_reformat(&tau1, taus, bws, bws_in, bwlen, R);
01326   // sets up blocks, finds number of blocks
01327   blockNum = block_create(blocks, s, a, tau1, taus, bws, m, n);
01328   if (blockNum == Block_Min)
01329     return SMOOTH_INTERNAL; // logic error in code
01330   // don't need these any more
01331   free(taus); free(bws); taus = NULL; bws = NULL;
01332   // printf("Using %d blocks\n", blockNum);
01333   // compute aggregate block statistics
01334   stats = calloc(blockNum+1, sizeof(*stats));
01335   if (!stats) return SMOOTH_NOMEM;
01336   if (!block_stats(stats, blocks, s, a, m, n, blockNum))
01337     return SMOOTH_INTERNAL;  // out-of-range block number
01338   gettimeofday(&tv2, NULL);
01339   dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 
01340     (((int) tv2.tv_usec) - ((int) tv1.tv_usec));
01341   if (verbose)
01342     mexPrintf("  do_SSb: dt = %.1f ms\n", dt/1000.0);
01343 
01344   /*
01345    * 4: Set up thread info
01346    */
01347   NT = get_thread_num(); // check env for advice
01348   // determine maximum rectangle width 
01349   swath_max_size(&xwid, &ywid, xcen, ycen, R, cos_a, sin_a, Kwt, Kswath, m, n);
01350   // could make Rwid dependent on Kswath...
01351   // n=4096, NT=8, Kswath = 200 => Rwid = 128, 1+Ks/Rwid = 2.5
01352   Rwid = n / (NT * 4);
01353   if (Rwid < 4) Rwid = 4; // best not to go too small, certainly need > 0
01354   if (NT == 1) Rwid = n; // if one thread, do it all at once
01355   // record rectangle extent, common across threads
01356   SI.xwid = xwid; // max size along x direction
01357   SI.ywid = ywid; // max size along y
01358   SI.Rwid = Rwid; // target size for each thread's rectangle
01359   // set up geometry and param info valid for all calls to the smoother
01360   SI.m = m; SI.n = n;
01361   SI.strIx = m; SI.strIy = 1; // could generalize these strides
01362   SI.xcen = xcen; SI.ycen = ycen;
01363   SI.R = R; SI.cos_a = cos_a, SI.sin_a = sin_a;
01364   // input arrays (image size)
01365   SI.a = a; SI.s = s;
01366   // output array (image size)
01367   SI.b = b;
01368   // kernel
01369   SI.kernel = kernel; SI.Klen = Klen; SI.Ktop = Ktop;
01370   SI.Kswath = Kswath; SI.Kwt = Kwt;
01371   SI.blocks = blocks; SI.stats = stats;
01372 
01373   /*
01374    * 5: Do the smoothing in threads
01375    */
01376   pthread_mutex_init(&blockstat_mutex, NULL);
01377   pthread_mutex_init(&accum_mutex, NULL);
01378   pthread_mutex_init(&smooth_bb_top_mutex, NULL);
01379   // shared across threads, this marks our current position in filling 
01380   // in the output -- it indicates the "next row to convolve"
01381   SI.smooth_bb_top = 0; 
01382   // initiate NT threads
01383   for (t = 0; t < NT; t++) {
01384     // pack changing args
01385     SA[t].tnum = t;  // informational for the thread
01386     SA[t].status = 0; // status output
01387     // start the thread
01388     if (pthread_create(threads+t, NULL, smooth_thread, (void *) &SA[t]) != 0)
01389       return SMOOTH_NOTHREAD;
01390   }
01391   // join all threads
01392   for (t = 0; t < NT; t++) {
01393     if (pthread_join(threads[t], NULL) != 0)
01394       return SMOOTH_NOTHREAD;
01395     if (SA[t].status != 0)
01396       return SMOOTH_INTERNAL; // internal error in the rectangle smoother
01397   }
01398   pthread_mutex_destroy(&blockstat_mutex);
01399   pthread_mutex_destroy(&accum_mutex);
01400   pthread_mutex_destroy(&smooth_bb_top_mutex);
01401   gettimeofday(&tv2, NULL);
01402   dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 
01403     (((int) tv2.tv_usec) - ((int) tv1.tv_usec));
01404   if (verbose)
01405     mexPrintf("  do_SSc: dt = %.1f ms\n", dt/1000.0);
01406 
01407   /*
01408    * 6: Clean up and exit
01409    */
01410   if (block_check(stats, blockNum) != 0) {
01411     mexWarnMsgTxt("Error: did not integrate with all blocks\n");
01412     return SMOOTH_INTERNAL;
01413   }
01414   // optionally copy blocks into p
01415   if (p)
01416     block_export(p, blocks, m, n); 
01417   free(blocks);
01418   free(stats);
01419   if (s_alloc_local)
01420     free(s);
01421   return blockNum; // always >= 0
01422 }
01423 
01424 #ifdef StaticP  /* undefined under mex */
01425 StaticP
01426 #endif
01427 void
01428 mexFunction(int nlhs, 
01429         mxArray *plhs[], 
01430         int nrhs,
01431         const mxArray *prhs[])
01432 {
01433   const int verbose = 0;
01434   char errstr[160];
01435   char *msg;
01436   double datamin, datamax;
01437   double *s_data;             // place to put s in case we don't return it 
01438   double *p_data;             // place to put p in case we don't return it 
01439   mxArray *wt_holder;         // place to put KWt if it wasn't supplied 
01440   int m, n;                   // image size 
01441   double xcen, ycen, rsun, p0;// disk params 
01442   double *Klut;               // kernel lookup table
01443   int Nker;                   // and its length
01444   double Ktop;                // lut spans [0,Ktop]
01445   double KswathR; int KswathI;// box enclosing kernel 
01446   double *bws; int bwlen;     // block width lookup table
01447   int ok;
01448   struct timeval tv1, tv2;
01449 
01450   gettimeofday(&tv1, NULL);
01451   /* Hook for introspection (function signature, docstring) */
01452   if (nrhs < 0) { 
01453     plhs[0] = mxt_PackSignature((mxt_Signature) (-nrhs), 
01454                 NARGIN_MIN, NARGIN_MAX, 
01455                 NARGOUT_MIN, NARGOUT_MAX, 
01456                 in_names, in_specs, out_names, docstring);
01457     return;
01458   }
01459   /* check number of arguments */
01460   if ((nrhs < NARGIN_MIN) || (nrhs > NARGIN_MAX))
01461      mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01462                 "%s: Expect %d <= input args <= %d",
01463                 progname, NARGIN_MIN, NARGIN_MAX), errstr));
01464   if ((nlhs < NARGOUT_MIN) || (nlhs > NARGOUT_MAX))
01465       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01466                  "%s: Expect %d <= output args <= %d",
01467                  progname, NARGOUT_MIN, NARGOUT_MAX), errstr));
01468   mexargparse(nrhs, prhs, in_names, in_specs, NULL, progname);
01469 
01470   /* get size of input image x */
01471   m = (int) mxGetM(prhs[ARG_X]);  // m = y 
01472   n = (int) mxGetN(prhs[ARG_X]);  // n = x 
01473   /* get disk params */
01474   xcen = mxGetPr(prhs[ARG_GEOM])[0] - 1; // arg #1 = x0 (convert to C origin)
01475   ycen = mxGetPr(prhs[ARG_GEOM])[1] - 1; // arg #2 = y0 (convert to C origin)
01476   rsun = mxGetPr(prhs[ARG_GEOM])[2];     // arg #3 = radius
01477                                          // (do not use beta angle now)
01478   p0   = mxGetPr(prhs[ARG_GEOM])[4];     // p-angle
01479   p0  *= M_PI/180.0;                     // to radians
01480   // kernel params
01481   Ktop   = mxGetPr(prhs[ARG_KParam])[0];
01482   KswathR= mxGetPr(prhs[ARG_KParam])[1];
01483   KswathI= (int) KswathR;
01484   if (Ktop <= 0 || Ktop >= 1)
01485       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01486                  "%s: Expect %g <= Kpar(1) < %g, got %g",
01487                  progname, 0.0, 1.0, Ktop), errstr));
01488   if (KswathI < 1 || KswathI != KswathR)
01489       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01490                  "%s: Expect integer Kpar(2) >= %d, got %g",
01491                  progname, -1, KswathR), errstr));
01492   // kernel LUT - make or use supplied one
01493   if (mxGetNumberOfElements(prhs[ARG_K]) == 1) {
01494     // a Gaussian kernel width was given
01495     Nker = 256; // this is a reasonable default
01496     Klut = calloc(Nker, sizeof(*Klut));
01497     if (!Klut)
01498       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01499                  "%s: Could not allocate kernel LUT (length %d)",
01500                  progname, Nker), errstr));
01501     // make_lut checks for negatives, NaNs
01502     if (!make_kernel_lut(Klut, Nker, Ktop, mxGetScalar(prhs[ARG_K]))) 
01503       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01504                  "%s: Illegal spec for kernel LUT (check kparam, k(1))",
01505                  progname), errstr));
01506   } else {
01507     // explicit kernel given
01508     Klut = mxGetPr(prhs[ARG_K]);
01509     Nker = mxGetNumberOfElements(prhs[ARG_K]);
01510   }
01511   if (0)
01512     for (ok = 0; ok < Nker; ok++)
01513       printf("k[% 3d] = %.6f\n", ok, Klut[ok]);
01514 
01515   // KWt
01516   // if there's no KWt arg, make up one to fill in later
01517   if (nrhs > ARG_KWt)
01518     wt_holder = (mxArray *) prhs[ARG_KWt]; /* if uncast, the RHS is const */
01519   else
01520     wt_holder = mxCreateDoubleMatrix(0,0,mxREAL);
01521 
01522   // bin params
01523   if (nrhs > ARG_BWs) {
01524     bws = mxGetPr(prhs[ARG_BWs]);
01525     bwlen = mxGetM(prhs[ARG_BWs]);
01526     if (bwlen > 0 && mxGetN(prhs[ARG_BWs]) != 2)
01527       // (empty is OK)
01528       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01529                  "%s: bws LUT should have 2 columns",
01530                  progname), errstr));
01531   } else {
01532     if (m < 2048) {
01533       // "small" image case
01534       bws   = Default_BWs_small;          // just a list of numbers
01535       bwlen = Default_BWs_count_small/2;  // always 2 columns
01536     } else {
01537       // "large" image case
01538       bws   = Default_BWs_large;          // just a list of numbers
01539       bwlen = Default_BWs_count_large/2;  // always 2 columns
01540     }
01541   }
01542   if ((msg = block_lut_validate(bws, bwlen)) != NULL)
01543     mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01544                "%s: bws LUT format error: %s",
01545                progname, msg), errstr));
01546 
01547   // output y
01548   plhs[ARG_Y] = mxCreateDoubleMatrix(m, n, mxREAL);  // always returned
01549   if (!plhs[ARG_Y])
01550     mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01551                "%s: Could not allocate y output",
01552                progname), errstr));
01553   // s, if we want it
01554   if (nlhs > ARG_S) {
01555     plhs[ARG_S] = mxCreateDoubleMatrix(m, n, mxREAL); // optionally returned
01556     if (!plhs[ARG_S])
01557       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01558                  "%s: Could not allocate s output",
01559                  progname), errstr));
01560     s_data = mxGetPr(plhs[ARG_S]);
01561   } else {
01562     s_data = NULL;
01563   }
01564   // p, if we want it
01565   if (nlhs > ARG_P) {
01566     plhs[ARG_P] = mxCreateDoubleMatrix(m, n, mxREAL); // optionally returned
01567     if (!plhs[ARG_P])
01568       mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01569                  "%s: Could not allocate p output",
01570                  progname), errstr));
01571     p_data = mxGetPr(plhs[ARG_P]);
01572   } else {
01573     p_data = NULL;
01574   }
01575 
01576   /* call the function */
01577   ok = control(m, n,                 // sizes 
01578            mxGetPr(prhs[ARG_X]), // input is mXn
01579            mxGetPr(plhs[ARG_Y]), // convolved result is mXn 
01580            s_data,     // z coord, mXn (opt. returned if non-null)
01581            p_data,     // patch indexes, mXn (opt. returned if non-null)
01582            xcen,       // center, n or x, in C coords, origin at 0 
01583            ycen,       // center, m or y, in C coords, origin at 0 
01584            rsun,       // radius, in pixels 
01585            p0,         // p-angle, in radians
01586            Klut,       // kernel LUT
01587            Nker,       // and its size
01588            Ktop,       // lower range of the table 
01589            KswathI,    // box enclosing kernel 
01590            mxt_make_vector(wt_holder, KWt_count,Default_KWt, Default_KWt_count),
01591            bws,
01592            bwlen);
01593   // ok holds number of patches, or negative if trouble
01594   if (ok < 0)
01595     mexErrMsgTxt((snprintf(errstr, sizeof(errstr),
01596                "%s: Failed calloc, thread, or internal error in control(), code = %d",
01597                progname, ok), errstr));
01598 
01599   /* set up range of first output */
01600   // getrange(prhs[ARG_X], &datamin, &datamax);
01601   datamin = 0; datamax = 2; /* input is treated as an indicator */
01602   setrange(plhs[ARG_Y], datamin, datamax); 
01603   /* optional "s" output */
01604   if (nlhs > ARG_S) {
01605     setrange(plhs[ARG_S], 0.0, rsun); /* "z" index in pixels */
01606   } 
01607   /* optional "p" output */
01608   if (nlhs > ARG_P) {
01609     // ok is number of patches
01610     setrange(plhs[ARG_P], (double) Block_Min, (double) ok);
01611   } 
01612   // free up kernel lut if we generated it internally
01613   if (mxGetNumberOfElements(prhs[ARG_K]) == 1) 
01614     free(Klut);
01615   // free up weight matrix placeholder if it was used 
01616   if (nrhs <= ARG_KWt) mxDestroyArray(wt_holder);
01617   gettimeofday(&tv2, NULL);
01618   int dt = (((long) tv2.tv_sec) - ((long) tv1.tv_sec)) * 1000000 + 
01619     (((int) tv2.tv_usec) - ((int) tv1.tv_usec));
01620   if (verbose)
01621     mexPrintf("SS: dt = %.1f ms\n", dt/1000.0);
01622 }
01623 
01624 
01625 
01626 /* Hook for generic tail matter */
01627 #ifdef MEX2C_TAIL_HOOK
01628 #include "mex2c_tail.h"
01629 #endif
01630