static char rcsid[] = "$Header: /home/cvsuser/cvsroot/JSOC/proj/jpe/apps/start_pvmd.c,v 1.2 2009/02/23 22:12:09 production Exp $";
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <fcntl.h>
#include <pvm3.h>
#include "pe.h"

#define MAXSTR 128
#define PVMNOT "/usr/local/logs/dsds/PVM_NOT"

int ck_pe_rpc_dsds(int (*msg)());
int ck_cluster(char *hostname, char *user, int (*msg)());
int start_pvm_now(char *hostname, int (*msg)());

/* Starts a pvm daemon on the local host.
 * Allows for a future pvm_addhosts() call to add aditya or fault, etc.
 * The hostfile used to start pvmd3 is built in /tmp/hostfile.$USER.
 * Uses $STAGING in the ep= entries to define the path to the executables.
 * Registers the calling task with pvm and returns its tid. Returns 0 on error.
 *
 * msg = pointer to a function that prints a vararg (printf-style) message.
 *
 * NEW 4Jun2002: First check if we are on a cluster machine (currently
 * n00-n07) by seeing if /etc/SGE/pvm_start exists.
 * If so, see if we can reach dsds_svc through a pvm/pe_rpc. If we can,
 * a good pvmd is already running; otherwise we start one by calling
 * /etc/SGE/pvm_start, which cleans out any remnants of an old pe_rpc/pvmd
 * and /tmp/pvm* files and starts a new one outside of the SGE environment,
 * so that it will not be killed off when the current job completes.
 */
int start_pvm(int (*msg)())
{
  char hostname[MAXSTR];
  char *user;
  int status;

  gethostname(hostname, MAXSTR);
  if(!(user = (char *)getenv("USER"))) user = "pvm";
  status = ck_cluster(hostname, user, msg);
  if(status == 0) return(0);
  return(start_pvm_now(hostname, msg));
}
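/* For illustration only: a minimal sketch of a message handler a caller
 * could pass as msg above. The name msg_stderr is hypothetical; any vararg
 * function with a printf-like signature (printf itself included) will do.
 */
#ifdef START_PVM_EXAMPLE
#include <stdarg.h>

static int msg_stderr(char *fmt, ...)
{
  va_list ap;
  int n;

  va_start(ap, fmt);
  n = vfprintf(stderr, fmt, ap);	/* printf semantics, but on stderr */
  va_end(ap);
  return(n);
}

/* Typical use: int tid = start_pvm(msg_stderr); tid == 0 means failure. */
#endif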
/* Returns 0 on error, else the tid of the started pvmd.
 */
int start_pvm_now(char *hostname, int (*msg)())
{
  FILE *fphost;
  struct stat stbuf;
  int mytid;
  char filename[MAXSTR], cmd[MAXSTR];
  char *staging, *machine, *user;

  if((mytid = pvm_mytid()) < 0) {	/* pvmd not running */
    /* don't run if PVM_NOT file present */
    if(stat(PVMNOT, &stbuf) != -1) {	/* don't try to start a pvmd */
      (*msg)("*** pvm daemon not allowed. The SSSC is not available.\n");
      return(0);
    }
    /* new 15Sep98: delay a random time and check again in case
     * someone else has already started it (usually multiple
     * scripts still running after an sssc_stop).
     */
#ifdef __sgi
    {					/* block scope keeps this legal C89 */
      int rint, napcnt;

      srand(getpid());			/* seed for rand() */
      rint = rand();			/* 0 to RAND_MAX */
      napcnt = rint/4369;		/* scale down to a small sginap tick count */
      sginap(napcnt);			/* short random delay */
    }
#else
    sleep(1);
#endif
    if((mytid = pvm_mytid()) < 0) {	/* pvmd still not running */
      (*msg)("\nStarting a pvm daemon on %s\n", hostname);
      if(!(user = (char *)getenv("USER"))) user = "pvm";
      sprintf(filename, "/tmp/hostfile.%s", user);
      if((fphost=fopen(filename, "w")) == NULL) {
        (*msg)("Can't open %s to configure pvmd\n", filename);
        return(0);
      }
      if(!(staging = (char *)getenv("STAGING"))) {
        (*msg)("You must have an env $STAGING to find the executables for pvm\n");
        fclose(fphost);
        return(0);
      }
      if(!(machine = (char *)getenv("MACHINE"))) {
        (*msg)("You must have an env $MACHINE to find the executables for pvm\n");
        fclose(fphost);
        return(0);
      }
      fprintf(fphost, "#%s built by start_pvm()\n", filename);
      fprintf(fphost, "%s ep=%s/bin/_%s\n", hostname, staging, machine);
      /* put in aditya and fault, etc. for a possible pvm_addhosts() later */
      /* NEW: this is no longer used, except for production possibly */
      /* adding tarax or sonar to its virtual machine. */
      /* NOTE: on 15May03 tarax became an alias for sonar */
      fprintf(fphost, "&tarax.Stanford.EDU ep=%s/bin/_sgi4\n", staging);
      fprintf(fphost, "&sonar.Stanford.EDU ep=%s/bin/_sgi4\n", staging);
      /*******
      fprintf(fphost, "&fault.Stanford.EDU ep=%s/bin/_sgi4\n", staging);
      fprintf(fphost, "&shock.Stanford.EDU ep=%s/bin/_linux4\n", staging);
      fprintf(fphost, "&rick.Stanford.EDU ep=%s/bin/_linux4\n", staging);
      fprintf(fphost, "&phil.Stanford.EDU ep=%s/bin/_sgi4\n", staging);
      fprintf(fphost, "&yeti.Stanford.EDU ep=%s/bin/_sgi\n", staging);
      fprintf(fphost, "&shoom.Stanford.EDU ep=%s/bin/_sgi\n", staging);
      fprintf(fphost, "&aditya.Stanford.EDU ep=%s/bin/_sun4\n", staging);
      fprintf(fphost, "&shiver.Stanford.EDU ep=%s/bin/_sun4\n", staging);
      fprintf(fphost, "&soidb.Stanford.EDU ep=%s/bin/_sun4\n", staging);
      fprintf(fphost, "&quake.Stanford.EDU ep=%s/bin/_mips\n", staging);
      ********/
      fclose(fphost);
      sprintf(cmd, "pvmd3 %s&\n", filename);
      if(system(cmd)) {
        (*msg)("Error starting pvm daemon\n");
        return(0);
      }
      while(1) {
        sleep(2);			/* let it start */
        if((mytid = pvm_mytid()) > 0) break;
      }
    }
  }
  return(mytid);
}
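/* For illustration, with hypothetical values USER=jim, STAGING=/home/soi/CM,
 * MACHINE=linux4 and local host n01, the generated /tmp/hostfile.jim would
 * read:
 *
 *   #/tmp/hostfile.jim built by start_pvm()
 *   n01 ep=/home/soi/CM/bin/_linux4
 *   &tarax.Stanford.EDU ep=/home/soi/CM/bin/_sgi4
 *   &sonar.Stanford.EDU ep=/home/soi/CM/bin/_sgi4
 *
 * The "&" prefix marks hosts that pvmd3 does not start immediately but that
 * can be added to the virtual machine later with pvm_addhosts().
 */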
/* If on a cluster machine, see if a valid pvmd is already running, and
 * if not start one with an rsh to the machine for /etc/SGE/pvm_start.
 * Make sure that two different machines don't get in each other's way by
 * taking an exclusive lock on a /tmp file.
 * The rsh must be done from some other machine (use sonar) so that the SGE
 * doesn't end up knowing about the pvmd and killing it when the calling pe
 * exits. (NEW: Don't need to do the rsh from sonar if redirecting output is ok.)
 * Return -1 if not on a cluster machine.
 * Return 0 on failure.
 * Return 1 if on a cluster and a pvmd is now running.
 */
int ck_cluster(char *hostname, char *user, int (*msg)())
{
  char cmd[MAXSTR];
  int status, fdes;

  if(!access("/etc/SGE/pvm_start", F_OK)) {	/* on a cluster machine */
    sprintf(cmd, "/tmp/pvm_lock.%s", user);
    if((fdes=open(cmd, O_WRONLY | O_CREAT, 0644)) == -1) {
      (*msg)("Can't open %s. Proceed anyway.\n", cmd);
    }
    if(fdes != -1) {
      lockf(fdes, F_LOCK, 0);			/* lock or block */
    }
    if((status = ck_pe_rpc_dsds(msg))) {	/* can't reach dsds_svc */
      /* printf("Cannot reach dsds_svc, restarting\n"); */
      printf("Restarting...\n");
      if(status != 1) {				/* if other than no pvmd, exit it */
        pvm_exit();
        system("echo y | /CM/bin/_linux4/pvm_halt");
      }
      (*msg)("Starting a pvmd on cluster %s\n", hostname);
      /****old way to get around SGE killing the pvmd******
      sprintf(cmd, "rsh sonar \"rsh %s /etc/SGE/pvm_start&\"&", hostname);
      *****/
      sprintf(cmd, "rsh %s /etc/SGE/pvm_start >& /tmp/pvm_start.%s.log&",
		hostname, user);
      (*msg)("cmd: %s\n", cmd);			/* !!TEMP */
      if(system(cmd)) {
        (*msg)("Error starting pvm daemon with /etc/SGE/pvm_start\n");
        return(0);
      }
      while(1) {
        sleep(2);				/* let it start */
        if(pvm_mytid() > 0) break;
      }
      sleep(2);					/* let pe_rpc start */
    }
    if(fdes != -1) {
      lockf(fdes, F_ULOCK, 0);			/* unlock so others can run */
      close(fdes);
    }
    return(1);
  }
  return(-1);
}
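/* A minimal sketch (not used elsewhere in this file) of how one of the "&"
 * hosts from the hostfile built in start_pvm_now() could later be added to
 * the virtual machine. pvm_addhosts() returns the number of hosts
 * successfully added; a per-host status code lands in infos[].
 */
#ifdef START_PVM_EXAMPLE
static int add_host_later(char *host, int (*msg)())
{
  char *hosts[1];
  int infos[1];

  hosts[0] = host;			/* e.g. "sonar.Stanford.EDU" */
  if(pvm_addhosts(hosts, 1, infos) != 1) {
    (*msg)("pvm_addhosts() failed for %s (info=%d)\n", host, infos[0]);
    return(0);
  }
  return(1);
}
#endif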
/* See if a pvmd/pe_rpc is running in our virtual machine.
 * Give an error return if not.
 * If one is already running, verify that the dsds_svc can be contacted,
 * and print the db name that the dsds_svc is connected to.
 * Return 0 on success, else:
 *   1 = pvmd not running
 *   2 = pvmd internal error
 *   3 = pe_rpc not running
 *   4 = error contacting dsds_svc
 */
int ck_pe_rpc_dsds(int (*msg)())
{
  struct taskinfo *taskp;
  int i, ntask, perpctid;
  int mytid;

  if((mytid = pvm_mytid()) < 0) {	/* pvmd not running */
    (*msg)("pvmd not running\n");
    return(1);
  }
  if(pvm_tasks(0, &ntask, &taskp)) {	/* get all tasks on the virt machine */
    (*msg)("Error on pvm_tasks() call for pe_rpc info\n");
    return(2);
  }
  for(i=0; i<ntask; i++) {
    if(!strcmp(taskp[i].ti_a_out, "pe_rpc")) {
      perpctid=taskp[i].ti_tid;		/* current pe_rpc tid */
      break;
    }
  }
  if(i == ntask) {			/* pe_rpc not there */
    (*msg)("pe_rpc not there\n");
    return(3);
  }
  /* pe_rpc is there. Check the dsds_svc database.
   * For jpe, don't try to contact dsds_svc; perpctid above is only needed
   * by this commented-out call:
   *
   * KEY *alist, *blist;
   * alist=newkeylist();
   * if((blist = (KEY *)call_dsds(&alist, REQDBNAME, perpctid, mytid, msg, 0)) == NULL) {
   *   (*msg)("Error requesting database name from dsds_svc:\n");
   *   return(4);
   * }
   * freekeylist(&alist); freekeylist(&blist);
   */
  return(0);
}

/*
main(int argc, char *argv[])
{
  int mstat;

  mstat=start_pvm(printf);
  printf("The pvm tid = %d\n", mstat);
}
*/

/*
$Id: start_pvmd.c,v 1.2 2009/02/23 22:12:09 production Exp $
$Source: /home/cvsuser/cvsroot/JSOC/proj/jpe/apps/start_pvmd.c,v $
$Author: production $
*/
/* $Log: start_pvmd.c,v $
 * Revision 1.2  2009/02/23 22:12:09  production
 * elim call_dsds()
 *
 * Revision 1.1  2009/01/23 21:30:46  production
 * initial
 *
 * Revision 1.27  2006/03/27 22:49:42  jim
 * add mode 0644 to open()
 *
 * Revision 1.26  2004/11/09 23:25:25  rmunk
 * Updated file handling and removed extraneous printfs.
 *
 * Revision 1.25  2004/11/06 20:02:38  rmunk
 * removed C++ style comment that the SGI compiler chokes on.
 *
 * Revision 1.24  2004/11/05 18:43:26  rmunk
 * Changed file handling to speed up file locking.
 *
 * Revision 1.23  2003/05/15 17:38:02  jim
 * add comment on tarax alias to sonar
 *
 * Revision 1.22  2003/02/27 17:55:06  jim
 * remove adding host to the /tmp/hostfile.user
 *
 * Revision 1.21  2003/02/26 19:12:29  jim
 * change rick from _sgi to _linux4
 *
 * Revision 1.20  2002/08/29 23:20:28  jim
 * output to /tmp/pvm_start.%s.log and sleep(2)
 *
 * Revision 1.19  2002/08/21 23:14:00  jim
 * sprintf(cmd, "/tmp/pvm_lock.%s", user), user instead of hostname
 *
 * Revision 1.18  2002/08/14 17:08:20  jim
 * change shock to _linux4
 *
 * Revision 1.17  2002/07/02 20:50:03  jim
 * add exclusive lock on /tmp file
 *
 * Revision 1.16  2002/06/07 21:30:56  jim
 * don't need double rsh to "fool" SGE
 *
 * Revision 1.15  2002/06/07 00:04:42  jim
 * add code for cluster check and pvm_start call
 *
 * Revision 1.14  2001/08/24 20:56:35  jim
 * remove restriction of PVM_NOT only on tarax
 *
 * Revision 1.13  2001/05/02 21:04:28  jim
 * change phil to bin/_sgi4
 *
 * Revision 1.12  2000/03/20 22:14:42  jim
 * update host list. add sonar, remove flytrap, rumble
 *
 * Revision 1.11  1998/11/05 23:37:25  CM
 * add include for mips
 *
 * Revision 1.10  1998/10/14 18:48:55  jim
 * dont start pvmd on tarax if PVM_NOT file found
 *
 * Revision 1.9  1998/09/15 18:16:34  jim
 * add sginap delay
 *
 * Revision 1.8  1996/12/09 17:20:42  jim
 * change fault to _sgi4
 *
 * Revision 1.7  1995/07/24 20:28:39  jim
 * add yeti
 *
 * Revision 1.6  1995/06/30 17:37:30  jim
 * make tarax path _sgi4. add rick, phil, shoom
 *
 * Revision 1.5  1995/05/01 21:44:01  jim
 * add host shock and soidb
 *
 * Revision 1.4  1994/11/07 23:59:32  jim
 * added tarax.Stanford.EDU to hostfile
 *
 * Revision 1.3  1994/10/19 23:43:02  jim
 * put .user with hostfile name
 *
 * Revision 1.2  1994/09/16 18:20:43  jim
 * initial
 */
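/* A runnable version of the commented-out test driver above: a minimal
 * sketch, compiled only with -DSTART_PVM_TEST so the normal build is
 * unaffected.
 */
#ifdef START_PVM_TEST
int main(int argc, char *argv[])
{
  int mstat;

  mstat = start_pvm(printf);
  printf("The pvm tid = %d\n", mstat);
  return(mstat == 0);			/* exit 1 on failure (tid == 0) */
}
#endif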