![]() ![]() |
![]() |
File: [Development] / JSOC / proj / globalhs / scripts / doretile1
(download)
Revision: 1.14, Wed Jun 11 05:18:46 2014 UTC (8 years, 11 months ago) by tplarson Branch: MAIN CVS Tags: Ver_8-6, Ver_8-5 Changes since 1.13: +3 -1 lines added informative output to match doretilen |
#! /bin/tcsh -f

# doretile1 - retile a single interval in time.
#
# Arguments are given as name=value words; each one is handed to "set" and so
# becomes a shell variable of the same name:
#   starttime  (required) either a duration accepted by durcon, which is then
#              taken relative to the T_START_EPOCH of the input series, or a
#              time string accepted by time_convert
#   totaltime  (required) duration accepted by durcon
#   timechunk  (optional) duration accepted by durcon, defaults to totaltime
#   lchunkin   (required) width in l of each jretile job
#   lmin       (optional) default 0
#   lmax       (optional) default 300
#   lchunkout  (optional) default 1
#
# Environment read by this script:
#   JSOCROOT, JSOC_MACHINE     used to build PATH here and in the job scripts
#   GLOBALHS_RETNJOBS          number of job scripts to submit (default 6)
#   GLOBALHS_RETJOBTHRESHOLD   threshold passed to waittosubmit (default 2)
#   GLOBALHS_LABEL             if set, .LABEL is appended to job script names
#   GLOBALHS_RETQUEUE          queue name (default j.q)
#
# Files in the current directory:
#   ret.parms.blank            (required, non-empty) parameter template in which
#                              XXXX MMMM NNNN CCCC TTTT UUUU SSSS are replaced
#   ret.parms.LFIRST-LLAST     generated parameter file for one l chunk
#   ret.log.LFIRST-LLAST       output of the jretile run for that chunk
#   jretile.exitstatus.LFIRST-LLAST   exit status recorded after that run
#   subr.*                     generated job scripts
#   faillog                    written only when a rerun is needed
#   parms.tmp, version.tmp     scratch files, removed on successful completion
#
# Exit status: 0 on success, 1 on a bad or missing parameter, a missing
# template, a VERSION mismatch in the input, or jobs that still fail after
# one rerun.

setenv PATH $JSOCROOT/bin/$JSOC_MACHINE':'$JSOCROOT/proj/globalhs/scripts':'$PATH

# this script retiles a single interval in time

# submit njobs jobs at once
if (! $?GLOBALHS_RETNJOBS) then
  set njobs=6
else
  set njobs=$GLOBALHS_RETNJOBS
endif

# submit next batch of jobs when total number of retile jobs drops below jobthreshold
if (! $?GLOBALHS_RETJOBTHRESHOLD) then
  set jobthreshold=2
else
  set jobthreshold=$GLOBALHS_RETJOBTHRESHOLD
endif

# if this script inherited a label in its environment, use it as the suffix for job names
if (! $?GLOBALHS_LABEL) then
  set suff=''
else
  set suff=.$GLOBALHS_LABEL
endif

# directory that receives the stdout and stderr files of the submitted jobs
set qsubtmp=/tmp27/$USER/qsubtmp
mkdir -p $qsubtmp

if (! $?GLOBALHS_RETQUEUE) then
  set q=j.q
else
  set q=$GLOBALHS_RETQUEUE
endif

# queue a.q is driven through the *2 variants of the queue commands
if ($q == a.q) then
  alias qsub qsub2
  alias qstat qstat2
  alias waittosubmit waittosubmit2
endif

# turn every name=value argument into a shell variable
@ i = 1
while ($i <= $#argv )
  set $argv[$i]
  @ i++
end

if (! $?starttime) then
  echo must specify parameter starttime
  exit 1
endif

if (! $?totaltime) then
  echo must specify parameter totaltime
  exit 1
endif
set totalsecs = `durcon $totaltime`
if ($status) then
  echo incorrect format for parameter totaltime
  exit 1
endif

if (! $?timechunk) set timechunk = $totaltime
# the value is not used further below, the call validates the format of timechunk
set chunksecs = `durcon $timechunk`
if ($status) then
  echo incorrect format for parameter timechunk
  exit 1
endif

if (! $?lchunkin) then
  echo must specify parameter lchunkin
  exit 1
endif

if (! $?lmin) set lmin = 0
if (! $?lmax) set lmax = 300
if (! $?lchunkout) set lchunkout = 1

# reject the combination only when neither chunk size divides the other
# and both chunk sizes are no larger than lmax
if (($lchunkin % $lchunkout && $lchunkin < $lmax + 1) && ($lchunkout % $lchunkin && $lchunkout < $lmax + 1)) then
  echo lchunkout must evenly divide lchunkin or lchunkin must evenly divide lchunkout
  exit 1
endif

# informative output: when, where and how this script was invoked
date
echo $PWD
echo $0 $argv

if (! -es ret.parms.blank) then
  echo parameter file blank missing: ret.parms.blank is required
  exit 1
endif

# strip comment lines from the template and put one whitespace-separated word per line
grep -v "^[[:space:]]*#" ret.parms.blank | sed s@"[[:space:]]"@"\n"@g | grep -v '^[[:space:]]*$' > parms.tmp

# series name: the part of the in= word before the first bracket, quotes removed
set in = `cat parms.tmp | grep ^in= | sed s/\'//g | sed s/\"//g | awk -F '[=[]' '{print $2}'`

# keyword information is taken from field 5 of the matching show_info -j lines
set epoch = `show_info -j $in | grep -i '^keyword:t_start_epoch' | cut -d, -f5`
set epochsecs = `time_convert o=jsoc time=$epoch`
set tstartstep = `show_info -j $in | grep -i '^keyword:t_start_step' | cut -d, -f5`
set cadence = `show_info -j $in | grep -i '^keyword:t_step' | cut -d, -f5`

# starttime is either a duration relative to the epoch or a time string
durcon $starttime >& /dev/null
if ($status) then
  set startsecs = `time_convert o=jsoc time=$starttime`
else
  set startsecs = `durcon $starttime`
  set startsecs = `echo "$startsecs + $epochsecs" | bc`
endif
set start = `time_convert o=cal zone=tai s=$startsecs`

# bracketed field of the in= record set that contains TAG, and its 4th bracketed field
set tagclause = `cat parms.tmp | grep ^in= | cut -d= -f2- | sed s/"'"/""/g | sed s@"\["@"\n"@g | sed s@]@"\n"@g | grep TAG`
set ndt = `cat parms.tmp | grep ^in= | awk -F '[][]' '{print $8}'`

# t_start_index of the input record whose t_start/t_stop range contains the start time
set firstindexin = `show_info -q $in\[]\[$lmin]\[]\[$ndt]\[$tagclause]'[? t_start <= $('$start') and t_stop > $('$start') ?]' key=t_start_index`
set firstindexout = `echo "($startsecs - $epochsecs)/$tstartstep" | bc`
set indexchunkout = `echo "$totalsecs/$tstartstep" | bc`
@ indexchunkin = ($firstindexout - $firstindexin) + $indexchunkout

# range of l chunk numbers that covers lmin through lmax
@ lchunkfirst = $lmin / $lchunkin
@ lchunklast = $lmax / $lchunkin
@ nlc = ($lchunklast - $lchunkfirst) + 1

# all input records must carry a single value of the version keyword
set inrecset = $in'[#'$firstindexin/$indexchunkin']['$lmin-$lmax']['$lmin-$lmax']['$ndt']['$tagclause']'
show_info -q "$inrecset" key=version >& version.tmp
set version = `uniq version.tmp`
if ($#version > 1) then
  echo VERSION varies across the input
  exit 1
endif

# limit desc 1024 used to be necessary, apparently now it isn't

# one parameter file per l chunk; chunks are dealt round-robin onto njobs job scripts
@ lc = $lchunkfirst
while ($lc <= $lchunklast)
  @ lfirst = $lc * $lchunkin
  @ llast = $lfirst + $lchunkin - 1
  if ($lfirst < $lmin) set lfirst = $lmin
  if ($llast > $lmax) set llast = $lmax
  @ r = ( $lc - $lchunkfirst ) % $njobs
  set subfile = subr.$firstindexout.$lmin-$lmax.$r$suff
  # the first njobs chunks each start a fresh job script, later chunks append to one
  if ( ( $lc - $lchunkfirst ) < $njobs) then
    echo '#\!/bin/csh' > $subfile
    echo 'setenv PATH' $JSOCROOT'/bin/$JSOC_MACHINE' >> $subfile
    echo 'cd' $PWD >> $subfile
#   echo 'limit desc 1024' >> $subfile
  endif
  cat parms.tmp | sed s@XXXX@"#$firstindexin/$indexchunkin"@g | sed s/MMMM/$llast/g | sed s/NNNN/$lfirst/g | sed s/CCCC/$lchunkout/g | sed s/TTTT/$totaltime/g | sed s/UUUU/$timechunk/g | sed s/SSSS/$start/g > ret.parms.$lfirst-$llast
  # each chunk contributes a timed jretile run plus a line recording its exit status
  echo \(time jretile @ret.parms.$lfirst-$llast\) '>&' ret.log.$lfirst-$llast >> $subfile
  echo echo \$status '>&' jretile.exitstatus.$lfirst-$llast >> $subfile
  @ lc++
end

# fewer chunks than requested jobs means fewer job scripts were created
if ($nlc < $njobs) set njobs=$nlc
echo $njobs job scripts created

waittosubmit $jobthreshold subr

set i=0
while ($i < $njobs)
  set subfile = subr.$firstindexout.$lmin-$lmax.$i$suff
  qsub -q $q -e $qsubtmp -o $qsubtmp $subfile
  @ i++
end

echo jobs submitted, start waiting

# poll once a minute until qstat lists no job of this user whose full job name
# matches this index, this l range and this suffix
set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subr.$firstindexout | grep $lmin-$lmax | grep "$suff" | wc -l`
while($njobsrunning > 0)
  sleep 60
  set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subr.$firstindexout | grep $lmin-$lmax | grep "$suff" | wc -l`
end

#check for errors here
# rerunlist: l ranges that appear in only one of the ret.parms and ret.log listings
set expectedlist = `/bin/ls ret.parms.[0-9]* | cut -d'.' -f3`
set ranlist = `/bin/ls ret.log.* | cut -d'.' -f3`
set rerunlist = `echo $expectedlist $ranlist | sed s/" "/"\n"/g | sort | uniq -u`
# errlist1: l ranges whose recorded exit status is not 0.  -x makes grep compare
# whole lines, so only a status of exactly 0 counts as success; a substring
# match would also accept statuses such as 10, 20 or 100.
set errlist1 = `grep -Hvx 0 jretile.exitstatus.* | awk -F '[.:]' '{print $3}'`
# errlist2: l ranges whose log does not report successful completion
set errlist2 = `grep -Hc "successful completion" ret.log.* | grep -v :1 | awk -F '[.:]' '{print $3}'`
set errlist = `echo $errlist1 $errlist2 | sed s/" "/"\n"/g | sort | uniq`

if ($#rerunlist || $#errlist) then
  echo some jobs fail, rerunning
  echo rerunlist: $rerunlist > faillog
  echo errlist1: $errlist1 >> faillog
  echo errlist2: $errlist2 >> faillog
  # a single redo job script reruns every failed or missing l range in sequence
  set subfile = subr.$firstindexout.redo.$lmin-$lmax$suff
  echo '#\!/bin/csh' > $subfile
  echo 'setenv PATH' $JSOCROOT'/bin/$JSOC_MACHINE' >> $subfile
  echo 'cd' $PWD >> $subfile
# echo 'limit desc 1024' >> $subfile
  foreach ind ($errlist $rerunlist)
    set ilmin = `echo $ind | cut -d'-' -f1`
    set ilmax = `echo $ind | cut -d'-' -f2`
    echo \(time jretile @ret.parms.$ilmin-$ilmax\) '>&' ret.log.$ilmin-$ilmax >> $subfile
    echo echo \$status '>&' jretile.exitstatus.$ilmin-$ilmax >> $subfile
  end
  waittosubmit $jobthreshold subr
  qsub -q $q -e $qsubtmp -o $qsubtmp subr.$firstindexout.redo.$lmin-$lmax$suff
  set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subr.$firstindexout | grep $lmin-$lmax | grep "$suff" | wc -l`
  while($njobsrunning > 0)
    sleep 60
    set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subr.$firstindexout | grep $lmin-$lmax | grep "$suff" | wc -l`
  end
  # same checks as before the rerun, again requiring a status of exactly 0
  set ranlist = `/bin/ls ret.log.* | cut -d'.' -f3`
  set rerunlist = `echo $expectedlist $ranlist | sed s/" "/"\n"/g | sort | uniq -u`
  set errlist1 = `grep -Hvx 0 jretile.exitstatus.* | awk -F '[.:]' '{print $3}'`
  set errlist2 = `grep -Hc "successful completion" ret.log.* | grep -v :1 | awk -F '[.:]' '{print $3}'`
  if ($#errlist1 || $#errlist2 || $#rerunlist) then
    echo some jobs still fail, i give up
    exit 1
  endif
endif

echo successful completion
rm parms.tmp version.tmp
exit 0
Karen Tian |
Powered by ViewCVS 0.9.4 |