(file) Return to dogapfill CVS log (file) (dir) Up to [Development] / JSOC / proj / globalhs / scripts

File: [Development] / JSOC / proj / globalhs / scripts / dogapfill (download)
Revision: 1.12, Fri Feb 13 01:53:10 2015 UTC (3 years, 2 months ago) by tplarson
Branch: MAIN
CVS Tags: Ver_LATEST, Ver_9-1, Ver_9-0, Ver_8-8, Ver_8-7, Ver_8-12, Ver_8-11, Ver_8-10
Changes since 1.11: +2 -2 lines
change from a.q to k.q

#!/bin/tcsh -f

# Environment setup for dogapfill.
# Reads optional overrides from the environment:
#   GLOBALHS_TSFNJOBS         number of job scripts submitted at once (default 32)
#   GLOBALHS_TSFJOBTHRESHOLD  threshold handed to waittosubmit (default 40)
#   GLOBALHS_LABEL            label appended to job names as .LABEL (default none)
#   GLOBALHS_TSFQUEUE         queue handed to qsub -q (default k.q)
# Each default is assigned first and then overridden inside a multi-line
# if/then, so an unset variable is never expanded.

# put the JSOC binaries and the globalhs scripts in front of the search path
setenv PATH ${JSOCROOT}/bin/${JSOC_MACHINE}:${JSOCROOT}/proj/globalhs/scripts:${PATH}

# submit njobs jobs at once
set njobs=32
if ($?GLOBALHS_TSFNJOBS) then
  set njobs=$GLOBALHS_TSFNJOBS
endif

# submit next batch of jobs when total number of gapfill jobs drops below jobthreshold
set jobthreshold=40
if ($?GLOBALHS_TSFJOBTHRESHOLD) then
  set jobthreshold=$GLOBALHS_TSFJOBTHRESHOLD
endif

# a label inherited from the environment becomes the suffix of the job names
set suff=''
if ($?GLOBALHS_LABEL) then
  set suff=.$GLOBALHS_LABEL
endif

# queue used for submission
set q=k.q
if ($?GLOBALHS_TSFQUEUE) then
  set q=$GLOBALHS_TSFQUEUE
endif

# directory given to qsub for the stdout and stderr files of the jobs
set qsubtmp=/tmp27/$USER/qsubtmp
mkdir -p $qsubtmp

# on queue k.q the commands qsub, qstat and waittosubmit are replaced by
# their *2 variants
if ($q == k.q) then
  alias qsub qsub2
  alias qstat qstat2
  alias waittosubmit waittosubmit2
endif

# Parse the command line.  Every argument is handed to set, so an argument
# of the form name=value becomes a shell variable of that name.  Because any
# name can be assigned this way, an argument can also replace a variable
# that was set earlier in this script.
@ i = 1
while ($i <= $#argv )
  set $argv[$i]
@ i++
end

# starttime and totaltime are mandatory
if (! $?starttime) then
  echo must specify parameter starttime
  exit 1
endif

if (! $?totaltime) then
  echo must specify parameter totaltime
  exit 1
endif

# durcon output is kept as totalsecs (presumably the duration in seconds,
# TODO confirm); a nonzero status is reported as a malformed totaltime
set totalsecs = `durcon $totaltime`
if ($status) then
  echo incorrect format for parameter totaltime
  exit 1
endif

# defaults for the l range and for the number of l values per chunk
if (! $?lmin) set lmin = 0
if (! $?lmax) set lmax = 300
if (! $?lchunk) set lchunk = 10

# log when, where and how this script was invoked
echo `date` on $HOST
echo $PWD
echo $0 $argv

# the template parameter file must exist (-e) and be non-empty (-s)
if (! -es tsf.parms.blank) then
  echo parameter file blank missing: tsf.parms.blank is required
  exit 1
endif

# Derive the series, timing and l-chunk parameters used to build the jobs.
# Reads : tsf.parms.blank, starttime, totalsecs, lmin, lmax, lchunk
# Writes: parms.tmp and the variables in, epoch, epochsecs, tstartstep,
#         cadence, ndt, startsecs, firstindex, lchunkfirst, lchunklast, nlc
# Exits with status 1 if the input series or its time keywords cannot be
# determined, instead of going on to build jobs from empty values.

# Flatten the template: drop comment lines, put every whitespace-separated
# token on its own line and discard blank lines.
grep -v "^[[:space:]]*#" tsf.parms.blank | sed s@"[[:space:]]"@"\n"@g | grep -v '^[[:space:]]*$'  > parms.tmp

# Input series: text between in= and the first [ with all quotes removed.
set in = `grep ^in= parms.tmp | sed s/\'//g | sed s/\"//g | awk -F '[=[]' '{print $2}'`
if ($#in == 0) then
  echo parameter in= not found in tsf.parms.blank
  exit 1
endif

# Time keywords of the input series, taken from field 5 of the matching
# keyword lines printed by show_info -j.
set epoch = `show_info -j $in | grep -i '^keyword:t_start_epoch' | cut -d, -f5`
set tstartstep = `show_info -j $in | grep -i '^keyword:t_start_step' | cut -d, -f5`
set cadence = `show_info -j $in | grep -i '^keyword:t_step' | cut -d, -f5`
if ("$epoch" == "" || "$tstartstep" == "" || "$cadence" == "") then
  echo could not read t_start_epoch, t_start_step and t_step from series $in
  exit 1
endif

set epochsecs = `time_convert o=jsoc time=$epoch`
# number of time steps in the requested interval (bc integer division)
set ndt   = `echo "$totalsecs / $cadence" | bc`

# starttime is accepted in two forms.  If durcon rejects it, it is converted
# with time_convert; otherwise the durcon result is taken as an offset and
# added to the epoch.
durcon $starttime >& /dev/null
if ($status) then
  set startsecs = `time_convert o=jsoc time=$starttime`
else
  set startsecs = `durcon $starttime`
  set startsecs = `echo "$startsecs + $epochsecs" | bc`
endif

# index of the first timeseries, counted in units of tstartstep from the epoch
set firstindex = `echo "($startsecs - $epochsecs)/$tstartstep" | bc`

# chunk numbers holding lmin and lmax, and the number of chunks in between
@ lchunkfirst = $lmin / $lchunk
@ lchunklast  = $lmax / $lchunk
@ nlc = ($lchunklast - $lchunkfirst) + 1

# Build one parameter file per l-chunk and spread the jtsfiddle commands
# over at most njobs job scripts named subg.<firstindex>.<lmin>-<lmax>.<r><suff>,
# where r is the chunk position modulo njobs.
@ lc = $lchunkfirst
while ($lc <= $lchunklast)
  # l range of this chunk, clipped to the requested lmin and lmax
  @ lfirst = $lc * $lchunk
  @ llast  = $lfirst + $lchunk - 1
  if ($lmin > $lfirst) set lfirst = $lmin
  if ($lmax < $llast)  set llast  = $lmax

  # job script that receives this chunk
  @ r = ($lc - $lchunkfirst) % $njobs
  set subfile = subg.$firstindex.$lmin-$lmax.$r$suff

  # the first njobs chunks each start a fresh job script with its header
  if ($lc < ($lchunkfirst + $njobs)) then
    echo '#\!/bin/csh' > $subfile
    echo 'setenv PATH' $JSOCROOT'/bin/$JSOC_MACHINE' >> $subfile
    echo 'cd' $PWD >> $subfile
  endif

  # fill the placeholders XXXX, MMMM, NNNN and TTTT of the flattened template
  sed -e s@XXXX@$starttime@g -e s/MMMM/$llast/g -e s/NNNN/$lfirst/g -e s/TTTT/$ndt/g parms.tmp > tsf.parms.$lfirst-$llast

  # append the timed jtsfiddle run and the recording of its exit status
  echo "(time jtsfiddle @tsf.parms.$lfirst-$llast) >& tsf.log.$lfirst-$llast" >> $subfile
  echo 'echo $status >&' jtsfiddle.exitstatus.$lfirst-$llast >> $subfile

  @ lc++
end

# Submit the job scripts, then poll qstat once a minute until no job of this
# run is listed any more.
waittosubmit $jobthreshold subg

# fewer chunks than job slots: only nlc job scripts were written
if ($nlc < $njobs) set njobs=$nlc

set i=0
while ($i < $njobs)
  set subfile = subg.$firstindex.$lmin-$lmax.$i$suff
  qsub -q $q -e $qsubtmp -o $qsubtmp $subfile
  @ i++
end

echo jobs submitted, start waiting

# count the listed jobs whose full name matches this run; stop when none is left
while (1)
  set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subg.$firstindex | grep $lmin-$lmax | grep -c "$suff"`
  if ($njobsrunning <= 0) break
  sleep 60
end

# Check the results of the first pass and rerun failed chunks once.
#   expectedlist  l ranges that have a parameter file tsf.parms.<range>
#   ranlist       l ranges that have a log file tsf.log.<range>
#   rerunlist     ranges present in only one of the two lists
#   errlist1      ranges whose recorded exit status is not exactly 0
#   errlist2      ranges whose log does not match :1 in the count of
#                 lines containing successful completion
# Exits 1 if anything is still failing after the rerun, 0 otherwise.
set expectedlist = `/bin/ls tsf.parms.[0-9]* | cut -d'.' -f3`
set ranlist      = `/bin/ls tsf.log.* | cut -d'.' -f3`
set rerunlist = `echo $expectedlist $ranlist | sed s/" "/"\n"/g | sort | uniq -u`

# -x compares the whole line with 0, so a status such as 10 or 130 is no
# longer mistaken for success just because it contains the digit 0
set errlist1 = `grep -Hvx 0 jtsfiddle.exitstatus.* | awk -F '[.:]' '{print $3}'`
# -H keeps the file name prefix even when only one log file exists, so the
# range can still be extracted by awk
set errlist2 = `grep -cH "successful completion" tsf.log.* | grep -v :1 | awk -F '[.:]' '{print $3}'`
set errlist = `echo $errlist1 $errlist2 | sed s/" "/"\n"/g | sort | uniq`
# sort -u: a range reported by more than one check is rerun only once
set list = `echo $rerunlist $errlist | sed s/" "/"\n"/g | sort -u`

if ($#list) then
  echo some jobs fail, rerunning
  echo rerunlist: $rerunlist > faillog
  echo errlist1: $errlist1 >> faillog
  echo errlist2: $errlist2 >> faillog
  # keep the job scripts of the first pass out of the way
  mkdir -p subsave
  mv subg.* subsave

  # rebuild the job scripts with only the failed ranges, spread over njobs files
  set i=0
  while ($i < $#list)
  @ r = $i % $njobs
    set subfile = subg.$firstindex.$lmin-$lmax.$r$suff
    if ( $i < $njobs) then
      echo '#\!/bin/csh' > $subfile
      echo 'setenv PATH' $JSOCROOT'/bin/$JSOC_MACHINE' >> $subfile
      echo 'cd' $PWD >> $subfile
    endif
  # list indices start at 1, so the counter is advanced before it is used
  @ i++
    echo \(time jtsfiddle @tsf.parms.$list[$i]\) '>&' tsf.log.$list[$i] >> $subfile
    echo echo \$status '>&' jtsfiddle.exitstatus.$list[$i] >> $subfile
  end

  waittosubmit $jobthreshold subg
  set i=0
  if ($#list < $njobs) set njobs = $#list
  while ($i < $njobs)
    set subfile = subg.$firstindex.$lmin-$lmax.$i$suff
    qsub -q $q -e $qsubtmp -o $qsubtmp $subfile
  @ i++
  end

  # wait until no job of this run is listed by qstat
  set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subg.$firstindex | grep $lmin-$lmax | grep "$suff" | wc -l`
  while($njobsrunning > 0)
    sleep 60
    set njobsrunning = `qstat -r -u $USER | grep "Full jobname:" | grep subg.$firstindex | grep $lmin-$lmax | grep "$suff" | wc -l`
  end

  # same checks as above, applied to the results of the rerun
  set ranlist   = `/bin/ls tsf.log.*   | cut -d'.' -f 3`
  set rerunlist = `echo $expectedlist $ranlist | sed s/" "/"\n"/g | sort | uniq -u`
  set errlist1  = `grep -Hvx 0 jtsfiddle.exitstatus.* | awk -F '[.:]' '{print $3}'`
  set errlist2  = `grep -cH "successful completion" tsf.log.* | grep -v :1 | awk -F '[.:]' '{print $3}'`

  if ($#errlist1 || $#errlist2 || $#rerunlist) then
    echo some jobs still fail, i give up
    exit 1
  endif
endif

echo successful completion

# remove the saved first-pass job scripts and the flattened template
rm -rf subsave parms.tmp

exit 0

Karen Tian
Powered by
ViewCVS 0.9.4