(file) Return to BigRedButton.csh CVS log (file) (dir) Up to [Development] / JSOC / proj / workflow

  1 phil  1.1 #! /bin/csh -f
  2           
###  BigRedButton.csh should be run as jsocprod on n04 to clean up gates and tasks
###  after a major failure in production processing:
###
###  1. Stop the gatekeeper
###  2. Kill all taskmanagers running on n04
###  3. Delete all tickets and miscellaneous files from all gates and tasks
###  4. Kill anything running in qsub
###  5. Run the cleanup script and delete failed directories to make it easier to check gates and tasks
###  6. Check for valid low and high keys in gates
###  7. Restart the gatekeeper
 13           
 14           set user = $USER
 15           if ( $user != 'jsocprod' ) then
 16             echo ""
 17             echo "Must run as user jsocprod on n04."
 18             echo ""
 19             exit
 20           endif
 21           
 22 phil  1.1 set WORKFLOW_DATA = /home/jsoc/pipeline
 23 jeneen 1.2 #set WORKFLOW_ROOT = /home/phil/jsoc/proj/workflow
 24            set WORKFLOW_ROOT = /home/jsoc/cvs/Development/JSOC/proj/workflow
 25 phil   1.1 set TASKS = $WORKFLOW_DATA/tasks
 26            set GATES = $WORKFLOW_GATES/gates
 27            
 28            set echo 
 29            
 30            #  1  #
 31            
 32            rm $WORKFLOW_DATA/Keep_running
 33            
 34            
 35             #  2  #
 36            
 37            @ TM_num = `ps -ef | grep taskmanager.csh | wc -l`
 38            while  ( $TM_num > 0 )
 39              foreach TM ( `ps -ef | grep taskmanager.csh | awk '{print $2}'` ) 
 40                kill -9 $TM
 41              end
 42             e@ TM_num = `ps -ef | grep taskmanager.csh | wc -l`
 43            while  ( `ps -ef | grep taskmanager.csh | wc -l` > 0 )
 44            end
 45            
 46 phil   1.1 
 47            #  3  #
 48            
 49            cd $TASKS
 50            foreach task ( * )
 51              rm -rf $task/active/*
 52              echo 0 > $task/state
 53              rm $task/active/$task'-root'/pending_tickets/*
 54            end
 55            
 56            cd $GATES
 57            foreach gate ( * )
 58              rm $gate/active_tickets/*
 59              rm $gate/new_tickets/*
 60            end
 61            
 62            
 63            ##  4  ##
 64            
 65            foreach QSub ( `qstat | grep jsocprod | egrep '(OBS|VEC|NRT|IMG|MSK|FITS|keiji)' | awk '{print $1}'` )
 66              qdel $Qsub
 67 phil   1.1 end
 68            
 69            
 70            ##  5  ##
 71            
 72            cd $WORKFLOW_ROOT
 73            ./cleanup.csh
 74            cd $TASKS
 75            foreach task ( * )
 76              rm -rf $task/archive/failed/*
 77            end
 78            
 79            
 80            ##  6  ##
 81             
 82            cd $GATES
 83            foreach gate ( * )
 84              echo $gate
 85              cat $gate/low
 86              cat $gate/high
 87              echo ""
 88 phil   1.1 end
 89            
 90            ##  7  ##
 91            
 92 jeneen 1.2 #/home/phil/jsoc/proj/workflow/gatekeeper.restart >> /home/jsoc/pipeline/restart.log &
 93            /home/jsoc/cvs/Development/JSOC/proj/workflow/gatekeeper.restart >> /home/jsoc/pipeline/restart.log &
 94 phil   1.1 
 95            
 96            echo "1. Check for bad low high times in gates (should be the last thing on the screen)."
 97            echo "2. Make sure there are no taskmanagers running."
 98            echo "3. Check gates and tasks (chechgates.csh | more, etc)."
 99            echo "4. Restart failed tickets or run maketickets to get things running again."
100            
101            
102            
103            

Karen Tian
Powered by
ViewCVS 0.9.4