#! /bin/csh -f

### BigRedButton.csh should be run as jsocprod on n04 to clean up gates and tasks
### after a major failure in production processing:
###
### 1. Stop the gatekeeper
### 2. Kill all taskmanagers running on n04
### 3. Delete all tickets and miscellaneous files from all gates and tasks
### 4. Kill anything running in qsub
### 5. Run the cleanup script and delete failed directories to make it easier to check gates and tasks
### 6. Check for valid low and high keys in gates
### 7. Restart the gatekeeper

# Refuse to run as anyone but the production account: everything below
# deletes files and kills jobs on behalf of jsocprod.
set user = $USER
if ( "$user" != 'jsocprod' ) then
    echo ""
    echo "Must run as user jsocprod on n04."
    echo ""
    # A bare "exit" would return the status of the echo above (0), making a
    # refused run look like success to whatever invoked the script.
    exit 1
endif

# Directory holding the live pipeline state used below: tasks/, gates/,
# the Keep_running flag file and restart.log.
set WORKFLOW_DATA = /home/jsoc/pipeline

# Directory holding the workflow scripts (cleanup.csh, gatekeeper.restart).
#set WORKFLOW_ROOT = /home/phil/jsoc/proj/workflow
set WORKFLOW_ROOT = /home/jsoc/cvs/Development/JSOC/proj/workflow

set TASKS = $WORKFLOW_DATA/tasks

# GATES used to be built from $WORKFLOW_GATES unconditionally, but this script
# never sets that variable, so csh stopped with "Undefined variable" unless the
# caller's environment happened to provide it.  Honour the environment value
# when present; otherwise fall back to the gates directory next to tasks/.
if ( $?WORKFLOW_GATES ) then
    set GATES = $WORKFLOW_GATES/gates
else
    set GATES = $WORKFLOW_DATA/gates
endif

# Echo every command before it is executed so the operator can follow along.
set echo

# 1 #
# Remove the Keep_running flag file (step 1 in the file header: stop the
# gatekeeper).  -f: it is not an error if the flag is already gone.

rm -f $WORKFLOW_DATA/Keep_running


# 2 #
# Kill every taskmanager.csh process, repeating until none are left.
#
# The pattern is written '[t]askmanager.csh' so that the grep process itself
# (whose command line contains "[t]askmanager.csh", not "taskmanager.csh") is
# not counted.  With a plain 'taskmanager.csh' pattern the count could never
# drop to zero and the loop would not terminate.

@ TM_num = `ps -ef | grep '[t]askmanager.csh' | wc -l`
while ( $TM_num > 0 )
    foreach TM ( `ps -ef | grep '[t]askmanager.csh' | awk '{print $2}'` )
        kill -9 $TM
    end
    # Re-count: new taskmanagers may have been spawned while we were killing.
    @ TM_num = `ps -ef | grep '[t]askmanager.csh' | wc -l`
end


# 3 #
# Empty each task's active area, reset its state file to 0, and delete all
# active and new tickets from each gate.

# Many of the globs below legitimately match nothing (e.g. a gate with no
# tickets).  Without nonomatch csh raises "No match." and stops the script
# part-way through the cleanup.
set nonomatch

cd $TASKS
foreach task ( * )
    # Skip stray plain files: "> $task/state" would fail on them.
    if ( -d $task ) then
        rm -rf $task/active/*
        echo 0 > $task/state
        # NOTE(review): the rm -rf above has already removed
        # $task/active/$task-root, so this glob can no longer match anything.
        # Kept (made non-fatal with -f) until the intended target is confirmed.
        rm -f $task/active/$task'-root'/pending_tickets/*
    endif
end

cd $GATES
foreach gate ( * )
    if ( -d $gate ) then
        rm -f $gate/active_tickets/*
        rm -f $gate/new_tickets/*
    endif
end

unset nonomatch


## 4 ##
# Delete every batch job whose qstat line mentions jsocprod and one of the
# strings listed in the egrep pattern.  Column 1 of qstat is passed to qdel.
#
# The loop variable is QSub; csh variable names are case-sensitive, so the
# former "$Qsub" in the body was an undefined variable and no job was deleted.

foreach QSub ( `qstat | grep jsocprod | egrep '(OBS|VEC|NRT|IMG|MSK|FITS|keiji)' | awk '{print $1}'` )
    qdel $QSub
end


## 5 ##
# Run the workflow cleanup script, then empty every task's archive/failed
# directory.

cd $WORKFLOW_ROOT
./cleanup.csh

# A task with nothing under archive/failed would otherwise raise "No match."
# and stop the script before the gatekeeper is restarted.
set nonomatch
cd $TASKS
foreach task ( * )
    rm -rf $task/archive/failed/*
end
unset nonomatch


## 6 ##
# For each entry in the gates directory print its name followed by the
# contents of its "low" and "high" files, then a blank separator line, so the
# operator can inspect the values by eye.

cd $GATES
foreach gate_dir ( * )
    echo $gate_dir
    cat $gate_dir/low $gate_dir/high
    echo ""
end

## 7 ##
# Restart the gatekeeper in the background, appending its standard output to
# restart.log.  The paths are taken from WORKFLOW_ROOT / WORKFLOW_DATA (set at
# the top of this script to the same directories that used to be spelled out
# here) so that a future relocation only has to be made in one place.

#/home/phil/jsoc/proj/workflow/gatekeeper.restart >> /home/jsoc/pipeline/restart.log &
$WORKFLOW_ROOT/gatekeeper.restart >> $WORKFLOW_DATA/restart.log &


# Follow-up checklist for the operator.
# NOTE(review): "chechgates.csh" in item 3 looks like a typo for
# "checkgates.csh" - confirm the real script name before changing the message.
echo "1. Check for bad low high times in gates (should be the last thing on the screen)."
echo "2. Make sure there are no taskmanagers running."
echo "3. Check gates and tasks (chechgates.csh | more, etc)."
echo "4. Restart failed tickets or run maketickets to get things running again."