General
===============================
Known jobs should be running for only few minutes.
However, since they are connected via DB_LINKS to several remote databases, due to networks issues connection can be stuck, and job is hanged.
The workaround - it to kill this job, and start execution again.
Following code is executed from crontab every 15 minutes, and checks for jobs which run longer than 30 minutes.
===============================
Code
===============================
less kill_long_running_jobs.sh
#!/bin/bash
. /etc/profile
. /etc/sh/orash/oracle_login.sh igt
handle_return_code() {
ret_code=$1
step_name=$2
if [[ $ret_code -ne 0 ]];then
echo $step_name Finished With Error!! Return Code $ret_code
exit $ret_code
fi
}
WORK_DIR=/software/oracle/oracle/scripts
LOG_DIR=${WORK_DIR}
RUN_TIME=`date +"%Y%m%d_%H%M%S"`
LOG_FILE=killed_sessions.log
TEMP_FILE=long_running_sessions.tmp
cd ${WORK_DIR}
touch ${LOG_DIR}/${LOG_FILE}
rm -f ${TEMP_FILE}
sqlplus -s /nolog <<EOF
whenever sqlerror exit SQL.SQLCODE
@./kill_long_running_jobs.sql
EOF
handle_return_code $? ./kill_long_running_jobs.sql
if [[ ! -f ${TEMP_FILE} || ! -s ${TEMP_FILE} ]]; then
echo "$RUN_TIME No Long Running Sessions Found" >> ${LOG_DIR}/${LOG_FILE}
exit 0
fi
echo reading file ${TEMP_FILE}
while read line
do
KILL_CMD=`echo $line | awk 'BEGIN {FS="~"}{print $1}'`
WHAT=`echo $line | awk 'BEGIN {FS="~"}{print $2}'`
echo "$RUN_TIME ${KILL_CMD} ${WHAT}" >> ${LOG_DIR}/${LOG_FILE}
${KILL_CMD}
done < ${TEMP_FILE}
less kill_long_running_jobs.sql
@./set_user.sql
connect &&user/&&pass@&&conn_str
PROMPT CONNECTED TO &&user/&&pass@&&conn_str
SET HEADING OFF
SET PAGESIZE 0
SET LINESIZE 200
SET NEWPAGE NONE
SET FEEDBACK OFF
SET VERIFY OFF
SPOOL long_running_sessions.tmp
SELECT 'kill -9 '||V_PROCESS.spid||'~'||DBA_JOBS.what AS LINUX_KILL
FROM V$SESSION V_SESSION,
V$PROCESS V_PROCESS,
DBA_JOBS_RUNNING STUCK_JOBS,
DBA_JOBS
WHERE V_PROCESS.addr = V_SESSION.paddr
AND V_SESSION.type != 'BACKGROUND'
AND V_SESSION.sid = STUCK_JOBS.sid
AND DBA_JOBS.job = STUCK_JOBS.job
AND logon_time < (SYSDATE - 30/1440)
AND UPPER(DBA_JOBS.what) IN ('EXT_MOCO.SCHEDULE;','EXT_OVMD.SCHEDULE;','EXT_IPN.SCHEDULE;','EXT_GLR.SCHEDULE;','EXT_SPARX.SCH
EDULE;')
UNION ALL
SELECT 'kill -9 4545~EXT_SPARX.SCHEDULE;' AS LINUX_KILL
FROM DUAL
WHERE 1=2;
SPOOL OFF
less set_user.sql
DEFINE user=user_name
DEFINE pass=user_pass
DEFINE conn_str=orainst