#!/bin/bash
# dameng_monitor.sh - Dameng (DM) database monitoring script (Ctrl+C handling fixed)
# Usage: bash dameng_monitor.sh [options] [command]
#
# NOTE: the shebang must be the very first line of the file. A bare
# "sh dameng_monitor.sh" line used to precede it, which made the script
# re-invoke itself on start-up; it is kept above only as a usage hint.

# Default configuration (each value can be overridden on the command line)
DB_HOST="10.56.0.98"
DB_PORT="5236"
CHECK_INTERVAL="2"
LOG_FILE="/var/log/dameng_monitor.log"
PID_FILE="/var/run/dameng_monitor.pid"

# ANSI colour sequences; stored with literal backslashes and expanded by
# the `echo -e` calls that print them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Global run-time state. The counters carry the integer attribute, so
# assignments to them are evaluated arithmetically.
declare -i check_count=0
declare -i consecutive_failures=0
declare -i total_failures=0
declare -i total_success=0
start_time=$(date +%s)   # epoch seconds at script start
DAEMON_MODE=false        # switched to true by the daemon main loop
# Print the help text to stdout and terminate the script with status 0.
# The option section shows the current values of DB_HOST, DB_PORT,
# CHECK_INTERVAL, LOG_FILE and PID_FILE as the defaults, and $0 is used as
# the program name in the usage line and the examples.
show_usage() {
  # Unquoted heredoc delimiter: $0 and the configuration variables expand.
  cat <<EOF
达梦数据库监控脚本

用法: $0 [选项] [命令]

命令:
 start 启动后台监控(守护进程)
 stop 停止监控
 status 查看状态
 restart 重启监控
 log 实时查看日志
 (无命令) 前台运行模式(Ctrl+C可停止)

选项:
 -h, --host HOST 数据库主机地址 (默认: $DB_HOST)
 -p, --port PORT 数据库端口 (默认: $DB_PORT)
 -i, --interval SECONDS 检查间隔秒数 (默认: $CHECK_INTERVAL)
 -l, --log FILE 日志文件路径 (默认: $LOG_FILE)
 --pid FILE PID文件路径 (默认: $PID_FILE)
 --help 显示此帮助信息

示例:
 $0 start # 后台启动守护进程
 $0 -h 192.168.1.100 -p 5236 start # 指定IP和端口后台启动
 $0 -h 10.1.1.1 -p 32141 # 前台运行(Ctrl+C可停止)
 $0 stop # 停止后台监控
EOF
  exit 0
}
# Parse command-line arguments into the global configuration variables.
#
# Arguments: the script's argument list ("$@").
# Globals written: DB_HOST, DB_PORT, CHECK_INTERVAL, LOG_FILE, PID_FILE,
#                  COMMAND (one of start|stop|status|restart|log).
# Exits: 0 after printing the help text for --help;
#        1 for an unknown argument or an option that lacks its value.
#
# Note that -h means --host here, not help.
parse_arguments() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--host|-p|--port|-i|--interval|-l|--log|--pid)
        # Every one of these options consumes a value. Without this guard
        # a trailing option made `shift 2` fail, leaving $# unchanged and
        # the loop spinning forever.
        if [[ $# -lt 2 ]]; then
          echo "错误: 选项 $1 需要一个参数值" >&2
          exit 1
        fi
        case "$1" in
          -h|--host)     DB_HOST="$2" ;;
          -p|--port)     DB_PORT="$2" ;;
          -i|--interval) CHECK_INTERVAL="$2" ;;
          -l|--log)      LOG_FILE="$2" ;;
          --pid)         PID_FILE="$2" ;;
        esac
        shift 2
        ;;
      --help)
        show_usage
        ;;
      start|stop|status|restart|log)
        COMMAND="$1"
        shift
        ;;
      *)
        echo "错误: 未知参数 $1" >&2
        # show_usage exits 0 on its own; run it in a subshell so that a
        # usage error is still reported with a non-zero status.
        ( show_usage ) >&2
        exit 1
        ;;
    esac
  done
}
# Validate the global configuration and exit with status 1 on the first
# invalid value.
#
# Globals read: DB_HOST, DB_PORT, CHECK_INTERVAL, LOG_FILE.
# Checks:
#   - DB_HOST is a dotted-quad IPv4 address with every octet in 0..255
#   - DB_PORT is an integer in 1..65535
#   - CHECK_INTERVAL is a positive integer
#   - the log directory is writable; if not, a warning is printed and an
#     attempt is made to create it
# Diagnostics are written to stderr.
validate_parameters() {
  local ipv4_re='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
  local port_re='^0*([1-9][0-9]{0,4})$'
  local interval_re='^0*[1-9][0-9]*$'
  local host_ok=false octet log_dir

  # IPv4 format plus per-octet range check.
  if [[ $DB_HOST =~ $ipv4_re ]]; then
    host_ok=true
    for octet in "${BASH_REMATCH[@]:1}"; do
      # 10# forces base 10 so that octets such as "08" are not read as octal.
      (( 10#$octet <= 255 )) || host_ok=false
    done
  fi
  if [[ $host_ok != true ]]; then
    echo "错误: 无效的IP地址格式: $DB_HOST" >&2
    exit 1
  fi

  # Port range. The regex caps the significant digits at five, so the
  # numeric comparison cannot overflow on absurdly long input.
  if ! [[ $DB_PORT =~ $port_re ]] || (( BASH_REMATCH[1] > 65535 )); then
    echo "错误: 无效的端口号: $DB_PORT" >&2
    exit 1
  fi

  # Interval: any positive integer (leading zeros tolerated).
  if ! [[ $CHECK_INTERVAL =~ $interval_re ]]; then
    echo "错误: 无效的检查间隔: $CHECK_INTERVAL" >&2
    exit 1
  fi

  # Log directory: warn when it is not writable and try to create it.
  log_dir=$(dirname -- "$LOG_FILE")
  if [ ! -w "$log_dir" ]; then
    echo "警告: 日志目录可能不可写: $log_dir" >&2
    echo "尝试创建目录..." >&2
    mkdir -p -- "$log_dir" 2>/dev/null || {
      echo "错误: 无法创建日志目录" >&2
      exit 1
    }
  fi
}
# Signal handler: append a timestamped shutdown line to LOG_FILE, print a
# coloured notice unless DAEMON_MODE is not "false", remove PID_FILE and
# exit the script with status 0.
cleanup() {
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[$now] 收到停止信号,正在关闭监控..." >> "$LOG_FILE"
  # %b expands the backslash escapes held in the colour variables.
  [ "$DAEMON_MODE" = false ] && printf '%b\n' "\n${YELLOW}正在停止监控...${NC}"
  rm -f "$PID_FILE"
  exit 0
}
# Refuse to start when a monitor is already running.
# If PID_FILE exists and `ps -p` finds the recorded PID, print an error and
# exit 1. If the recorded process is gone, delete the stale PID file.
# Does nothing when PID_FILE does not exist.
check_pid() {
  [ -f "$PID_FILE" ] || return 0

  local recorded_pid
  recorded_pid=$(cat "$PID_FILE")

  if ! ps -p "$recorded_pid" > /dev/null 2>&1; then
    # Stale file left behind by a process that no longer exists.
    rm -f "$PID_FILE"
    return
  fi

  echo "错误: 监控进程已在运行 (PID: $recorded_pid)"
  echo "使用: $0 stop 来停止运行中的进程"
  exit 1
}
# Record the PID of the running script ($$) in PID_FILE, replacing any
# previous content.
create_pid_file() {
  printf '%s\n' "$$" > "$PID_FILE"
}
# Create the directories that hold LOG_FILE and PID_FILE, then append a
# seven-line start-up header (start time, target host/port, interval, log
# path, PID and run mode) to LOG_FILE.
init_logging() {
  local path mode_label

  for path in "$LOG_FILE" "$PID_FILE"; do
    mkdir -p "$(dirname "$path")"
  done

  if [ "$DAEMON_MODE" = true ]; then
    mode_label="后台守护进程"
  else
    mode_label="前台模式"
  fi

  # One grouped redirection appends the whole header.
  {
    echo "=== 达梦数据库监控启动于 $(date) ==="
    echo "目标主机: $DB_HOST"
    echo "目标端口: $DB_PORT"
    echo "检查间隔: ${CHECK_INTERVAL}秒"
    echo "日志文件: $LOG_FILE"
    echo "进程PID: $$"
    echo "运行模式: $mode_label"
  } >> "$LOG_FILE"
}
# Append one line of the form "[YYYY-MM-DD HH:MM:SS] [LEVEL] message" to
# LOG_FILE.
#   $1 - level tag (e.g. NETWORK, PORT, STATUS, ALERT)
#   $2 - message text
log() {
  printf '[%s] [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" "$2" >> "$LOG_FILE"
}
# 1. Basic network check: send a single ping to DB_HOST (one-second -W
# timeout). Logs the outcome under the NETWORK tag and, when DAEMON_MODE is
# "false", prints a status marker without a trailing newline.
# Returns 0 when the ping succeeds, 1 otherwise.
check_basic_network() {
  local rc=0
  local marker="✅ Ping正常 | "
  local verdict="成功"

  if ! ping -c 1 -W 1 "$DB_HOST" &>/dev/null; then
    rc=1
    marker="❌ Ping失败 | "
    verdict="失败"
  fi

  [ "$DAEMON_MODE" = false ] && echo -n "$marker"
  log "NETWORK" "Ping检查: ${verdict} - 主机 $DB_HOST"
  return "$rc"
}
# 2. Port connectivity check: probe DB_HOST:DB_PORT with netcat in scan
# mode (-z) and a one-second timeout. Logs the outcome under the PORT tag
# and, when DAEMON_MODE is "false", prints a one-line status.
# Returns 0 when the port accepts the connection, 1 otherwise.
check_port_connectivity() {
  local rc=0
  local icon="✅"
  local state="正常"
  local verdict="成功"

  nc -zv -w 1 "$DB_HOST" "$DB_PORT" &>/dev/null || {
    rc=1
    icon="❌"
    state="失败"
    verdict="失败"
  }

  [ "$DAEMON_MODE" = false ] && echo "${icon} 端口${DB_PORT}${state}"
  log "PORT" "端口检查: ${verdict} - $DB_HOST:$DB_PORT"
  return "$rc"
}
# Run one monitoring cycle: the ping check followed (only if it passed) by
# the port check.
# Updates the global counters check_count, total_success, total_failures
# and consecutive_failures, logs the result under STATUS, logs an ALERT
# once three or more cycles in a row have failed, and logs a PERF line when
# the cycle took more than one second. In foreground mode the cycle number
# is printed as a prefix for the markers the two checks print.
perform_check() {
  local began elapsed
  began=$(date +%s)

  (( check_count += 1 ))
  [ "$DAEMON_MODE" = false ] && echo -n "[#$check_count] "

  if ! { check_basic_network && check_port_connectivity; }; then
    (( consecutive_failures += 1 ))
    (( total_failures += 1 ))
    log "STATUS" "检查失败: 发现异常"
    # Alert on a run of consecutive failures.
    if (( consecutive_failures >= 3 )); then
      log "ALERT" "连续 $consecutive_failures 次检查失败"
    fi
  else
    consecutive_failures=0
    (( total_success += 1 ))
    log "STATUS" "检查通过: 所有项目正常"
  fi

  elapsed=$(( $(date +%s) - began ))
  if (( elapsed > 1 )); then
    log "PERF" "检查耗时较长: ${elapsed}秒"
  fi
}
# Print a statistics summary to stdout (foreground mode only; does nothing
# when DAEMON_MODE is not "false").
#
# Globals read: start_time, check_count, total_success, total_failures,
#               consecutive_failures, DB_HOST, DB_PORT, LOG_FILE, BLUE, NC.
#
# The run time is formatted as HH:MM:SS with plain arithmetic rather than
# `date -u -d @N +%T`: the date form wraps around after 24 hours and relies
# on the GNU-only -d option.
show_statistics() {
  [ "$DAEMON_MODE" = false ] || return 0

  local now run_time elapsed
  local success_rate=0

  now=$(date +%s)
  run_time=$(( now - start_time ))
  # Guard against a negative duration if the system clock was set back.
  (( run_time >= 0 )) || run_time=0

  if (( check_count > 0 )); then
    success_rate=$(( total_success * 100 / check_count ))
  fi

  printf -v elapsed '%02d:%02d:%02d' \
    $(( run_time / 3600 )) $(( run_time % 3600 / 60 )) $(( run_time % 60 ))

  echo -e "\n${BLUE}=== 监控统计 ===${NC}"
  echo -e "目标: $DB_HOST:$DB_PORT"
  echo -e "运行: $elapsed"
  echo -e "检查: $check_count 次"
  echo -e "成功: $total_success 次"
  echo -e "失败: $total_failures 次"
  echo -e "成功率: ${success_rate}%"
  echo -e "连续失败: $consecutive_failures 次"
  echo -e "日志文件: $LOG_FILE"
}
# Launch the monitor as a detached background process.
#
# Prints the effective configuration and the management commands, refuses
# to start when check_pid finds a live instance, then re-executes this
# script with --daemon under nohup + setsid. After two seconds it checks
# with `ps` that the child is still alive; exits 1 if it is not.
#
# Fixes relative to the earlier version:
#   - --pid is forwarded, so a custom PID file is honoured by the daemon
#     and later found by `stop` / `status`;
#   - the script is run through the current bash binary instead of being
#     executed directly, so starting works when the script was invoked as
#     `bash dameng_monitor.sh` (no execute bit, name not on PATH).
start_daemon() {
  echo "启动后台守护进程..."
  echo "目标: $DB_HOST:$DB_PORT"
  echo "间隔: ${CHECK_INTERVAL}秒"
  echo "日志: $LOG_FILE"
  echo "PID文件: $PID_FILE"
  echo ""
  echo "注意: 守护进程将在后台运行,不受Ctrl+C影响"
  echo ""
  echo "管理命令:"
  echo " $0 stop # 停止监控"
  echo " $0 status # 查看状态"
  echo " $0 log # 查看日志"
  echo ""

  # Abort if an instance is already running.
  check_pid

  # setsid gives the daemon its own session, detaching it from the
  # controlling terminal; nohup shields it from SIGHUP.
  nohup setsid "${BASH:-bash}" "$0" --daemon \
    -h "$DB_HOST" -p "$DB_PORT" -i "$CHECK_INTERVAL" \
    -l "$LOG_FILE" --pid "$PID_FILE" > /dev/null 2>&1 &
  local daemon_pid=$!

  # Give the child a moment to fail fast (bad parameters, unwritable log).
  sleep 2

  if ps -p "$daemon_pid" > /dev/null 2>&1; then
    echo "✅ 守护进程启动成功 (PID: $daemon_pid)"
    echo "📋 使用 '$0 status' 查看运行状态"
  else
    echo "❌ 守护进程启动失败"
    exit 1
  fi
}
# Daemon main loop (not affected by Ctrl+C).
#
# Sets DAEMON_MODE=true, writes the log header and the PID file, then runs
# perform_check every CHECK_INTERVAL seconds forever. Every 50 checks a
# STATISTICS line is logged.
#
# Signal policy: SIGINT and SIGHUP are ignored so the terminal cannot stop
# the daemon. SIGTERM is routed to cleanup, which logs the shutdown, removes
# the PID file and exits. Ignoring SIGTERM as well (the earlier behaviour)
# meant the daemon could only be ended with SIGKILL, and because ignored
# signals are inherited by child processes, the ping/nc/sleep children
# ignored it too.
#
# The run time is formatted with arithmetic instead of `date -u -d @N +%T`,
# which wraps after 24 hours and needs GNU date.
daemon_main_loop() {
  DAEMON_MODE=true
  trap '' SIGINT SIGHUP
  trap cleanup SIGTERM

  init_logging
  create_pid_file
  log "DAEMON" "守护进程启动 - 目标: $DB_HOST:$DB_PORT, 间隔: ${CHECK_INTERVAL}秒"

  local success_rate run_time uptime
  while true; do
    perform_check

    # Periodic statistics record.
    if (( check_count % 50 == 0 )); then
      success_rate=0
      if (( check_count > 0 )); then
        success_rate=$(( total_success * 100 / check_count ))
      fi
      run_time=$(( $(date +%s) - start_time ))
      (( run_time >= 0 )) || run_time=0
      printf -v uptime '%02d:%02d:%02d' \
        $(( run_time / 3600 )) $(( run_time % 3600 / 60 )) $(( run_time % 60 ))
      log "STATISTICS" "运行统计 - 次数: $check_count, 成功率: ${success_rate}%, 运行时间: $uptime"
    fi

    # bash runs the SIGTERM trap once this foreground sleep returns.
    sleep "$CHECK_INTERVAL"
  done
}
# Foreground main loop (stoppable with Ctrl+C).
# Installs cleanup as the SIGINT/SIGTERM handler, sets DAEMON_MODE=false,
# writes the log header and PID file, prints a coloured banner, then runs
# perform_check every CHECK_INTERVAL seconds. After every 20th check the
# statistics summary is printed, followed by an empty line.
foreground_main_loop() {
  trap cleanup SIGINT SIGTERM
  DAEMON_MODE=false
  init_logging
  create_pid_file

  local banner_line
  local -a banner=(
    "${GREEN}🚀 启动达梦数据库监控(前台模式)${NC}"
    "目标: ${YELLOW}$DB_HOST:$DB_PORT${NC}"
    "间隔: ${YELLOW}${CHECK_INTERVAL}秒${NC}"
    "日志: ${YELLOW}$LOG_FILE${NC}"
    "提示: ${RED}Ctrl+C${NC} 可以停止此监控"
    "${BLUE}────────────────────────────────${NC}"
  )
  # %b expands the backslash escapes stored in the colour variables.
  for banner_line in "${banner[@]}"; do
    printf '%b\n' "$banner_line"
  done

  while :; do
    perform_check
    # Periodic statistics display.
    if (( check_count % 20 == 0 )); then
      show_statistics
      echo ""
    fi
    sleep "$CHECK_INTERVAL"
  done
}
# Print the effective configuration and whether the monitor is running.
#
# When PID_FILE names a live process, its PID, start time and owner are
# shown, followed by the three most recent STATUS/ALERT log lines. A stale
# PID file is reported and removed.
#
# The recent lines are taken from the last 50 lines of the log. The earlier
# `tail -5` window could never hold three STATUS lines, because every check
# cycle also writes NETWORK and PORT lines.
show_status() {
  echo "监控配置:"
  echo " 目标主机: $DB_HOST"
  echo " 目标端口: $DB_PORT"
  echo " 检查间隔: ${CHECK_INTERVAL}秒"
  echo " 日志文件: $LOG_FILE"
  echo " PID文件: $PID_FILE"
  echo ""

  if [ ! -f "$PID_FILE" ]; then
    echo "监控状态: 🔴 未运行"
    return 0
  fi

  local pid
  pid=$(cat "$PID_FILE" 2>/dev/null)

  # An empty or unreadable PID file is treated as stale.
  if [ -n "$pid" ] && ps -p "$pid" > /dev/null 2>&1; then
    echo "监控状态: 🟢 运行中(守护进程)"
    echo "进程PID: $pid"
    echo "启动时间: $(ps -p "$pid" -o lstart= 2>/dev/null || echo "未知")"
    echo "运行用户: $(ps -p "$pid" -o user= 2>/dev/null || echo "未知")"
    # Recent check results.
    if [ -f "$LOG_FILE" ]; then
      echo ""
      echo "最近状态:"
      tail -n 50 "$LOG_FILE" | grep -E "(STATUS|ALERT)" | tail -n 3
    fi
  else
    echo "监控状态: 🔴 未运行(残留PID文件)"
    rm -f "$PID_FILE"
  fi
}
# Stop the monitor recorded in PID_FILE.
#
# Sends SIGTERM, waits up to ten seconds for the process to disappear,
# falls back to SIGKILL, then removes the PID file.
#
# The process group is signalled only when the monitor leads its own group
# (the case for a daemon started under setsid). Otherwise just the single
# PID is signalled, so that stopping a monitor launched from another script
# cannot take down unrelated processes sharing that script's group. A PID
# file whose content is not a plain number is treated as stale.
stop_monitor() {
  echo "监控配置: $DB_HOST:$DB_PORT"
  echo ""

  if [ ! -f "$PID_FILE" ]; then
    echo "监控未运行(未找到PID文件)"
    return 0
  fi

  local pid pgid attempt
  pid=$(cat "$PID_FILE" 2>/dev/null)

  if ! [[ $pid =~ ^[0-9]+$ ]] || ! ps -p "$pid" > /dev/null 2>&1; then
    echo "进程不存在,清理PID文件"
    rm -f "$PID_FILE"
    return 0
  fi

  echo "停止监控守护进程 (PID: $pid)..."
  pgid=$(ps -o pgid= -p "$pid" 2>/dev/null | tr -d '[:space:]')
  if [[ -n $pgid && $pgid == "$pid" ]]; then
    # Group leader: terminate the whole group (monitor plus its children).
    kill -- "-$pgid" 2>/dev/null || kill "$pid" 2>/dev/null
  else
    kill "$pid" 2>/dev/null
  fi

  # Wait for the process to exit.
  for attempt in {1..10}; do
    ps -p "$pid" > /dev/null 2>&1 || break
    echo "等待进程结束... ($attempt/10)"
    sleep 1
  done

  # Force-stop if it is still alive after the grace period.
  if ps -p "$pid" > /dev/null 2>&1; then
    echo "强制停止进程..."
    kill -9 "$pid" 2>/dev/null
  fi

  rm -f "$PID_FILE"
  echo "✅ 监控守护进程已停止"
}
# Program entry point.
# With --daemon as the first argument (used internally by start_daemon) the
# remaining arguments are parsed and validated and the daemon loop is
# entered. Otherwise the arguments are parsed and validated and the
# requested command is dispatched; with no command the monitor runs in the
# foreground.
main() {
  # Internal daemon mode.
  case "${1-}" in
    --daemon)
      shift
      parse_arguments "$@"
      validate_parameters
      daemon_main_loop
      exit 0
      ;;
  esac

  # Normal mode.
  parse_arguments "$@"
  validate_parameters

  case "${COMMAND:-}" in
    start)
      start_daemon
      ;;
    stop)
      stop_monitor
      ;;
    status)
      show_status
      ;;
    restart)
      stop_monitor
      sleep 2
      start_daemon
      ;;
    log)
      if [ ! -f "$LOG_FILE" ]; then
        echo "日志文件不存在: $LOG_FILE"
      else
        echo "实时日志: $LOG_FILE"
        echo "按 Ctrl+C 退出日志查看"
        echo "────────────────────────────────"
        tail -f "$LOG_FILE"
      fi
      ;;
    *)
      # No command: run in the foreground (Ctrl+C stops it).
      check_pid
      foreground_main_loop
      ;;
  esac
}
# Script entry: hand all command-line arguments to main.
main "$@"
# (web page residue, not part of the script: 文章 / 阅读量 / 获赞)
