Files
myaps_api/apps/data_opt/utils/binlog_ha/failover_manager.py
T
chaoge 78269d8d74 新增: binlog监听器高可用增强模块
实现功能:
- Prometheus指标暴露(/metrics端点)
- 背压控制(主动限流机制)
- 事件去重(Redis+文件降级)
- 配置热更新(审计日志)
- 主备故障转移(心跳检测)
- 分布式锁安全降级
- 连接池监控(泄漏检测)
- 重试策略优化(指数退避)

新增接口:
- GET /metrics, /health, /binlog/status
- GET /binlog/backpressure/status
- POST /binlog/config/update, GET /binlog/config/audit
- GET /binlog/dedup/stats
- GET /binlog/failover/status

测试覆盖:
- 单元测试71个,全部通过
- 压测验证:吞吐量499事件/秒,达标率99.91%
2026-05-22 07:08:49 +08:00

328 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Binlog listener - master/standby failover manager.

Provides master election, heartbeat maintenance, and failover.
"""
import os
import time
import json
import threading
import socket
from typing import Optional, Dict, Any
from datetime import datetime, timezone
from enum import Enum
from .models import ListenerRole
from .enhanced_lock import EnhancedDistributedLock, LockResult
from .prometheus_metrics import prometheus_metrics
from globalobjects import logger
class FailoverManager:
    """Master/standby failover manager for the binlog listener.

    Election goes through a distributed lock. The node that obtains the lock
    becomes master and periodically writes a heartbeat record to Redis; a node
    that fails to obtain it becomes a standby, polls that heartbeat, and tries
    to take over once the heartbeat is missing or stale.
    """

    HEARTBEAT_KEY = "binlog:heartbeat"
    MASTER_LOCK_NAME = "binlog:master_lock"

    def __init__(
        self,
        heartbeat_interval: int = 5,
        heartbeat_timeout: int = 30,
        lock_ttl: int = 30
    ):
        """Initialise the failover manager.

        Args:
            heartbeat_interval: Seconds between heartbeat writes (master) and
                between heartbeat checks (standby).
            heartbeat_timeout: Heartbeat age, in seconds, above which the
                master is treated as unhealthy.
            lock_ttl: TTL in seconds passed to the master lock.
        """
        self.heartbeat_interval = heartbeat_interval
        self.heartbeat_timeout = heartbeat_timeout
        self.lock_ttl = lock_ttl
        self._role = ListenerRole.STANDALONE
        self._lock = EnhancedDistributedLock(
            lock_name=self.MASTER_LOCK_NAME,
            ttl=lock_ttl
        )
        self._heartbeat_thread: Optional[threading.Thread] = None
        self._monitor_thread: Optional[threading.Thread] = None
        # Shared by both background loops; set once by stop().
        self._stop_event = threading.Event()
        self._failover_count = 0
        self._last_heartbeat = 0.0
        self._promoted_time: Optional[float] = None

    def _get_redis_client(self):
        """Return a Redis client from the pool manager, or None on any failure."""
        try:
            from apps.common.utils.redis_pool_manager import get_redis_pool_manager
            pool_manager = get_redis_pool_manager()
            return pool_manager.get_client()
        except Exception as e:
            logger.warning(f"⚠️ 获取Redis客户端失败: {e}")
            return None

    @staticmethod
    def _decode_heartbeat(raw) -> Dict[str, Any]:
        """Parse a heartbeat payload read from Redis into a dict.

        Accepts ``bytes``/``bytearray`` as well as ``str`` so the result does
        not depend on whether the Redis client decodes responses itself.

        Raises:
            ValueError: If the payload is not valid JSON or is not a JSON
                object (``json.JSONDecodeError`` is a ``ValueError``).
        """
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        payload = json.loads(raw)
        if not isinstance(payload, dict):
            raise ValueError(
                f"unexpected heartbeat payload type: {type(payload).__name__}"
            )
        return payload

    def acquire_master_role(self) -> bool:
        """Compete for the master role.

        Tries to acquire the master lock. On success the node becomes master
        and the heartbeat thread is started; otherwise it becomes a standby
        and the monitor thread is started.

        Returns:
            True if this node became master, False if it became a standby.
        """
        result = self._lock.acquire()
        if result.success:
            self._role = ListenerRole.MASTER
            self._promoted_time = time.time()
            prometheus_metrics.set_listener_role("master")
            self._start_heartbeat_thread()
            logger.success(
                "主节点选举",
                "FailoverManager",
                f"已升级为主节点 (mode={result.mode.value})"
            )
            return True

        self._role = ListenerRole.SLAVE
        prometheus_metrics.set_listener_role("slave")
        self._start_monitor_thread()
        logger.info(
            f"⏳ 已降级为备节点 (reason={result.reason})"
        )
        return False

    def _start_heartbeat_thread(self):
        """Start the heartbeat thread (master only); no-op if one already exists."""
        if self._heartbeat_thread is not None:
            return

        def heartbeat_loop():
            while not self._stop_event.is_set():
                try:
                    self.update_heartbeat()
                except Exception as e:
                    logger.error(f"心跳更新失败: {e}")
                # Wait outside the try block so that an iteration that raised
                # is still paced instead of spinning in a tight loop.
                self._stop_event.wait(self.heartbeat_interval)

        self._heartbeat_thread = threading.Thread(
            target=heartbeat_loop,
            daemon=True,
            name='binlog-heartbeat'
        )
        self._heartbeat_thread.start()
        logger.info("✅ 心跳线程已启动(主节点)")

    def _start_monitor_thread(self):
        """Start the monitor thread (standby only); no-op if one already exists."""
        if self._monitor_thread is not None:
            return

        def monitor_loop():
            while not self._stop_event.is_set():
                # Promotion may also be triggered from another thread; once
                # this node is master there is nothing left to monitor.
                if self._role == ListenerRole.MASTER:
                    break
                try:
                    if not self.monitor_master():
                        logger.warning("⚠️ 主节点心跳超时,尝试接管...")
                        if self.promote_to_master():
                            break
                except Exception as e:
                    logger.error(f"主节点监控失败: {e}")
                # Paced on every iteration, including ones that raised.
                self._stop_event.wait(self.heartbeat_interval)

        self._monitor_thread = threading.Thread(
            target=monitor_loop,
            daemon=True,
            name='binlog-monitor'
        )
        self._monitor_thread.start()
        logger.info("✅ 监控线程已启动(备节点)")

    def update_heartbeat(self):
        """Write the heartbeat record to Redis (master only).

        Redis storage:
            - Key: ``HEARTBEAT_KEY``
            - Value: JSON object with ``timestamp``, ``pid``, ``hostname``,
              ``role`` and ``promoted_at``
            - TTL: ``heartbeat_timeout * 2`` seconds

        Failures are logged and swallowed; nothing is written when no Redis
        client is available.
        """
        try:
            client = self._get_redis_client()
            if not client:
                return
            now = time.time()
            heartbeat_data = {
                "timestamp": now,
                "pid": os.getpid(),
                "hostname": socket.gethostname(),
                "role": "master",
                "promoted_at": self._promoted_time
            }
            ttl = self.heartbeat_timeout * 2
            client.setex(
                self.HEARTBEAT_KEY,
                ttl,
                json.dumps(heartbeat_data)
            )
            # Record the same instant that was published.
            self._last_heartbeat = now
            prometheus_metrics.set_heartbeat_delay(0)
            logger.debug(f"💓 心跳已更新: timestamp={heartbeat_data['timestamp']}")
        except Exception as e:
            logger.warning(f"⚠️ 心跳更新失败: {e}")

    def monitor_master(self) -> bool:
        """Check the master's heartbeat (standby only).

        Returns:
            False when the heartbeat key is missing or older than
            ``heartbeat_timeout``. True when it is fresh, and also when Redis
            is unavailable or the check itself fails, so that a problem on
            this node's side does not trigger a takeover.
        """
        try:
            client = self._get_redis_client()
            if not client:
                return True
            data = client.get(self.HEARTBEAT_KEY)
            if not data:
                logger.warning("⚠️ 主节点心跳不存在")
                return False
            heartbeat = self._decode_heartbeat(data)
            last_heartbeat = heartbeat.get("timestamp", 0)
            delay = time.time() - last_heartbeat
            prometheus_metrics.set_heartbeat_delay(delay)
            if delay > self.heartbeat_timeout:
                logger.warning(
                    f"⚠️ 主节点心跳超时: delay={delay:.1f}s, "
                    f"timeout={self.heartbeat_timeout}s, "
                    f"master_pid={heartbeat.get('pid')}, "
                    f"master_host={heartbeat.get('hostname')}"
                )
                return False
            logger.debug(
                f"✅ 主节点健康: delay={delay:.1f}s, "
                f"master_pid={heartbeat.get('pid')}"
            )
            return True
        except Exception as e:
            logger.warning(f"⚠️ 主节点心跳检查失败: {e}")
            return True

    def promote_to_master(self) -> bool:
        """Promote this node to master.

        Tries to acquire the master lock. On success it switches the role to
        master, bumps the failover counter and metrics, drops the reference
        to the monitor thread and starts the heartbeat thread.

        Returns:
            True if the promotion succeeded, False if the lock could not be
            acquired.
        """
        logger.info("🔄 开始故障转移...")
        result = self._lock.acquire()
        if not result.success:
            logger.warning("⚠️ 故障转移失败:无法获取锁")
            return False

        self._role = ListenerRole.MASTER
        self._promoted_time = time.time()
        self._failover_count += 1
        prometheus_metrics.set_listener_role("master")
        prometheus_metrics.inc_failover_count()
        # The stop event is deliberately left alone: toggling it here could
        # undo a concurrent stop(). The monitor loop ends by itself, either
        # through its caller's break or by seeing the MASTER role.
        self._monitor_thread = None
        self._start_heartbeat_thread()
        logger.success(
            "故障转移",
            "FailoverManager",
            f"已成功升级为主节点 (failover_count={self._failover_count})"
        )
        return True

    def get_role(self) -> "ListenerRole":
        """Return the current role of this node."""
        return self._role

    def get_master_info(self) -> Optional[Dict[str, Any]]:
        """Return the master's heartbeat record (standby only).

        Returns:
            The decoded heartbeat dict with an extra ``delay`` key (seconds
            since the recorded timestamp), or None when Redis is unavailable,
            the key is missing, or reading/parsing fails.
        """
        try:
            client = self._get_redis_client()
            if not client:
                return None
            data = client.get(self.HEARTBEAT_KEY)
            if not data:
                return None
            heartbeat = self._decode_heartbeat(data)
            heartbeat["delay"] = time.time() - heartbeat.get("timestamp", 0)
            return heartbeat
        except Exception as e:
            logger.warning(f"⚠️ 获取主节点信息失败: {e}")
            return None

    def get_failover_count(self) -> int:
        """Return how many times this node has been promoted through failover."""
        return self._failover_count

    def stop(self):
        """Stop the background threads and release the master lock."""
        self._stop_event.set()
        # Each join is bounded so shutdown cannot hang on a stuck thread.
        if self._heartbeat_thread and self._heartbeat_thread.is_alive():
            self._heartbeat_thread.join(timeout=5)
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=5)
        self._lock.release()
        logger.info("🛑 故障转移管理器已停止")
# Module-level shared instance, created at import time with the default settings.
failover_manager = FailoverManager()