|
| 1 | +# healthcheck — Pending 告警扫描与分发任务 |
| 2 | + |
| 3 | +本包提供一个定时任务: |
| 4 | +- 周期性扫描 Pending 状态的告警 |
| 5 | +- 将告警投递到消息队列(供下游处理器消费) |
| 6 | +- 成功投递后,原子地更新缓存中的状态: |
| 7 | + - `alert:issue:{id}` 的 `alertState`:Pending → InProcessing |
| 8 | + - `service_state:{service}:{version}` 的 `health_state`:由告警等级推导(P0→Error;P1/P2→Warning) |
| 9 | + |
| 10 | +此任务默认只更新缓存,不直接更新数据库,以降低耦合并避免与业务处理竞争。数据库状态可由下游处理器在处理开始时回写,或由后续补偿任务兜底。 |
| 11 | + |
| 12 | +—— |
| 13 | + |
| 14 | +## 1. 触发与频率 |
| 15 | + |
| 16 | +- 间隔:默认每 10s 扫描一次(可配置) |
| 17 | +- 批量:每次最多处理 200 条 Pending(可配置) |
| 18 | +- 并发:串行或小并发(<= 4),避免重复投递 |
| 19 | + |
| 20 | +环境变量建议: |
| 21 | +``` |
| 22 | +HC_SCAN_INTERVAL=10s |
| 23 | +HC_SCAN_BATCH=200 |
| 24 | +HC_WORKERS=1 |
| 25 | +``` |
| 26 | + |
| 27 | +—— |
| 28 | + |
| 29 | +## 2. 数据来源与过滤 |
| 30 | + |
| 31 | +优先以数据库为准,结合缓存加速: |
| 32 | + |
| 33 | +- 数据库查询(推荐) |
| 34 | + ```sql |
| 35 | + SELECT id, level, title, labels, alert_since |
| 36 | + FROM alert_issues |
| 37 | + WHERE alert_state = 'Pending' |
| 38 | + ORDER BY alert_since ASC |
| 39 | + LIMIT $1; |
| 40 | + ``` |
| 41 | + |
| 42 | +当告警切换为 InProcessing 时,需要将对应的 `service_states.report_at` 更新为:该 service/version 关联的 `alert_issue_ids` 中,所有 `alert_state = 'InProcessing'` 的告警里最早的 `alert_since`(取 min)。可通过下游处理器或本任务的补充逻辑回填: |
| 43 | + |
| 44 | +```sql |
| 45 | +UPDATE service_states ss |
| 46 | +SET report_at = sub.min_since |
| 47 | +FROM ( |
| 48 | + SELECT si.service, si.version, MIN(ai.alert_since) AS min_since |
| 49 | + FROM service_states si |
| 50 | + JOIN alert_issues ai ON ai.id = ANY(si.alert_issue_ids) |
| 51 | + WHERE ai.alert_state = 'InProcessing' |
| 52 | + GROUP BY si.service, si.version |
| 53 | +) AS sub |
| 54 | +WHERE ss.service = sub.service AND ss.version = sub.version; |
| 55 | +``` |
| 56 | + |
| 57 | +- 或仅用缓存(可选): |
| 58 | + - 维护集合 `alert:index:alert_state:Pending`(若未维护,可临时 SCAN `alert:issue:*` 并过滤 JSON 中的 `alertState`,但不推荐在大规模下使用 SCAN)。 |
| 59 | + |
| 60 | +—— |
| 61 | + |
| 62 | +## 3. 消息队列 |
| 63 | + |
| 64 | +抽象接口: |
| 65 | +```go |
| 66 | +type AlertQueue interface { |
| 67 | + PublishAlert(ctx context.Context, msg AlertMessage) error |
| 68 | +} |
| 69 | + |
| 70 | +type AlertMessage struct { |
| 71 | + ID string `json:"id"` |
| 72 | + Service string `json:"service"` |
| 73 | + Version string `json:"version,omitempty"` |
| 74 | + Level string `json:"level"` |
| 75 | + Title string `json:"title"` |
| 76 | + AlertSince time.Time `json:"alert_since"` |
| 77 | + Labels map[string]string `json:"labels"` |
| 78 | +} |
| 79 | +``` |
| 80 | + |
| 81 | +实现可选:Kafka、NATS、SQS、Redis Stream(示例): |
| 82 | +```go |
| 83 | +// Redis Stream 样例 |
| 84 | +func (q *RedisStreamQueue) PublishAlert(ctx context.Context, m AlertMessage) error { |
| 85 | + b, _ := json.Marshal(m) |
| 86 | + return q.r.XAdd(ctx, &redis.XAddArgs{Stream: q.stream, Values: map[string]any{"data": b}}).Err() |
| 87 | +} |
| 88 | +``` |
| 89 | + |
| 90 | +环境变量建议: |
| 91 | +``` |
| 92 | +ALERT_QUEUE_KIND=redis_stream|kafka|nats |
| 93 | +ALERT_QUEUE_DSN=redis://localhost:6379/0 |
| 94 | +ALERT_QUEUE_TOPIC=alerts.pending |
| 95 | +``` |
| 96 | + |
| 97 | +—— |
| 98 | + |
| 99 | +## 4. 缓存键与原子更新 |
| 100 | + |
| 101 | +现有(或建议)键: |
| 102 | +- 告警:`alert:issue:{id}` → JSON,字段包含 `alertState` |
| 103 | +- 索引(可选):`alert:index:alert_state:{Pending|InProcessing|...}` |
| 104 | +- 服务态:`service_state:{service}:{version}` → JSON,字段包含 `health_state` |
| 105 | +- 索引:`service_state:index:health:{Error|Warning|...}` |
| 106 | + |
| 107 | +为避免并发写冲突,建议使用 Lua CAS(Compare-And-Set)脚本原子修改值与索引(脚本中的 `SET ... KEEPTTL` 需要 Redis 6.0 及以上版本): |
| 108 | + |
| 109 | +```lua |
| 110 | +-- KEYS[1] = alert key, ARGV[1] = expected, ARGV[2] = next, KEYS[2] = idx:old, KEYS[3] = idx:new, ARGV[3] = id |
| 111 | +local v = redis.call('GET', KEYS[1]) |
| 112 | +if not v then return 0 end |
| 113 | +local obj = cjson.decode(v) |
| 114 | +if obj.alertState ~= ARGV[1] then return -1 end |
| 115 | +obj.alertState = ARGV[2] |
| 116 | +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') |
| 117 | +if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[3]) end |
| 118 | +if KEYS[3] ~= '' then redis.call('SADD', KEYS[3], ARGV[3]) end |
| 119 | +return 1 |
| 120 | +``` |
| 121 | + |
| 122 | +服务态类似(示例将 `health_state` 切换为按告警等级推导出的新状态): |
| 123 | +```lua |
| 124 | +-- KEYS[1] = service_state key, ARGV[1] = expected(optional), ARGV[2] = next, KEYS[2] = idx:old(optional), KEYS[3] = idx:new, ARGV[3] = member |
| 125 | +local v = redis.call('GET', KEYS[1]) |
| 126 | +if not v then return 0 end |
| 127 | +local obj = cjson.decode(v) |
| 128 | +if ARGV[1] ~= '' and obj.health_state ~= ARGV[1] then return -1 end |
| 129 | +obj.health_state = ARGV[2] |
| 130 | +redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') |
| 131 | +if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[3]) end |
| 132 | +if KEYS[3] ~= '' then redis.call('SADD', KEYS[3], ARGV[3]) end |
| 133 | +return 1 |
| 134 | +``` |
| 135 | + |
| 136 | +—— |
| 137 | + |
| 138 | +## 5. 任务流程(伪代码) |
| 139 | + |
| 140 | +```go |
| 141 | +func runOnce(ctx context.Context, db *Database, rdb *redis.Client, q AlertQueue, batch int) error { |
| 142 | + rows := queryPendingFromDB(ctx, db, batch) // id, level, title, labels(JSON), alert_since |
| 143 | + for _, it := range rows { |
| 144 | + svc := it.Labels["service"] |
| 145 | + ver := it.Labels["service_version"] |
| 146 | + // 1) 投递消息 |
| 147 | + if err := q.PublishAlert(ctx, AlertMessage{ID: it.ID, Service: svc, Version: ver, Level: it.Level, Title: it.Title, AlertSince: it.AlertSince, Labels: it.Labels}); err != nil { |
| 148 | + // 投递失败:跳过状态切换,计数并继续 |
| 149 | + continue |
| 150 | + } |
| 151 | + // 2) 缓存状态原子切换(告警) |
| 152 | + alertKey := "alert:issue:" + it.ID |
| 153 | + rdb.Eval(ctx, alertCAS, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing"}, "Pending", "InProcessing", it.ID) |
| 154 | + // 3) 缓存状态原子切换(服务态:按告警等级推导) |
| 155 | + if svc != "" { // version 可空 |
| 156 | + target := deriveHealth(it.Level) // P0->Error; P1/P2->Warning; else Warning |
| 157 | + svcKey := "service_state:" + svc + ":" + ver |
| 158 | + // 可按需指定旧态索引,否则留空 |
| 159 | + localOld := "" |
| 160 | + newIdx := "service_state:index:health:" + target |
| 161 | + member := svcKey |
| 162 | + rdb.Eval(ctx, svcCAS, []string{svcKey, localOld, newIdx}, "", target, member) |
| 163 | + } |
| 164 | + } |
| 165 | + return nil |
| 166 | +} |
| 167 | + |
| 168 | +func StartScheduler(ctx context.Context, deps Deps) { |
| 169 | + t := time.NewTicker(deps.Interval) |
| 170 | + defer t.Stop() |
| 171 | + for { |
| 172 | + select { |
| 173 | + case <-ctx.Done(): return |
| 174 | + case <-t.C: |
| 175 | + _ = runOnce(ctx, deps.DB, deps.Redis, deps.Queue, deps.Batch) |
| 176 | + } |
| 177 | + } |
| 178 | +} |
| 179 | +``` |
| 180 | + |
| 181 | +—— |
| 182 | + |
| 183 | +## 6. 可观测与重试 |
| 184 | + |
| 185 | +- 指标:扫描次数、选出数量、成功投递数量、CAS 成功/失败数量、用时分位 |
| 186 | +- 日志:每批开始/结束、首尾 ID、错误明细 |
| 187 | +- 重试: |
| 188 | + - 消息投递失败:不更改缓存状态,等待下次扫描重试 |
| 189 | + - CAS 返回 -1(状态被他处更改):记录并跳过;注意此时消息已经投递,下游消费者需按告警 ID 做幂等处理 |
| 190 | + |
| 191 | +—— |
| 192 | + |
| 193 | +## 7. 本地验证 |
| 194 | + |
| 195 | +1) 准备 Redis 与 DB(见 receiver/README.md) |
| 196 | + |
| 197 | +2) 造数据:插入一条 `alert_issues.alert_state='Pending'` 且缓存中存在 `alert:issue:{id}` 的 JSON。 |
| 198 | + |
| 199 | +3) 启动任务:观察日志/指标。 |
| 200 | + |
| 201 | +4) 验证缓存: |
| 202 | +```bash |
| 203 | +redis-cli --raw GET alert:issue:<id> | jq |
| 204 | +redis-cli --raw SMEMBERS alert:index:alert_state:InProcessing | head -n 20 |
| 205 | +redis-cli --raw GET service_state:<service>:<version> | jq |
| 206 | +redis-cli --raw SMEMBERS service_state:index:health:<health_state> | head -n 20 |
| 207 | +``` |
| 208 | + |
| 209 | +5) 验证消息队列:在订阅端查看 `alerts.pending` 是否收到消息。 |
| 210 | + |
| 211 | +—— |
| 212 | + |
| 213 | +## 8. 配置汇总 |
| 214 | + |
| 215 | +``` |
| 216 | +# 扫描任务 |
| 217 | +HC_SCAN_INTERVAL=10s |
| 218 | +HC_SCAN_BATCH=200 |
| 219 | +HC_WORKERS=1 |
| 220 | + |
| 221 | +# 队列 |
| 222 | +ALERT_QUEUE_KIND=redis_stream|kafka|nats |
| 223 | +ALERT_QUEUE_DSN=redis://localhost:6379/0 |
| 224 | +ALERT_QUEUE_TOPIC=alerts.pending |
| 225 | +``` |
| 226 | + |
| 227 | +—— |
| 228 | + |
| 229 | + |
0 commit comments