doc(Metrics&Alert): 修改数据库字段，修改内部处理逻辑并通过测试

acd19ml · acd19ml · commit 1b516fa9bfd0 · 2025-09-15T11:52:07.000+08:00
diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md
@@ -22,7 +22,7 @@
 |--------|------|------|
 | id | varchar(64) PK | 告警 issue ID |
 | state | enum(Closed, Open) | 问题状态 |
-| level | varchar(32) | 告警等级：如 P0/P1/Px/Warning |
+| level | varchar(32) | 告警等级：如 P0/P1/Px |
 | alert_state | enum(Pending, Restored, AutoRestored, InProcessing) | 处理状态 |
 | title | varchar(255) | 告警标题 |
 | labels | json | 标签，格式：[{key, value}] |
@@ -117,25 +117,21 @@
 
 ---
 
-### 7) service_states（服务异常状态表）
+### 7) service_states（服务状态表）
 
 追踪服务在某一版本上的健康状态与处置进度。
 
 | 字段名 | 类型 | 说明 |
 |--------|------|------|
 | service | varchar(255) PK | 服务名 |
 | version | varchar(255) PK | 版本号 |
-<!-- | level | varchar(32) | 影响等级：如 P0/P1/Px/Warning  | -->
-| detail | text | 异常详情（可为 JSON 文本）（可空） |
-| report_at | TIMESTAMP(6) | 首次报告时间 |
+| report_at | TIMESTAMP(6) | alert_issue_ids 所关联的 alert_issues 中，alert_state=InProcessing 的记录里最早的 alert_since（可空） |
 | resolved_at | TIMESTAMP(6) | 解决时间（可空） |
-| health_state | enum(Normal,Processing,Error) | 处置阶段 |
-| correlation_id | varchar(255) | 关联 ID（用于跨系统联动/串联事件）（可空） |
+| health_state | enum(Normal,Warning,Error) | 健康状态（由关联告警的等级推导：P0→Error；P1/P2→Warning） |
+| alert_issue_ids | text[] | 关联的 alert_issues.id 列表 |
 
 **索引建议：**
 - PRIMARY KEY: `(service, version)`
-- INDEX: `(health_state, report_at)`
-- INDEX: `(correlation_id)`
 
 ## 数据关系（ER）
 
diff --git a/internal/alerting/service/healthcheck/README.md b/internal/alerting/service/healthcheck/README.md
@@ -0,0 +1,229 @@
+# healthcheck — Pending 告警扫描与分发任务
+
+本包提供一个定时任务：
+- 周期性扫描 Pending 状态的告警
+- 将告警投递到消息队列（供下游处理器消费）
+- 成功投递后，原子地把缓存中的状态更新：
+  - `alert:issue:{id}` 的 `alertState`：Pending → InProcessing
+  - `service_state:{service}:{version}` 的 `health_state`：由告警等级推导（P0→Error；P1/P2→Warning）
+
+此任务默认只更新缓存，不直接更新数据库，以降低耦合与避免与业务处理竞争。数据库状态可由下游处理器在处理开始时回写，或由后续补偿任务兜底。
+
+——
+
+## 1. 触发与频率
+
+- 间隔：默认每 10s 扫描一次（可配置）
+- 批量：每次最多处理 200 条 Pending（可配置）
+- 并发：串行或小并发（<= 4），避免重复投递
+
+环境变量建议：
+```
+HC_SCAN_INTERVAL=10s
+HC_SCAN_BATCH=200
+HC_WORKERS=1
+```
+
+——
+
+## 2. 数据来源与过滤
+
+优先以数据库为准，结合缓存加速：
+
+- 数据库查询（推荐）
+  ```sql
+  SELECT id, level, title, labels, alert_since
+  FROM alert_issues
+  WHERE alert_state = 'Pending'
+  ORDER BY alert_since ASC
+  LIMIT $1;
+  ```
+
+当告警切换为 InProcessing 时，需要更新对应 `service_states.report_at` 为该 service/version 关联的 `alert_issue_ids` 中，所有 alert_issues 里 alert_state=InProcessing 的 `alert_since` 最早时间（min）。可通过下游处理器或本任务的补充逻辑回填：
+
+```sql
+UPDATE service_states ss
+SET report_at = sub.min_since
+FROM (
+  SELECT si.service, si.version, MIN(ai.alert_since) AS min_since
+  FROM service_states si
+  JOIN alert_issues ai ON ai.id = ANY(si.alert_issue_ids)
+  WHERE ai.alert_state = 'InProcessing'
+  GROUP BY si.service, si.version
+) AS sub
+WHERE ss.service = sub.service AND ss.version = sub.version;
+```
+
+- 或仅用缓存（可选）：
+  - 维护集合 `alert:index:alert_state:Pending`（若未维护，可临时 SCAN `alert:issue:*` 并过滤 JSON 中的 `alertState`，但不推荐在大规模下使用 SCAN）。
+
+——
+
+## 3. 消息队列
+
+抽象接口：
+```go
+type AlertQueue interface {
+    PublishAlert(ctx context.Context, msg AlertMessage) error
+}
+
+type AlertMessage struct {
+    ID         string            `json:"id"`
+    Service    string            `json:"service"`
+    Version    string            `json:"version,omitempty"`
+    Level      string            `json:"level"`
+    Title      string            `json:"title"`
+    AlertSince time.Time         `json:"alert_since"`
+    Labels     map[string]string `json:"labels"`
+}
+```
+
+实现可选：Kafka、NATS、SQS、Redis Stream（示例）：
+```go
+// Redis Stream 样例
+func (q *RedisStreamQueue) PublishAlert(ctx context.Context, m AlertMessage) error {
+    b, _ := json.Marshal(m)
+    return q.r.XAdd(ctx, &redis.XAddArgs{Stream: q.stream, Values: map[string]any{"data": b}}).Err()
+}
+```
+
+环境变量建议：
+```
+ALERT_QUEUE_KIND=redis_stream|kafka|nats
+ALERT_QUEUE_DSN=redis://localhost:6379/0
+ALERT_QUEUE_TOPIC=alerts.pending
+```
+
+——
+
+## 4. 缓存键与原子更新
+
+现有（或建议）键：
+- 告警：`alert:issue:{id}` → JSON，字段包含 `alertState`
+- 索引（可选）：`alert:index:alert_state:{Pending|InProcessing|...}`
+- 服务态：`service_state:{service}:{version}` → JSON，字段包含 `health_state`
+- 索引：`service_state:index:health:{Error|Warning|...}`
+
+为避免并发写冲突，建议使用 Lua CAS（Compare-And-Set）脚本原子修改值与索引：
+
+```lua
+-- KEYS[1] = alert key, ARGV[1] = expected, ARGV[2] = next, KEYS[2] = idx:old, KEYS[3] = idx:new, ARGV[3] = id
+local v = redis.call('GET', KEYS[1])
+if not v then return 0 end
+local obj = cjson.decode(v)
+if obj.alertState ~= ARGV[1] then return -1 end
+obj.alertState = ARGV[2]
+redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL')
+if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[3]) end
+if KEYS[3] ~= '' then redis.call('SADD', KEYS[3], ARGV[3]) end
+return 1
+```
+
+服务态类似（示例将态切换到推导的新态）：
+```lua
+-- KEYS[1] = service_state key, ARGV[1] = expected(optional), ARGV[2] = next, KEYS[2] = idx:old(optional), KEYS[3] = idx:new, ARGV[3] = member
+local v = redis.call('GET', KEYS[1])
+if not v then return 0 end
+local obj = cjson.decode(v)
+if ARGV[1] ~= '' and obj.health_state ~= ARGV[1] then return -1 end
+obj.health_state = ARGV[2]
+redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL')
+if KEYS[2] ~= '' then redis.call('SREM', KEYS[2], ARGV[3]) end
+if KEYS[3] ~= '' then redis.call('SADD', KEYS[3], ARGV[3]) end
+return 1
+```
+
+——
+
+## 5. 任务流程（伪代码）
+
+```go
+func runOnce(ctx context.Context, db *Database, rdb *redis.Client, q AlertQueue, batch int) error {
+    rows := queryPendingFromDB(ctx, db, batch) // id, level, title, labels(JSON), alert_since
+    for _, it := range rows {
+        svc := it.Labels["service"]
+        ver := it.Labels["service_version"]
+        // 1) 投递消息
+        if err := q.PublishAlert(ctx, AlertMessage{ID: it.ID, Service: svc, Version: ver, Level: it.Level, Title: it.Title, AlertSince: it.AlertSince, Labels: it.Labels}); err != nil {
+            // 投递失败：跳过状态切换，计数并继续
+            continue
+        }
+        // 2) 缓存状态原子切换（告警）
+        alertKey := "alert:issue:" + it.ID
+        rdb.Eval(ctx, alertCAS, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing"}, "Pending", "InProcessing", it.ID)
+        // 3) 缓存状态原子切换（服务态：按告警等级推导）
+        if svc != "" { // version 可空
+            target := deriveHealth(it.Level) // P0->Error; P1/P2->Warning; else Warning
+            svcKey := "service_state:" + svc + ":" + ver
+            // 可按需指定旧态索引，否则留空
+            localOld := ""
+            newIdx := "service_state:index:health:" + target
+            member := svcKey
+            rdb.Eval(ctx, svcCAS, []string{svcKey, localOld, newIdx}, "", target, member)
+        }
+    }
+    return nil
+}
+
+func StartScheduler(ctx context.Context, deps Deps) {
+    t := time.NewTicker(deps.Interval)
+    defer t.Stop()
+    for {
+        select {
+        case <-ctx.Done(): return
+        case <-t.C:
+            _ = runOnce(ctx, deps.DB, deps.Redis, deps.Queue, deps.Batch)
+        }
+    }
+}
+```
+
+——
+
+## 6. 可观测与重试
+
+- 指标：扫描次数、选出数量、成功投递数量、CAS 成功/失败数量、用时分位
+- 日志：每批开始/结束、首尾 ID、错误明细
+- 重试：
+  - 消息投递失败：不更改缓存状态，等待下次扫描重试
+  - CAS 返回 -1（状态被他处更改）：记录并跳过
+
+——
+
+## 7. 本地验证
+
+1) 准备 Redis 与 DB（见 receiver/README.md）
+
+2) 造数据：向 `alert_issues` 插入一条 `alert_state='Pending'` 的记录，并确保缓存中存在对应的 `alert:issue:{id}` JSON。
+
+3) 启动任务：观察日志/指标。
+
+4) 验证缓存：
+```bash
+redis-cli --raw GET alert:issue:<id> | jq
+redis-cli --raw SMEMBERS alert:index:alert_state:InProcessing | head -n 20
+redis-cli --raw GET service_state:<service>:<version> | jq
+redis-cli --raw SMEMBERS service_state:index:health:Warning | head -n 20
+```
+
+5) 验证消息队列：在订阅端查看 `alerts.pending` 是否收到消息。
+
+——
+
+## 8. 配置汇总
+
+```
+# 扫描任务
+HC_SCAN_INTERVAL=10s
+HC_SCAN_BATCH=200
+HC_WORKERS=1
+
+# 队列
+ALERT_QUEUE_KIND=redis_stream|kafka|nats
+ALERT_QUEUE_DSN=redis://localhost:6379/0
+ALERT_QUEUE_TOPIC=alerts.pending
+```
+
+——
+
+
diff --git a/internal/alerting/service/receiver/README.md b/internal/alerting/service/receiver/README.md
@@ -164,18 +164,21 @@ func (h *Handler) AlertmanagerWebhook(c *gin.Context) {
             // 若唯一约束冲突/网络抖动等，记录后继续
             continue
         }
-        // 5) 同步写入 service_states（health_state=Error；detail/resolved_at/correlation_id 留空）
+        // 5) 同步写入 service_states（health_state 由 level 推导；resolved_at 留空；alert_issue_ids 追加新 issue id）
+        //    规则：P0→Error；P1/P2→Warning；其他→Warning（可按需调整）
         //    service 从 labels.service 取；version 可从 labels.service_version 取（可空）
-        if err := h.dao.UpsertServiceState(c, a.Labels["service"], a.Labels["service_version"], row.AlertSince, "Error"); err != nil {
+        derived := func(l string) string { if l == "P0" { return "Error" }; if l == "P1" || l == "P2" { return "Warning" }; return "Warning" }(row.Level)
+        //    report_at 此处暂不写，由 healthcheck 定时任务在 alert_issues 进入 InProcessing 后回填为最早的 alert_since
+        if err := h.dao.UpsertServiceState(c, a.Labels["service"], a.Labels["service_version"], nil, derived, row.ID); err != nil {
             // 仅记录错误，不阻断主流程
         }
         // 6) 写通到 Redis（不阻塞主流程，失败仅记录日志）
         //    alert_issues
         if err := h.cache.WriteIssue(c, row, a); err != nil {
             // 仅记录错误，避免影响 Alertmanager 重试逻辑
         }
-        //    service_states
-        _ = h.cache.WriteServiceState(c, a.Labels["service"], a.Labels["service_version"], row.AlertSince, "Error")
+        //    service_states（使用同样的推导态）
+        _ = h.cache.WriteServiceState(c, a.Labels["service"], a.Labels["service_version"], "", derived)
         MarkSeen(key) // 记忆幂等键
         created++
     }
@@ -264,7 +267,7 @@ func NormalizeLevel(sev string) string {
 目标：将 Alertmanager 的单条 AMAlert → AlertIssueRow。
 	•	id：uuid.NewString()
 	•	state：Open（首次创建强制）
-	•	alertState：InProcessing（首次创建强制）
+	•	alertState：Pending（首次创建强制）
 	•	level：NormalizeLevel(alert.Labels["severity"])
 	•	title：优先 annotations.summary，否则拼：{idc} {service} {alertname} ...
 	•	label：把 labels 展平成 [{key,value}]（额外加上一些关键来源信息：am_fingerprint、generatorURL、groupKey）
@@ -509,8 +512,8 @@ curl -X POST http://localhost:8080/v1/integrations/alertmanager/webhook \
 	•	service=serviceA
 	•	version=（若 labels 中有 service_version 则为其值，否则为空字符串）
 	•	report_at=与 alert_since 一致（若已存在则保留更早的 report_at）
-	•	health_state=Error
-	•	detail/resolved_at/correlation_id 为空
+	•	health_state=Warning（因本示例 level=P1）
+	•	alert_issue_ids 包含刚插入的 alert_issues.id
 
 Redis 中应看到：
 	•	key: alert:issue:<id> 值为 JSON 且 TTL≈3 天
diff --git a/internal/alerting/service/receiver/dao.go b/internal/alerting/service/receiver/dao.go
@@ -14,7 +14,7 @@ type AlertIssueDAO interface {
 
 // ServiceStateWriter optionally allows writing to service_states table.
 type ServiceStateWriter interface {
-	UpsertServiceState(ctx context.Context, service, version string, reportAt time.Time, healthState string) error
+	UpsertServiceState(ctx context.Context, service, version string, reportAt *time.Time, healthState string, issueID string) error
 }
 
 type NoopDAO struct{}
@@ -23,7 +23,7 @@ func NewNoopDAO() *NoopDAO { return &NoopDAO{} }
 
 func (d *NoopDAO) InsertAlertIssue(ctx context.Context, r *AlertIssueRow) error { return nil }
 
-func (d *NoopDAO) UpsertServiceState(ctx context.Context, service, version string, reportAt time.Time, healthState string) error {
+func (d *NoopDAO) UpsertServiceState(ctx context.Context, service, version string, reportAt *time.Time, healthState string, issueID string) error {
 	return nil
 }
 
@@ -44,17 +44,26 @@ func (d *PgDAO) InsertAlertIssue(ctx context.Context, r *AlertIssueRow) error {
 	return nil
 }
 
-// UpsertServiceState inserts or updates service_states with health_state and earliest report_at.
-// detail, resolved_at, correlation_id remain empty/unchanged.
-func (d *PgDAO) UpsertServiceState(ctx context.Context, service, version string, reportAt time.Time, healthState string) error {
+// UpsertServiceState inserts or updates service_states with health_state and alert_issue_ids.
+// report_at is not updated here except at insert-time if provided (may be NULL).
+func (d *PgDAO) UpsertServiceState(ctx context.Context, service, version string, reportAt *time.Time, healthState string, issueID string) error {
 	const q = `
-	INSERT INTO service_states (service, version, report_at, health_state)
-	VALUES ($1, $2, $3, $4)
+	INSERT INTO service_states (service, version, report_at, health_state, alert_issue_ids)
+	VALUES ($1, $2, $3, $4, ARRAY[$5]::text[])
 	ON CONFLICT (service, version) DO UPDATE
 	SET health_state = EXCLUDED.health_state,
-	    report_at = LEAST(service_states.report_at, EXCLUDED.report_at)
+		alert_issue_ids = CASE
+			WHEN NOT ($5 = ANY(service_states.alert_issue_ids)) THEN array_append(service_states.alert_issue_ids, $5)
+			ELSE service_states.alert_issue_ids
+		END
 	`
-	if _, err := d.DB.ExecContext(ctx, q, service, version, reportAt, healthState); err != nil {
+	var reportAtVal any
+	if reportAt != nil {
+		reportAtVal = *reportAt
+	} else {
+		reportAtVal = nil
+	}
+	if _, err := d.DB.ExecContext(ctx, q, service, version, reportAtVal, healthState, issueID); err != nil {
 		return fmt.Errorf("upsert service_state: %w", err)
 	}
 	return nil
diff --git a/internal/alerting/service/receiver/handler.go b/internal/alerting/service/receiver/handler.go
@@ -3,6 +3,7 @@ package receiver
 import (
 	"net/http"
 	"strings"
+	"time"
 
 	"github.com/fox-gonic/fox"
 )
@@ -60,13 +61,19 @@ func (h *Handler) AlertmanagerWebhook(c *fox.Context) {
 		if err := h.dao.InsertAlertIssue(c.Request.Context(), row); err != nil {
 			continue
 		}
-		// Upsert service_states: health_state=Error; detail/resolved_at/correlation_id left empty
+
 		if w, ok := h.dao.(ServiceStateWriter); ok {
 			service := strings.TrimSpace(a.Labels["service"])
 			version := strings.TrimSpace(a.Labels["service_version"]) // optional
 			if service != "" {
-				_ = w.UpsertServiceState(c.Request.Context(), service, version, row.AlertSince, "Error")
-				_ = h.cache.WriteServiceState(c.Request.Context(), service, version, row.AlertSince, "Error")
+				derived := "Warning"
+				if row.Level == "P0" {
+					derived = "Error"
+				} else if row.Level == "P1" || row.Level == "P2" {
+					derived = "Warning"
+				}
+				_ = w.UpsertServiceState(c.Request.Context(), service, version, nil, derived, row.ID)
+				_ = h.cache.WriteServiceState(c.Request.Context(), service, version, time.Time{}, derived)
 			}
 		}
 		// Write-through to cache. Errors are ignored to avoid impacting webhook ack.