@@ -154,16 +154,125 @@ KUBECONFIG_PATH = xxx (Optional参数,只有当KUBECONFIG_MODE = LOCAL 时生
154154
155155### 访问可观测数据
156156
157- 优先访问ACK集群对应的阿里云Prometheus服务数据,如没有对应服务,通过env参数寻找可观测数据的访问地址。
158- 通过配置可指定[ Prometheus Read HTTP API] ( https://prometheus.io/docs/prometheus/latest/querying/api/ ) 。
157+ #### Prometheus 端点解析策略
158+
159+ ack-mcp-server 支持三种 Prometheus 端点解析模式,通过 `prometheus_endpoint_mode` 参数(命令行 `--prometheus-endpoint-mode` 或环境变量 `PROMETHEUS_ENDPOINT_MODE`)配置:
160+
161+ ** 1. ARMS_PUBLIC(默认模式)**
162+
163+ 通过阿里云 ARMS API 自动获取集群对应的 Prometheus 实例公网端点,失败时回退到本地配置:
164+
165+ ``` bash
166+ # 命令行参数
167+ --prometheus-endpoint-mode ARMS_PUBLIC
168+
169+ # 环境变量
170+ export PROMETHEUS_ENDPOINT_MODE=ARMS_PUBLIC
171+ ```
172+
173+ - 调用 ARMS GetPrometheusInstance API 获取 ` http_api_inter_url ` (公网访问地址)
174+ - 适用于 ack-mcp-server 部署在集群外部的场景
175+ - ARMS API 失败时自动回退到本地配置
176+
177+ ** 2. ARMS_PRIVATE(内网模式)**
178+
179+ 通过阿里云 ARMS API 自动获取集群对应的 Prometheus 实例内网端点:
180+
181+ ``` bash
182+ # 命令行参数
183+ --prometheus-endpoint-mode ARMS_PRIVATE
184+
185+ # 环境变量
186+ export PROMETHEUS_ENDPOINT_MODE=ARMS_PRIVATE
187+ ```
188+
189+ - 调用 ARMS GetPrometheusInstance API 获取 ` http_api_intra_url ` (内网访问地址)
190+ - 适用于 ack-mcp-server 部署在集群内部或与阿里云 VPC 内网打通的场景
191+ - **要求**:ack-mcp-server 的部署环境必须与集群所在 region 的阿里云 VPC 内网网络互通
192+ - ARMS API 失败时自动回退到本地配置
193+
194+ ** 3. LOCAL(本地配置模式)**
195+
196+ 仅使用本地静态配置或环境变量,不调用 ARMS API:
197+
198+ ``` bash
199+ # 命令行参数
200+ --prometheus-endpoint-mode LOCAL
201+
202+ # 环境变量
203+ export PROMETHEUS_ENDPOINT_MODE=LOCAL
204+ ```
205+
206+ - 不调用任何 ARMS API
207+ - 适用于使用自建 Prometheus 或开发测试环境
208+ - 必须通过环境变量或静态配置指定 Prometheus 端点
209+
210+ #### Prometheus 端点配置
211+
212+ 当使用 `LOCAL` 模式,或 ARMS API 调用失败而回退到本地配置时,按如下优先级查找 Prometheus HTTP API 端点:
159213
160- 当该集群没有阿里云Prometheus对应实例数据,ack-mcp-server将按按如下优先级寻找={prometheus_http_api_url}访问可观测数据。
161214``` shell
162- env参数配置:
215+ # 1. 集群特定配置(优先级最高)
163216PROMETHEUS_HTTP_API_{cluster_id}={prometheus_http_api_url}
217+
218+ # 2. 全局默认配置
164219PROMETHEUS_HTTP_API={prometheus_http_api_url}
220+
221+ # 示例
222+ export PROMETHEUS_HTTP_API_c1234567890=" https://prometheus-cluster1.example.com"
223+ export PROMETHEUS_HTTP_API=" https://prometheus-default.example.com"
165224```
166225
226+ #### ExecutionLog 可观测性
227+
228+ 所有 Prometheus 端点解析过程都记录在 ` ExecutionLog ` 中,包括:
229+
230+ - **mode**: 使用的解析模式(`ARMS_PUBLIC`、`ARMS_PRIVATE` 或 `LOCAL`)
231+ - **source**: 端点来源(`arms_api`、`static_config` 或 `env_var:XXX`)
232+ - **endpoint_type**: 端点类型(`public` 或 `private`,仅 ARMS 模式)
233+ - **request_id**: ARMS API 调用的请求 ID(如适用)
234+ - **duration_ms**: API 调用耗时,单位毫秒(如适用)
235+ - **endpoint**: 最终解析的端点地址
236+
237+ 示例 ExecutionLog(ARMS_PUBLIC 模式):
238+
239+ ``` json
240+ {
241+ "api_calls" : [
242+ {
243+ "api" : " GetPrometheusInstance" ,
244+ "source" : " arms_api" ,
245+ "mode" : " ARMS_PUBLIC" ,
246+ "cluster_id" : " c1234567890" ,
247+ "region_id" : " cn-hangzhou" ,
248+ "request_id" : " B8A0D7C3-..." ,
249+ "duration_ms" : 245 ,
250+ "status" : " success" ,
251+ "endpoint_type" : " public"
252+ }
253+ ]
254+ }
255+ ```
256+
257+ 示例 ExecutionLog(LOCAL 模式):
258+
259+ ``` json
260+ {
261+ "api_calls" : [
262+ {
263+ "api" : " GetPrometheusEndpoint" ,
264+ "source" : " env_var:PROMETHEUS_HTTP_API_c1234567890" ,
265+ "mode" : " LOCAL" ,
266+ "cluster_id" : " c1234567890" ,
267+ "endpoint" : " https://prometheus-cluster1.example.com" ,
268+ "status" : " success"
269+ }
270+ ]
271+ }
272+ ```
273+
274+ 通过配置可指定 [Prometheus Read HTTP API](https://prometheus.io/docs/prometheus/latest/querying/api/)。
275+
167276## 包命名和版本管理
168277
169278### 项目命名
@@ -404,6 +513,253 @@ async def query_prometheus_tool(
404513
405514MCP 服务器实现两种主要类型的端点:
406515
516+ ### 执行日志追踪 (ExecutionLog)
517+
518+ #### 设计目标
519+
520+ 所有 ack-mcp-server 工具调用都实现完整的执行日志追踪,记录工具执行的全生命周期,包括:
521+ - 工具调用的起止时间和总耗时
522+ - 所有外部 API 调用(ACK、ARMS、SLS 等)的详细信息
523+ - 执行过程中的警告信息
524+ - 错误信息和异常元数据
525+
526+ 这些日志用于审计、性能监控、问题诊断和系统可观测性。
527+
528+ #### ExecutionLog 数据结构
529+
530+ ``` python
531+ class ExecutionLog (BaseModel ):
532+ """ 执行日志模型"""
533+ tool_call_id: str = Field(... , description = " 工具调用的唯一标识符" )
534+ start_time: str = Field(... , description = " 执行开始时间(ISO 8601格式)" )
535+ end_time: Optional[str ] = Field(None , description = " 执行结束时间(ISO 8601格式)" )
536+ duration_ms: Optional[int ] = Field(None , description = " 总执行时长(毫秒)" )
537+ messages: List[str ] = Field(default_factory = list , description = " 执行过程中的消息" )
538+ api_calls: List[Dict[str , Any]] = Field(default_factory = list , description = " API 调用记录列表" )
539+ warnings: List[str ] = Field(default_factory = list , description = " 警告信息列表" )
540+ error: Optional[str ] = Field(None , description = " 错误信息" )
541+ metadata: Optional[Dict[str , Any]] = Field(None , description = " 额外的元数据信息" )
542+ ```
543+
544+ #### 实现原则
545+
546+ ** 1. 成功场景 - 精简日志**
547+
548+ 正常成功的执行保持日志精简,仅记录关键信息:
549+ - API 调用名称、请求 ID、耗时、状态
550+ - 避免冗余的描述性消息
551+ - 不填充 metadata 字段
552+
553+ ``` python
554+ execution_log.api_calls.append({
555+ " api" : " DescribeClusterDetail" ,
556+ " cluster_id" : cluster_id,
557+ " request_id" : " B8A0D7C3-..." ,
558+ " duration_ms" : 234 ,
559+ " status" : " success"
560+ })
561+ ```
562+
563+ ** 2. 错误场景 - 详细日志**
564+
565+ 错误场景记录完整的诊断信息:
566+ - 错误类型、错误码、失败阶段
567+ - 详细的错误消息和堆栈信息
568+ - 上下文元数据(请求参数、状态等)
569+
570+ ``` python
571+ execution_log.error = " Cluster endpoint not available"
572+ execution_log.metadata = {
573+ " error_type" : " ValueError" ,
574+ " error_code" : " EndpointNotFound" ,
575+ " failure_stage" : " kubeconfig_acquisition" ,
576+ " cluster_id" : cluster_id,
577+ " kubeconfig_mode" : " ACK_PRIVATE"
578+ }
579+ ```
580+
581+ ** 3. 外部调用追踪**
582+
583+ 所有外部 API 调用都必须记录:
584+ - **阿里云 OpenAPI**:记录 request_id、duration_ms、http_status
585+ - **Prometheus HTTP API**:记录 response_size_bytes、endpoint
586+ - **Kubectl 命令**:记录 command、exit_code、type (normal/streaming)
587+ - **Kubeconfig 获取**:记录 source (cache/ack_api/local_file/incluster)
588+
589+ #### 使用示例
590+
591+ ** 工具初始化执行日志**
592+
593+ ``` python
594+ @mcp.tool (name = ' query_prometheus' )
595+ async def query_prometheus (
596+ ctx : Context,
597+ query : str = Field(... , description = " PromQL 查询语句" ),
598+ cluster_id : str = Field(... , description = " 集群 ID" ),
599+ ) -> QueryPrometheusOutput:
600+ # 初始化执行日志
601+ start_ms = int (time.time() * 1000 )
602+ execution_log = ExecutionLog(
603+ tool_call_id = f " query_prometheus_ { cluster_id} _ { start_ms} " ,
604+ start_time = datetime.utcnow().isoformat() + " Z"
605+ )
606+
607+ try :
608+ # ... 执行业务逻辑 ...
609+
610+ # 记录结束时间
611+ execution_log.end_time = datetime.utcnow().isoformat() + " Z"
612+ execution_log.duration_ms = int (time.time() * 1000 ) - start_ms
613+
614+ return QueryPrometheusOutput(
615+ resultType = " matrix" ,
616+ result = results,
617+ execution_log = execution_log
618+ )
619+ except Exception as e:
620+ execution_log.error = str (e)
621+ execution_log.end_time = datetime.utcnow().isoformat() + " Z"
622+ execution_log.duration_ms = int (time.time() * 1000 ) - start_ms
623+ execution_log.metadata = {
624+ " error_type" : type (e).__name__ ,
625+ " failure_stage" : " prometheus_query"
626+ }
627+ return {
628+ " error" : ErrorModel(error_code = " QueryFailed" , error_message = str (e)).model_dump(),
629+ " execution_log" : execution_log
630+ }
631+ ```
632+
633+ ** API 调用追踪**
634+
635+ ``` python
636+ # ACK OpenAPI 调用
637+ api_start = int (time.time() * 1000 )
638+ response = await cs_client.describe_cluster_detail_with_options_async(
639+ cluster_id, request, headers, runtime
640+ )
641+ api_duration = int (time.time() * 1000 ) - api_start
642+
643+ # 提取 request_id
644+ request_id = None
645+ if hasattr (response, ' headers' ) and response.headers:
646+ request_id = response.headers.get(' x-acs-request-id' , ' N/A' )
647+
648+ execution_log.api_calls.append({
649+ " api" : " DescribeClusterDetail" ,
650+ " cluster_id" : cluster_id,
651+ " request_id" : request_id,
652+ " duration_ms" : api_duration,
653+ " status" : " success"
654+ })
655+ ```
656+
657+ ** 轮询场景 - 合并中间日志**
658+
659+ 对于需要轮询的异步操作(如诊断任务、巡检任务),需要合并中间轮询调用的执行日志:
660+
661+ ``` python
662+ # 提取轮询调用的 ExecutionLog
663+ if isinstance (result, dict ) and " execution_log" in result:
664+ poll_execution_log = result.get(" execution_log" )
665+ elif hasattr (result, ' execution_log' ):
666+ poll_execution_log = result.execution_log
667+
668+ # 合并到主执行日志
669+ if poll_execution_log:
670+ if hasattr (poll_execution_log, ' api_calls' ):
671+ execution_log.api_calls.extend(poll_execution_log.api_calls)
672+ if hasattr (poll_execution_log, ' warnings' ) and poll_execution_log.warnings:
673+ execution_log.warnings.extend(poll_execution_log.warnings)
674+ ```
675+
676+ #### 输出模型标准
677+
678+ 所有工具的输出模型必须继承 ` BaseOutputModel ` 以包含 ` execution_log ` 字段:
679+
680+ ``` python
681+ class BaseOutputModel (BaseModel ):
682+ """ 所有输出模型的基类"""
683+ execution_log: ExecutionLog = Field(
684+ default_factory = lambda : ExecutionLog(
685+ tool_call_id = " " ,
686+ start_time = datetime.utcnow().isoformat() + " Z"
687+ ),
688+ description = " 执行日志"
689+ )
690+
691+ class QueryPrometheusOutput (BaseOutputModel ):
692+ """ Prometheus 查询输出"""
693+ resultType: str = Field(... , description = " 结果类型" )
694+ result: List[QueryPrometheusSeriesPoint] = Field(... , description = " 查询结果" )
695+ # execution_log 自动继承
696+ ```
697+
698+ #### 完整示例日志
699+
700+ ** 成功场景** :
701+ ``` json
702+ {
703+ "tool_call_id": "query_prometheus_c1234567890_1737282189000",
704+ "start_time" : " 2025-01-19T10:23:09Z" ,
705+ "end_time" : " 2025-01-19T10:23:10Z" ,
706+ "duration_ms" : 1245 ,
707+ "api_calls" : [
708+ {
709+ "api" : " GetPrometheusInstance" ,
710+ "source" : " arms_api" ,
711+ "mode" : " ARMS_PUBLIC" ,
712+ "cluster_id" : " c1234567890" ,
713+ "region_id" : " cn-hangzhou" ,
714+ "request_id" : " B8A0D7C3-1D2E-4F5A-9B8C-7D6E5F4A3B2C" ,
715+ "duration_ms" : 245 ,
716+ "status" : " success" ,
717+ "endpoint_type" : " public"
718+ },
719+ {
720+ "api" : " PrometheusQuery" ,
721+ "endpoint" : " https://prometheus.cn-hangzhou.aliyuncs.com/api/v1/query_range" ,
722+ "cluster_id" : " c1234567890" ,
723+ "duration_ms" : 856 ,
724+ "status" : " success" ,
725+ "http_status" : 200 ,
726+ "response_size_bytes" : 3456
727+ }
728+ ],
729+ "warnings" : [],
730+ "error" : null ,
731+ "metadata" : null
732+ }
733+ ```
734+
735+ ** 错误场景** :
736+ ``` json
737+ {
738+ "tool_call_id": "ack_kubectl_c1234567890_1737282289000",
739+ "start_time" : " 2025-01-19T10:24:49Z" ,
740+ "end_time" : " 2025-01-19T10:24:50Z" ,
741+ "duration_ms" : 567 ,
742+ "api_calls" : [
743+ {
744+ "api" : " DescribeClusterDetail" ,
745+ "cluster_id" : " c1234567890" ,
746+ "request_id" : " A7B2C6D4-..." ,
747+ "duration_ms" : 234 ,
748+ "status" : " failed" ,
749+ "error" : " No intranet endpoint"
750+ }
751+ ],
752+ "warnings" : [],
753+ "error" : " Cluster c1234567890 does not have intranet endpoint access" ,
754+ "metadata" : {
755+ "error_type" : " ValueError" ,
756+ "failure_stage" : " kubeconfig_acquisition" ,
757+ "kubeconfig_mode" : " ACK_PRIVATE" ,
758+ "cluster_id" : " c1234567890"
759+ }
760+ }
761+ ```
762+
407763### 资源定义
408764
409765MCP协议中,“资源”为定制化地请求和访问本地的资源 (Resources allow servers to share data that provides context to language models, such as files, database schemas, or application-specific information. Each resource is uniquely identified by a URI.)
0 commit comments