Skip to content

Commit e101741

Browse files
committed
address additional comments, add some shortcuts, better documentation, additional testing, better naming
1 parent f97806d commit e101741

File tree

13 files changed

+179
-122
lines changed

13 files changed

+179
-122
lines changed

docs/topics/fenic-mcp.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ session.catalog.drop_tool("users_by_name_regex", ignore_if_not_exists=True)
136136

137137
### Step 2a: Auto-generate system tools from catalog tables
138138

139-
You can generate a suite of reusable data tools (Schema, Profile, Read, Search Summary, Search Content, Analyze) directly from catalog tables and their descriptions. This is helpful for quickly exposing exploratory and read/query capabilities to MCP.
139+
You can generate a suite of reusable data tools (Schema, Profile, Read, Search Summary, Search Content, Analyze) directly from catalog tables and their descriptions. This is helpful for quickly exposing exploratory and read/query capabilities to MCP.

Available tools include:

- Schema: list columns/types for any or all tables.
- Profile: column statistics (counts, basic numeric analysis [min, max, mean, etc.], contextual information for text columns [average_length, etc.]).
- Read: read a selection of rows from a single table. These rows can be paged over, filtered, and narrowed with column projections.
- Search Summary: regex search across all text columns in all tables; returns dataframe names with result counts.
- Search Content: regex search across a single table, specifying one or more text columns to search; returns the rows matching the query.
- Analyze: write raw SQL to perform complex analysis on one or more tables.
140140

141141
Requirements:
142142

@@ -147,15 +147,15 @@ Example:
147147
```python
148148
from fenic import Session
149149
from fenic.api.mcp.server import create_mcp_server
150-
from fenic.api.mcp.tools import ToolGenerationConfig
150+
from fenic.api.mcp.tools import SystemToolConfig
151151

152152
session = Session.get_or_create(...)
153153
server = create_mcp_server(
154154
session,
155155
server_name="Fenic MCP",
156-
system_tools=ToolGenerationConfig(
156+
system_tools=SystemToolConfig(
157157
table_names=["orders", "users"],
158-
tool_group_name="Dataset Exploration",
158+
tool_namespace="Dataset Exploration",
159159
max_result_rows=200,
160160
),
161161
)

src/fenic/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
SemanticExtensions,
2424
Session,
2525
SessionConfig,
26-
ToolGenerationConfig,
26+
SystemToolConfig,
2727
array,
2828
array_agg,
2929
array_contains,
@@ -239,7 +239,7 @@
239239
"BoundToolParam",
240240
"UserDefinedTool",
241241
"SystemTool",
242-
"ToolGenerationConfig",
242+
"SystemToolConfig",
243243
"create_mcp_server",
244244
"run_mcp_server_asgi",
245245
"run_mcp_server_async",

src/fenic/api/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
from fenic.api.io import DataFrameReader, DataFrameWriter
4545
from fenic.api.lineage import Lineage
4646
from fenic.api.mcp import (
47-
ToolGenerationConfig,
47+
SystemToolConfig,
4848
create_mcp_server,
4949
run_mcp_server_asgi,
5050
run_mcp_server_async,
@@ -137,5 +137,5 @@
137137
"run_mcp_server_sync",
138138
"run_mcp_server_async",
139139
"run_mcp_server_asgi",
140-
"ToolGenerationConfig",
140+
"SystemToolConfig",
141141
]

src/fenic/api/mcp/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""MCP Tool Creation/Server Management API."""
22

33
from fenic.api.mcp.server import (
4-
ToolGenerationConfig,
4+
SystemToolConfig,
55
create_mcp_server,
66
run_mcp_server_asgi,
77
run_mcp_server_async,
@@ -13,5 +13,5 @@
1313
"run_mcp_server_sync",
1414
"run_mcp_server_async",
1515
"run_mcp_server_asgi",
16-
"ToolGenerationConfig",
16+
"SystemToolConfig",
1717
]

src/fenic/api/mcp/_tool_generation_utils.py

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,23 @@
88
import polars as pl
99
from typing_extensions import Annotated
1010

11-
from fenic import (
11+
from fenic.api import col
12+
from fenic.api.dataframe import DataFrame
13+
from fenic.api.functions import avg, stddev
14+
from fenic.api.functions import max as max_
15+
from fenic.api.functions import min as min_
16+
from fenic.api.session import Session
17+
from fenic.core._logical_plan import LogicalPlan
18+
from fenic.core._logical_plan.plans import InMemorySource
19+
from fenic.core.error import ConfigurationError, ValidationError
20+
from fenic.core.mcp.types import SystemTool
21+
from fenic.core.types.datatypes import (
1222
BooleanType,
13-
DataFrame,
1423
DoubleType,
1524
FloatType,
1625
IntegerType,
17-
Session,
1826
StringType,
19-
SystemTool,
20-
avg,
21-
col,
22-
stddev,
2327
)
24-
from fenic import max as max_
25-
from fenic import min as min_
26-
from fenic.core._logical_plan import LogicalPlan
27-
from fenic.core._logical_plan.plans import InMemorySource
28-
from fenic.core._utils.schema import convert_custom_dtype_to_polars
29-
from fenic.core.error import ConfigurationError, ValidationError
3028

3129
PROFILE_MAX_SAMPLE_SIZE = 10_000
3230

@@ -44,11 +42,11 @@ class DatasetSpec:
4442
description: str
4543
df: DataFrame
4644

47-
def auto_generate_system_tools(
45+
def _auto_generate_system_tools(
4846
datasets: List[DatasetSpec],
4947
session: Session,
5048
*,
51-
tool_group_name: str,
49+
tool_namespace: Optional[str],
5250
max_result_limit: int = 100,
5351
) -> List[SystemTool]:
5452
"""Generate the system tools spanning all datasets: Schema, Profile, Read, Search Summary, Search Content, and Analyze.
@@ -67,7 +65,7 @@ def auto_generate_system_tools(
6765
schema_tool = _auto_generate_schema_tool(
6866
datasets,
6967
session,
70-
tool_name=f"{tool_group_name} - Schema",
68+
tool_name=f"{tool_namespace} - Schema" if tool_namespace else "Schema",
7169
tool_description="\n\n".join([
7270
"Show the schema (column names and types) for any or all of the datasets listed below. This call should be the first step in exploring the available datasets.",
7371
group_desc,
@@ -77,7 +75,7 @@ def auto_generate_system_tools(
7775
profile_tool = _auto_generate_profile_tool(
7876
datasets,
7977
session,
80-
tool_name=f"{tool_group_name} - Profile",
78+
tool_name=f"{tool_namespace} - Profile" if tool_namespace else "Profile",
8179
tool_description="\n".join([
8280
"Return dataset data profile: row_count and per-column stats for any or all of the datasets listed below.",
8381
"This call should be used as a follow up after calling the `Schema` tool."
@@ -91,8 +89,8 @@ def auto_generate_system_tools(
9189
read_tool = _auto_generate_read_tool(
9290
datasets,
9391
session,
94-
tool_name=f"{tool_group_name} - Read",
95-
tool_description="\n\n".join([
92+
tool_name=f"{tool_namespace} - Read" if tool_namespace else "Read",
93+
tool_description="\n".join([
9694
"Read rows from a single dataset. Use to sample data, or to execute simple queries over the data that do not require filtering or grouping.",
9795
"Use `include_columns` and `exclude_columns` to filter columns by name -- this is important to conserve token usage. Use the `Profile` tool to understand the columns and their sizes.",
9896
"Available datasets:",
@@ -104,8 +102,8 @@ def auto_generate_system_tools(
104102
search_summary_tool = _auto_generate_search_summary_tool(
105103
datasets,
106104
session,
107-
tool_name=f"{tool_group_name} - Search Summary",
108-
tool_description="\n\n".join([
105+
tool_name=f"{tool_namespace} - Search Summary" if tool_namespace else "Search Summary",
106+
tool_description="\n".join([
109107
"Perform a substring/regex search across all datasets and return a summary of the number of matches per dataset.",
110108
"Available datasets:",
111109
group_desc,
@@ -114,8 +112,8 @@ def auto_generate_system_tools(
114112
search_content_tool = _auto_generate_search_content_tool(
115113
datasets,
116114
session,
117-
tool_name=f"{tool_group_name} - Search Content",
118-
tool_description="\n\n".join([
115+
tool_name=f"{tool_namespace} - Search Content" if tool_namespace else "Search Content",
116+
tool_description="\n".join([
119117
"Return matching rows from a single dataset using substring/regex across string columns.",
120118
"Available datasets:",
121119
group_desc,
@@ -126,8 +124,8 @@ def auto_generate_system_tools(
126124
analyze_tool = _auto_generate_sql_tool(
127125
datasets,
128126
session,
129-
tool_name=f"{tool_group_name} - Analyze",
130-
tool_description="\n\n".join([
127+
tool_name=f"{tool_namespace} - Analyze" if tool_namespace else "Analyze",
128+
tool_description="\n".join([
131129
"Execute Read-Only (SELECT) SQL over the provided datasets using fenic's SQL support.",
132130
"DDL/DML and multiple top-level queries are not allowed.",
133131
"For text search, prefer regular expressions (REGEXP_MATCHES()/REGEXP_EXTRACT()).",
@@ -142,14 +140,11 @@ def auto_generate_system_tools(
142140
return [schema_tool, profile_tool, read_tool, search_summary_tool, search_content_tool, analyze_tool]
143141

144142

145-
def build_datasets_from_tables(table_names: List[str], session: Session) -> List[DatasetSpec]:
143+
def _build_datasets_from_tables(table_names: List[str], session: Session) -> List[DatasetSpec]:
146144
"""Resolve catalog table names into DatasetSpec list with validated descriptions.
147145
148146
Raises ConfigurationError if any table is missing or lacks a non-empty description.
149147
"""
150-
if len(table_names) == 0:
151-
raise ConfigurationError("No tables provided for tool generation.")
152-
153148
missing_desc: List[str] = []
154149
missing_tables: List[str] = []
155150
specs: List[DatasetSpec] = []
@@ -398,8 +393,7 @@ async def schema_func(
398393

399394
for name, d in selected.items():
400395
# Build a single-row DataFrame with a common list<struct{column,type}> schema column
401-
schema_entries = [{"column": f.name, "type": str(convert_custom_dtype_to_polars(f.data_type))} for f in
402-
d.schema.column_fields]
396+
schema_entries = [{"column": f.name, "type": str(f.data_type)} for f in d.schema.column_fields]
403397
dataset_names.append(name)
404398
dataset_schemas.append(schema_entries)
405399

@@ -768,3 +762,27 @@ async def _compute_profile_rows(
768762
stats.string_example_values = sampled
769763
rows_list.append(stats)
770764
return rows_list
765+
766+
767+
def auto_generate_system_tools_from_tables(
768+
table_names: Optional[List[str]],
769+
session: Session,
770+
*,
771+
tool_namespace: Optional[str],
772+
max_result_limit: int = 100,
773+
) -> List[SystemTool]:
774+
"""Generate Schema/Profile/Read/Search [Content/Summary]/Analyze tools from catalog tables.
775+
776+
Validates that each table exists and has a non-empty description in catalog metadata.
777+
"""
778+
if table_names is None:
779+
table_names = session.catalog.list_tables()
780+
if not table_names:
781+
raise ConfigurationError("No tables provided for tool generation and no existing tables found in the catalog.")
782+
datasets = _build_datasets_from_tables(table_names, session)
783+
return _auto_generate_system_tools(
784+
datasets,
785+
session,
786+
tool_namespace=tool_namespace,
787+
max_result_limit=max_result_limit,
788+
)

src/fenic/api/mcp/server.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
"""
77
from typing import List, Optional
88

9+
from fenic.api.mcp._tool_generation_utils import auto_generate_system_tools_from_tables
910
from fenic.api.mcp.tools import (
10-
ToolGenerationConfig,
11-
auto_generate_system_tools_from_tables,
11+
SystemToolConfig,
1212
)
1313
from fenic.api.session.session import Session
1414
from fenic.core.error import ConfigurationError
@@ -21,7 +21,7 @@ def create_mcp_server(
2121
server_name: str,
2222
*,
2323
user_defined_tools: Optional[List[UserDefinedTool]] = None,
24-
system_tools: Optional[ToolGenerationConfig] = None,
24+
system_tools: Optional[SystemToolConfig] = None,
2525
concurrency_limit: int = 8,
2626
) -> FenicMCPServer:
2727
"""Create an MCP server from datasets and tools.
@@ -37,11 +37,13 @@ def create_mcp_server(
3737
generated_system_tools = []
3838
user_defined_tools = user_defined_tools or []
3939
if system_tools:
40-
generated_system_tools.extend(auto_generate_system_tools_from_tables(
41-
system_tools.table_names,
42-
session,
43-
tool_group_name=system_tools.tool_group_name,
44-
max_result_limit=system_tools.max_result_rows)
40+
generated_system_tools.extend(
41+
auto_generate_system_tools_from_tables(
42+
system_tools.table_names,
43+
session,
44+
tool_namespace=system_tools.tool_namespace,
45+
max_result_limit=system_tools.max_result_rows
46+
)
4547
)
4648
if not (user_defined_tools or system_tools):
4749
raise ConfigurationError("No tools provided. Either provide tools or set generate_automated_tools=True and provide datasets.")

src/fenic/api/mcp/tools.py

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,46 +14,67 @@
1414
from dataclasses import dataclass
1515
from typing import (
1616
List,
17+
Optional,
1718
)
1819

19-
from fenic.api.mcp._tool_generation_utils import (
20-
auto_generate_system_tools,
21-
build_datasets_from_tables,
22-
)
23-
from fenic.api.session.session import Session
24-
from fenic.core.mcp.types import SystemTool
25-
2620

2721
@dataclass
28-
class ToolGenerationConfig:
29-
"""Configuration for automated tool generation.
22+
class SystemToolConfig:
23+
"""Configuration for canonical system tools.
3024
31-
Attributes:
32-
table_names: List of table names.
33-
tool_group_name: Name of the tool group.
34-
max_result_rows: Maximum number of rows to be returned from Read/Analyze tools.
35-
"""
25+
fenic can automatically generate a set of canonical tools for operating on one or more fenic tables.
3626
37-
table_names: List[str]
38-
tool_group_name: str
39-
max_result_rows: int = 100
27+
- Schema: list columns/types for any or all tables
28+
- Profile: column statistics (counts, basic numeric analysis [min, max, mean, etc.], contextual information for text columns [average_length, etc.])
29+
- Read: read a selection of rows from a single table. These rows can be paged over, filtered and can use column projections.
30+
- Search Summary: regex search across all text columns in all tables -- returns dataframe names with result counts.
31+
- Search Content: regex search across a single table, specifying one or more text columns to search -- returns the rows matching the query.
32+
- Analyze: Write raw SQL to perform complex analysis on one or more tables.
4033
34+
Attributes:
35+
table_names: List of the fenic table names the tools should be able to access. If not provided, the generated tools will be able to access all tables in the catalog.
36+
tool_namespace: If provided, will prefix the names of the generated tools with this namespace value.
37+
For example, by default the generated tools will be named `read`, `profile`, etc. With multiple fenic
38+
MCP servers, these tool names will clash, which can be confusing. In order to disambiguate, the `tool_namespace`
39+
is prefixed to the tool name (in snake case), so a `tool_namespace` of `fenic` would create the tools `fenic_read`,
40+
`fenic_profile`, etc.
41+
max_result_rows: Maximum number of rows to be returned from Read/Analyze tools.
4142
42-
def auto_generate_system_tools_from_tables(
43-
table_names: List[str],
44-
session: Session,
45-
*,
46-
tool_group_name: str,
47-
max_result_limit: int = 100,
48-
) -> List[SystemTool]:
49-
"""Generate Schema/Profile/Read/Search/Analyze tools from catalog tables.
43+
Example:
44+
```python
45+
from fenic import SystemToolConfig
46+
from fenic.api.mcp.tools import SystemToolConfig
47+
from fenic.api.mcp.server import create_mcp_server
48+
from fenic.api.session.session import Session
49+
session = Session.get_or_create(...)
50+
df = session.create_dataframe({
51+
"c1": [1, 2, 3],
52+
"c2": [4, 5, 6]
53+
})
54+
df.write.save_as_table("table1", mode="overwrite")
55+
session.catalog.set_table_description("table1", "Table 1 Description")
56+
server = create_mcp_server(session, "Test Server", system_tools=SystemToolConfig(
57+
table_names=["table1"],
58+
tool_namespace="Auto",
59+
max_result_rows=100
60+
))
61+
```
5062
51-
Validates that each table exists and has a non-empty description in catalog metadata.
63+
Example: Allow generated tools to access all tables in the catalog. Equivalent to passing `table_names=session.catalog.list_tables()`
64+
```python
65+
from fenic import SystemToolConfig
66+
from fenic.api.mcp.tools import SystemToolConfig
67+
from fenic.api.mcp.server import create_mcp_server
68+
from fenic.api.session.session import Session
69+
session = Session.get_or_create(...)
70+
# Assuming you already have one or more tables saved to the catalog, with descriptions.
71+
server = create_mcp_server(session, "Test Server", system_tools=SystemToolConfig(
72+
tool_namespace="Auto",
73+
max_result_rows=100
74+
))
75+
```
5276
"""
53-
datasets = build_datasets_from_tables(table_names, session)
54-
return auto_generate_system_tools(
55-
datasets,
56-
session,
57-
tool_group_name=tool_group_name,
58-
max_result_limit=max_result_limit,
59-
)
77+
78+
table_names: Optional[List[str]] = None
79+
tool_namespace: Optional[str] = None
80+
max_result_rows: int = 100

0 commit comments

Comments
 (0)