Commit 392169d

Address more PR comments; add the ability for dynamic tools to set hints for the MCP server; improve tests; improve the documentation of the model-facing tools.

1 parent: b406a38

File tree

9 files changed: +207 -118 lines changed

src/fenic/__init__.py

Lines changed: 0 additions & 2 deletions

@@ -31,7 +31,6 @@
     asc_nulls_first,
     asc_nulls_last,
     async_udf,
-    auto_generate_core_tools_from_tables,
     avg,
     coalesce,
     col,
@@ -231,7 +230,6 @@
     "ParameterizedToolDefinition",
     "DynamicToolDefinition",
     "ToolGenerationConfig",
-    "auto_generate_core_tools_from_tables",
     "create_mcp_server",
     "run_mcp_server_asgi",
     "run_mcp_server_async",

src/fenic/_backends/local/catalog.py

Lines changed: 1 addition & 9 deletions

@@ -527,7 +527,7 @@ def create_tool(
         # Ensure the tool is valid by resolving it.
         tool_definition = bind_tool(tool_name, tool_description, tool_params, result_limit, tool_query)
         cursor = self.db_conn.cursor()
-        if self._does_tool_exist(cursor, tool_name):
+        if self.system_tables.describe_tool(cursor, tool_name):
             if ignore_if_exists:
                 return False
             raise ToolAlreadyExistsError(tool_name)
@@ -686,14 +686,6 @@ def get_metrics_for_session(self, session_id: str) -> Dict[str, float]:
         """Get metrics for a specific session from the metrics system read-only table."""
         return self.system_tables.get_metrics_for_session(self.db_conn.cursor(), session_id)
 
-    def _does_tool_exist(self, cursor: duckdb.DuckDBPyConnection, tool_name: str) -> bool:
-        try:
-            return self.system_tables.describe_tool(cursor, tool_name) is not None
-        except Exception as e:
-            raise CatalogError(
-                f"Failed to check if tool: {tool_name} exists"
-            ) from e
-
     def _does_table_exist(self, cursor: duckdb.DuckDBPyConnection, table_identifier: TableIdentifier) -> bool:
         try:
             return cursor.execute(

src/fenic/api/mcp/tool_generation.py

Lines changed: 85 additions & 67 deletions
@@ -96,15 +96,29 @@ def fenic_tool(
     tool_description: str,
     max_result_limit: Optional[int] = None,
     default_table_format: TableFormat = "markdown",
-):
+    read_only: bool = True,
+    idempotent: bool = True,
+    destructive: bool = False,
+    open_world: bool = False,
+) -> Callable[[Callable[..., DataFrame]], DynamicToolDefinition]:
     """Decorator to bind a DataFrame to a user-authored tool function.
 
+    Args:
+        tool_name: The name of the tool.
+        tool_description: The description of the tool.
+        max_result_limit: The maximum number of results to return.
+        default_table_format: The default table format to return.
+        read_only: A hint to provide to the model that the tool does not modify its environment.
+        idempotent: A hint to provide to the model that calling the tool multiple times with the same input will always return the same result (redundant if read_only is True).
+        destructive: A hint to provide to the model that the tool may destructively modify its environment.
+        open_world: A hint to provide to the model that the tool may interact with an "open world" of external entities outside of the MCP server's environment.
+
     Example:
         @dynamic_tool(tool_name="find_rust", tool_description="...")
         def find_rust(
             query: Annotated[str, "Natural language query"],
         ) -> DataFrame:
-            pred = fc.semantic.predicate("Matches: {{q}} Data: {{bio}}", q=query, bio=fc.col("bio"))
+            pred = fc.semantic.predicate("Matches: {{q}} Data: {{bio}}", q=fc.lit(query), bio=fc.col("bio"))
             return df.filter(pred)
 
         mcp_server = fc.create_mcp_server(
@@ -114,6 +128,14 @@ def find_rust(
         )
         fc.run_mcp_server_sync(mcp_server)
 
+    Example: Creating an open-world tool that reaches out to an external API. The open_world flag indicates to the model that the tool may interact with an "open world" of external entities.
+        @fenic_tool(tool_name="search_knowledge_base", tool_description="...", open_world=True)
+        def search_knowledge_base(
+            query: Annotated[str, "Knowledge base search query"],
+        ) -> DataFrame:
+            results = requests.get(...)
+            return fc.create_dataframe(results)
+
     Notes:
         - The decorated function MUST NOT use *args/**kwargs
         - The decorated function MUST return a fenic DataFrame.
@@ -136,6 +158,10 @@ def wrapper(*args, **kwargs) -> LogicalPlan:
         description=tool_description,
         max_result_limit=max_result_limit,
         default_table_format=default_table_format,
+        read_only=read_only,
+        idempotent=idempotent,
+        destructive=destructive,
+        open_world=open_world,
         _func=wrapper,
     )
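Note: a minimal usage sketch of the new hints, not taken from the commit; the tool name, the df the function closes over, and the top-level import path of fenic_tool are illustrative assumptions.

from typing import Annotated

import fenic as fc
from fenic.api.mcp.tool_generation import fenic_tool  # assumed import path, per this file

# Illustrative tool: all four flags are forwarded to DynamicToolDefinition,
# which surfaces them to the MCP server as model-facing hints.
@fenic_tool(
    tool_name="find_open_tickets",
    tool_description="Find open support tickets matching a natural language query.",
    read_only=True,      # hint: never modifies its environment
    idempotent=True,     # hint: same input -> same result (redundant with read_only)
    destructive=False,   # hint: no destructive side effects
    open_world=False,    # hint: stays within the MCP server's environment
)
def find_open_tickets(
    query: Annotated[str, "Natural language query"],
) -> fc.DataFrame:
    # df is an existing DataFrame the tool closes over, as in the docstring example.
    pred = fc.semantic.predicate("Matches: {{q}} Data: {{text}}", q=fc.lit(query), text=fc.col("text"))
    return df.filter(pred)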

@@ -165,10 +191,12 @@
     name_to_df: Dict[str, DataFrame] = {d.table_name: d.df for d in datasets}
     def read_func(
         df_name: Annotated[str, "Dataset name to read rows from."],
-        limit: Annotated[Optional[Union[int, str]], "Max rows to read"] = None,
+        limit: Annotated[Optional[Union[int, str]], "Max rows to read within a page"] = result_limit,
         offset: Annotated[Optional[Union[int, str]], "Row offset to start from (requires order_by)"] = None,
         order_by: Annotated[Optional[str], "Comma separated list of columns to order by (required for offset)"] = None,
         sort_ascending: Annotated[Optional[Union[bool, str]], "Sort ascending for all order_by columns"] = True,
+        include_columns: Annotated[Optional[str], "Comma separated list of columns to include in the result"] = None,
+        exclude_columns: Annotated[Optional[str], "Comma separated list of columns to exclude from the result"] = None,
     ) -> LogicalPlan:
 
         if df_name not in name_to_df:
@@ -178,16 +206,12 @@ def read_func(
         offset = int(offset) if isinstance(offset, str) else offset
         sort_ascending = bool(sort_ascending) if isinstance(sort_ascending, str) else sort_ascending
         order_by = [c.strip() for c in order_by.split(",") if c.strip()] if order_by else None
-
-        # order_by when not paginating via OFFSET (to avoid double sorting)
-        if order_by and offset is None:
-            missing_order = [c for c in order_by if c not in df.columns]
-            if missing_order:
-                raise ValidationError(
-                    f"order_by column(s) {missing_order} do not exist in DataFrame. Available columns: {', '.join(df.columns)}"
-                )
-            df = df.order_by(order_by, ascending=sort_ascending)
-
+        include_columns = [c.strip() for c in include_columns.split(",") if c.strip()] if include_columns else None
+        exclude_columns = [c.strip() for c in exclude_columns.split(",") if c.strip()] if exclude_columns else None
+        if include_columns:
+            df = df.select(*include_columns)
+        if exclude_columns:
+            df = df.select(*[c for c in df.columns if c not in exclude_columns])
         # Apply paging (handles offset+order_by via SQL and optional limit)
         return _apply_paging(
             df,
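Note: a hypothetical invocation of the generated Read tool (dataset and column names are invented), showing how the comma-separated parameters above are meant to be combined.

# Hypothetical MCP arguments for the generated Read tool.
read_args = {
    "df_name": "orders",
    "order_by": "created_at",                           # required because offset is set
    "sort_ascending": False,
    "limit": 50,                                        # page size; defaults to result_limit
    "offset": 50,                                       # page 2
    "include_columns": "order_id, created_at, status",  # split and stripped server-side
}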
@@ -203,14 +227,9 @@
         description=tool_description,
         _func=read_func,
         max_result_limit=result_limit,
+        add_limit_parameter=False,
     )
 
-"""
-Replace single search generator with two split generators:
-- auto_generate_search_summary_tool
-- auto_generate_search_content_tool
-"""
-
 def _auto_generate_search_summary_tool(
     datasets: List[DatasetSpec],
     session: Session,
@@ -266,18 +285,17 @@ def auto_generate_search_content_tool(
 
     def _string_columns(df: DataFrame, selected: Optional[List[str]]) -> List[str]:
         if selected:
-            selected_columns = [c.strip() for c in selected.split(",") if c.strip()]
-            missing = [c for c in selected_columns if c not in df.columns]
+            missing = [c for c in selected if c not in df.columns]
             if missing:
                 raise ValidationError(f"Column(s) {missing} not found. Available: {', '.join(df.columns)}")
-            return selected_columns
+            return selected
         return [f.name for f in df.schema.column_fields if f.data_type == StringType]
 
     def search_rows(
         df_name: Annotated[str, "Dataset name to search (single dataset)"],
         pattern: Annotated[str, "Regex pattern to search for (use (?i) for case-insensitive)."],
-        limit: Annotated[Optional[Union[int, str]], "Max rows to return (accepts number or numeric string)"] = None,
-        offset: Optional[Union[int, str]] = None,
+        limit: Annotated[Optional[Union[int, str]], "Max rows to read within a page of search results"] = result_limit,
+        offset: Annotated[Optional[Union[int, str]], "Row offset to start from (requires order_by)"] = None,
         order_by: Annotated[Optional[str], "Comma separated list of column names to order by (required with offset)"] = None,
         sort_ascending: Annotated[Optional[Union[bool, str]], "Sort ascending"] = True,
         search_columns: Annotated[Optional[str], "Comma separated list of column names to search within; if omitted, matches in any string column will be returned. Use this to query only specific columns in the search as needed."] = None,
@@ -317,9 +335,9 @@ def search_rows(
         description=tool_description,
         _func=search_rows,
         max_result_limit=result_limit,
+        add_limit_parameter=False,
     )
 
-
 def _auto_generate_schema_tool(
     datasets: List[DatasetSpec],
     session: Session,
@@ -386,8 +404,6 @@ def schema_func(
         max_result_limit=None,
     )
 
-
-
 def _auto_generate_sql_tool(
     datasets: List[DatasetSpec],
     session: Session,
@@ -405,17 +421,10 @@ def _auto_generate_sql_tool(
     if len(datasets) == 0:
         raise ConfigurationError("Cannot create SQL tool: no datasets provided.")
 
-    def _assert_full_sql_shape(sql_text: str) -> None:
-        text = sql_text.strip().lower()
-        if not text.startswith("select"):
-            raise ValidationError("Only SELECT is allowed in full_sql")
-
     def analyze_func(
         full_sql: Annotated[str, "Full SELECT SQL. Refer to DataFrames by name in braces, e.g., {orders}."]
     ) -> LogicalPlan:
-        sql_text = full_sql.strip()
-        _assert_full_sql_shape(sql_text)
-        return session.sql(sql_text, **{spec.table_name: spec.df for spec in datasets})._logical_plan
+        return session.sql(full_sql.strip(), **{spec.table_name: spec.df for spec in datasets})._logical_plan
 
     # Enhanced description with dataset names and descriptions
     lines: List[str] = [tool_description.strip(), "", "Datasets available:"]
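Note: a hedged sketch, not from the commit, of how analyze_func resolves brace placeholders; employees_df is an invented DataFrame and the query is illustrative.

# Each {name} in the SQL text is bound to the DataFrame passed under that keyword,
# mirroring session.sql(full_sql.strip(), **{spec.table_name: spec.df ...}) above.
plan = session.sql(
    "SELECT dept, COUNT(*) AS n FROM {employees} GROUP BY dept ORDER BY n DESC",
    employees=employees_df,
)._logical_plan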
@@ -430,16 +439,15 @@ def analyze_func(
     example_name = "data"
     lines.extend(
         [
-            "",
-            "Notes:",
-            "- SQL dialect: DuckDB.",
-            "- For text search, prefer regular expressions using REGEXP_MATCHES().",
-            "- Paging: use ORDER BY to define row order, then LIMIT and OFFSET for pages.",
-            "",
-            "Examples:",  # nosec B608 - example text only
-            f"- SELECT * FROM {{{example_name}}} WHERE REGEXP_MATCHES(message, '(?i)error|fail') LIMIT 100",  # nosec B608 - example text only
-            f"- SELECT dept, COUNT(*) AS n FROM {{{example_name}}} WHERE status = 'active' GROUP BY dept HAVING n > 10 ORDER BY n DESC LIMIT 100",  # nosec B608 - example text only
-            f"- -- Paging: page 2 of size 50\n SELECT * FROM {{{example_name}}} ORDER BY created_at DESC LIMIT 50 OFFSET 50",  # nosec B608 - example text only
+            "\n\nNotes:\n",
+            "- SQL dialect: DuckDB.\n",
+            "- For text search, prefer regular expressions using REGEXP_MATCHES().\n",
+            "- Paging: use ORDER BY to define row order, then LIMIT and OFFSET for pages.\n",
+            f"- Returns a maximum of {result_limit} rows.\n",
+            "Examples:\n",  # nosec B608 - example text only
+            f"- SELECT * FROM {example_name} WHERE REGEXP_MATCHES(message, '(?i)error|fail') LIMIT {result_limit}",  # nosec B608 - example text only
+            f"- SELECT dept, COUNT(*) AS n FROM {example_name} WHERE status = 'active' GROUP BY dept HAVING n > 10 ORDER BY n DESC LIMIT {result_limit}",  # nosec B608 - example text only
+            f"- Paging: page 2 of size {result_limit}\n SELECT * FROM {example_name} ORDER BY created_at DESC LIMIT {result_limit} OFFSET {result_limit}",  # nosec B608 - example text only
         ]
     )
     enhanced_description = "\n".join(lines)
@@ -449,6 +457,7 @@ def analyze_func(
         description=enhanced_description,
         _func=analyze_func,
         max_result_limit=result_limit,
+        add_limit_parameter=False,
     )
     return tool
@@ -473,35 +482,41 @@
     order_by: list[str] | None,
     sort_ascending: bool | None,
 ) -> LogicalPlan:
-    """Apply deterministic paging semantics: ORDER BY + LIMIT/OFFSET via SQL fallback.
+    """Apply ordering, limit, and offset via a single SQL statement.
 
-    - If offset is provided, order_by must also be provided; performs SQL-based ORDER BY LIMIT OFFSET.
-    - If only limit is provided, uses DataFrame.limit.
-    - Otherwise, returns the original plan.
+    - If offset is provided, order_by must also be provided to ensure deterministic paging.
+    - Validates that all order_by columns exist.
+    - Builds: SELECT * FROM {src} [ORDER BY ...] [LIMIT N] [OFFSET M]
+    - When no ordering/limit/offset are provided, returns the original plan.
     """
-    if offset is not None:
-        if not order_by:
-            raise ValidationError("offset requires order_by to ensure deterministic paging.")
+    if order_by:
         missing_order = [c for c in order_by if c not in df.columns]
         if missing_order:
             raise ValidationError(
                 f"order_by column(s) {missing_order} do not exist in DataFrame. Available columns: {', '.join(df.columns)}"
             )
-        direction = "ASC" if (sort_ascending is None or sort_ascending) else "DESC"
+
+    if offset is not None and not order_by:
+        raise ValidationError("offset requires order_by to ensure deterministic paging.")
+
+    if order_by is None and limit is None and offset is None:
+        return df._logical_plan
+
+    direction = "ASC" if (sort_ascending is None or sort_ascending) else "DESC"
+    lim_val = None if limit is None else int(str(limit))
+    off_val = None if offset is None else int(str(offset))
+
+    base_sql = "SELECT * FROM {src}"
+    if order_by:
         safe_order_by = ", ".join(order_by)
-        lim_val = None if limit is None else int(str(limit))
-        off_val = int(str(offset))
-        base_sql = "SELECT * FROM {src} ORDER BY " + safe_order_by + f" {direction}"  # nosec B608: little to no SQL injection risk as this is running on limited user-provided dataframe.
-        if lim_val is not None:
-            base_sql += f" LIMIT {lim_val}"
+        base_sql += " ORDER BY " + safe_order_by + f" {direction}"  # nosec B608
+    if lim_val is not None:
+        base_sql += f" LIMIT {lim_val}"
+    if off_val is not None:
         base_sql += f" OFFSET {off_val}"
-        df_with_paging = session.sql(base_sql, src=df)
-        return df_with_paging._logical_plan
-
-    if limit is not None:
-        return df.limit(int(str(limit)))._logical_plan
 
-    return df._logical_plan
+    df_with_paging = session.sql(base_sql, src=df)
+    return df_with_paging._logical_plan
 
 
 def _auto_generate_profile_tool(
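Note: a standalone sketch (not the library function) mirroring the SQL assembly above, useful for seeing exactly what statement a page request produces.

def build_paging_sql(order_by, sort_ascending, limit, offset):
    # Mirrors _apply_paging's string assembly: optional ORDER BY, LIMIT, OFFSET.
    sql = "SELECT * FROM {src}"
    if order_by:
        direction = "ASC" if (sort_ascending is None or sort_ascending) else "DESC"
        sql += " ORDER BY " + ", ".join(order_by) + f" {direction}"
    if limit is not None:
        sql += f" LIMIT {int(str(limit))}"
    if offset is not None:
        sql += f" OFFSET {int(str(offset))}"
    return sql

# Page 2 of size 50, newest first:
assert build_paging_sql(["created_at"], False, 50, 50) == \
    "SELECT * FROM {src} ORDER BY created_at DESC LIMIT 50 OFFSET 50"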
@@ -682,6 +697,9 @@ def _auto_generate_core_tools(
 
     - Schema: list columns/types for any or all datasets
     - Profile: dataset statistics for any or all datasets
+    - Read: read rows from a single dataset to sample the data
+    - Search Summary: regex search across all datasets and return a summary of the number of matches per dataset
+    - Search Content: return matching rows from a single dataset using regex matching across string columns
     - Analyze: DuckDB SELECT-only SQL across datasets
     """
     group_desc = "; ".join(
@@ -715,7 +733,7 @@
         session,
         tool_name=f"{tool_group_name} - Read",
         tool_description="\n\n".join([
-            "Read single dataset rows: subset columns, limit, offset, order_by, sort_ascending. ",
+            "Read rows from a single dataset. Use to sample data, or to execute simple queries over the data that do not require filtering or grouping.",
             "Available datasets:\n",
             group_desc,
         ]),
@@ -727,7 +745,7 @@
         session,
         tool_name=f"{tool_group_name} - Search Summary",
         tool_description="\n\n".join([
-            "Perform a substring/regex search across all datasets and return a summary of the number of matches per dataset. ",
+            "Perform a substring/regex search across all datasets and return a summary of the number of matches per dataset.",
             "Available datasets:\n",
             group_desc,
         ]),
@@ -738,7 +756,7 @@
         tool_name=f"{tool_group_name} - Search Content",
         tool_description="\n\n".join([
             "Return matching rows from a single dataset using substring/regex across string columns.",
-            "Available datasets:\n",
+            "Available datasets:",
             group_desc,
         ]),
         result_limit=sql_max_rows,
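Note: a usage illustration for the split search tools (argument values are hypothetical). Search Summary takes a pattern and reports per-dataset match counts; Search Content returns matching rows from one dataset.

# Hypothetical MCP arguments for the two generated search tools.
summary_args = {"pattern": "(?i)error|timeout"}           # scans every dataset
content_args = {
    "df_name": "app_logs",                                # single dataset
    "pattern": "(?i)timeout",
    "search_columns": "message, stack_trace",             # restrict the scan
    "limit": 25,
}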

src/fenic/core/_utils/misc.py

Lines changed: 11 additions & 0 deletions

@@ -1,3 +1,4 @@
+import re
 import uuid
 
 
@@ -34,3 +35,13 @@ def generate_unique_arrow_view_name() -> str:
     'temp_arrow_view_1a2b3c4d5e6f...'
     """
     return f"temp_arrow_view_{uuid.uuid4().hex}"
+
+def to_snake_case(name: str) -> str:
+    result = name
+    return "_".join(
+        re.sub(
+            "([A-Z][a-z]+)",
+            r" \1",
+            re.sub("([A-Z]+)", r" \1", result.replace("-", " ")),
+        ).split()
+    ).lower()
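Note: quick checks of the helper's behavior, with expected outputs worked out from the regexes above (these are not test cases from the commit).

assert to_snake_case("MyHTTPServer") == "my_http_server"     # splits acronym runs from TitleCase words
assert to_snake_case("ToolGroup-Name") == "tool_group_name"  # hyphens become word separators
assert to_snake_case("already_snake") == "already_snake"     # no-op on snake_case input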
