11#!/usr/bin/env python3
22"""
3- TigerData Documentation LLM Generator
3+ Tiger Data Documentation LLM Generator
44
55This script generates a comprehensive llms-full.txt file for LLM training from
6- TigerData documentation. It processes all markdown files following the page-index.js
6+ Tiger Data documentation. It processes all markdown files following the page-index.js
77structure and applies various transformations.
88
99Features:
@@ -36,54 +36,141 @@ def __init__(self, docs_dir: str = "."):
3636 self .link_references = {}
3737 self .processed_files = set ()
3838
39- # Load variables from remote vars.js and add comprehensive mappings
39+ # Load variables aligned with vars.js from timescale/web-documentation
4040 self .variables = self ._load_comprehensive_variables ()
4141
4242 def _load_comprehensive_variables (self ) -> Dict [str , str ]:
43- """Load comprehensive variable mappings."""
43+ """Load comprehensive variable mappings aligned with vars.js."""
4444 return {
45- # General Variables
45+ # General Variables (aligned with vars.js)
4646 '$PRODUCT_PREFIX' : 'Tiger' ,
47- '$COMPANY' : 'TigerData' ,
47+ '$COMPANY' : 'Tiger Data' ,
4848 '$COMPANY_URL' : 'https://www.tigerdata.com' ,
4949 '$PG' : 'Postgres' ,
50-
51- # Pricing Variables
50+
51+ # Pricing Variables (aligned with vars.js)
5252 '$PRICING_PLAN_CAP' : 'Pricing plan' ,
5353 '$PRICING_PLAN' : 'pricing plan' ,
5454 '$SCALE' : 'Scale' ,
55- '$PERFORMANCE' : 'Performance' ,
55+ '$PERFORMANCE' : 'Performance' ,
5656 '$ENTERPRISE' : 'Enterprise' ,
57-
58- # Product Variables
59- '$CLOUD_LONG' : 'Tiger' ,
60- '$CLOUD_SHORT' : 'Tiger' ,
57+
58+ # Product Variables (aligned with vars.js)
59+ '$CLOUD_LONG' : 'Tiger Cloud' ,
6160 '$LAKE_LONG' : 'Tiger Lake' ,
6261 '$LAKE_SHORT' : 'Tiger Lake' ,
6362 '$TIMESCALE_DB' : 'TimescaleDB' ,
64- '$PRODUCTS_ALL' : 'TigerData products' ,
65- '$PRODUCTS_CL_DB' : 'Tiger and TimescaleDB' ,
63+ '$PRODUCTS_ALL' : 'Tiger Data products' ,
64+ '$PRODUCTS_CL_DB' : 'Tiger Cloud and TimescaleDB' ,
6665 '$TDB_APACHE' : 'TimescaleDB Apache 2 Edition' ,
6766 '$TDB_COMMUNITY' : 'TimescaleDB Community Edition' ,
68-
69- # Service Variables
70- '$SERVICE_LONG' : 'Tiger service' ,
67+
68+ # Self-hosted Variables (from vars.js)
69+ '$SELF_SHORT_CAP' : 'Self-hosted' ,
70+ '$SELF_SHORT' : 'self-hosted' ,
71+ '$SELF_LONG_CAP' : 'Self-hosted TimescaleDB' ,
72+ '$SELF_LONG' : 'self-hosted TimescaleDB' ,
73+
74+ # Console Variables (from vars.js)
75+ '$CONSOLE' : 'Tiger Cloud Console' ,
76+ '$CONSOLE_LONG' : 'Tiger Cloud Console' ,
77+ '$CONSOLE_SHORT' : 'Console' ,
78+
79+ # CLI Variables (from vars.js)
80+ '$CLI_LONG' : 'Tiger CLI' ,
81+ '$CLI_SHORT' : 'CLI' ,
82+
83+ # REST API Variables (from vars.js)
84+ '$REST_LONG' : 'Tiger REST API' ,
85+ '$REST_SHORT' : 'REST API' ,
86+
87+ # Eon Variables (from vars.js)
88+ '$EON_SHORT' : 'Eon' ,
89+ '$EON_LONG' : 'Tiger Eon' ,
90+
91+ # MCP Variables (from vars.js)
92+ '$MCP_LONG' : 'Tiger Model Context Protocol Server' ,
93+ '$MCP_SHORT' : 'Tiger MCP Server' ,
94+
95+ # Agents Variables (from vars.js)
96+ '$AGENTS_LONG' : 'Tiger Agents for Work' ,
97+ '$AGENTS_SHORT' : 'Tiger Agent' ,
98+ '$AGENTS_CLI' : 'Tiger Agent CLI' ,
99+
100+ # Connector Variables (from vars.js)
101+ '$S3_CONNECTOR' : 'source S3 connector' ,
102+ '$S3_CONNECTOR_CAP' : 'Source S3 connector' ,
103+ '$PG_CONNECTOR' : 'source Postgres connector' ,
104+ '$PG_CONNECTOR_CAP' : 'Source Postgres connector' ,
105+
106+ # Additional Tool Variables (from vars.js)
107+ '$SQL_ASSISTANT_LONG' : 'Tiger Cloud SQL assistant' ,
108+ '$CLOUD_EDITOR' : 'Tiger Cloud SQL editor' ,
109+ '$SKIPSCAN_LONG' : 'Tiger Data SkipScan' ,
110+ '$SKIPSCAN_SHORT' : 'SkipScan' ,
111+ '$PGAI_LONG' : 'pgai on Tiger Data' ,
112+ '$PGAI_SHORT' : 'pgai' ,
113+ '$PGVECTORSCALE' : 'pgvectorscale' ,
114+ '$PG_SPOT' : 'pgspot' ,
115+ '$PG_VECTORIZER' : 'PgVectorizer' ,
116+
117+ # URL Variables (from vars.js)
118+ '$CONSOLE_URL' : 'https://console.cloud.timescale.com/' ,
119+ '$MST_CONSOLE_URL' : 'https://portal.managed.timescale.com/' ,
120+ '$POPSQL_URL' : 'https://popsql.com/' ,
121+ '$WEBSITE_MARKETING' : 'www.tigerdata.com' ,
122+ '$WEBSITE_DOCS' : 'docs.tigerdata.com/' ,
123+ '$CONTACT_SALES' :
'[email protected] ' ,
124+ '$CONTACT_COMPANY' : 'https://www.tigerdata.com/contact/' ,
125+
126+ # Project Variables (from vars.js)
127+ '$PROJECT_LONG' : 'Tiger Cloud project' ,
128+ '$PROJECT_SHORT_CAP' : 'Project' ,
129+ '$ACCOUNT_SHORT' : 'account' ,
130+
131+ # Service Variables (from vars.js)
132+ '$TIGER_POSTGRES' : 'Tiger Postgres' ,
133+ '$SERVICE_SHORT_CAP' : 'Service' ,
134+ '$MST_SERVICE_LONG' : 'Managed Service for TimescaleDB service' ,
135+ '$MST_SERVICE_SHORT' : 'MST service' ,
136+
137+ # Feature Variables (from vars.js)
138+ '$HYPERTABLE_CAP' : 'Hypertable' ,
139+ '$HYPERCORE_CAP' : 'Hypercore' ,
140+ '$ROWSTORE_CAP' : 'Rowstore' ,
141+ '$COLUMNSTORE_CAP' : 'Columnstore' ,
142+ '$CHUNK_CAP' : 'Chunk' ,
143+ '$CHUNK_SKIPPING_CAP' : 'Chunk skipping' ,
144+ '$CHUNK_SKIPPING' : 'chunk skipping' ,
145+ '$MAT_HYPERTABLE_CAP' : 'Materialized hypertable' ,
146+ '$MAT_HYPERTABLE' : 'materialized hypertable' ,
147+ '$CAGG_CAP' : 'Continuous aggregate' ,
148+ '$RTAGG_CAP' : 'Real-time aggregate' ,
149+ '$RTAGG' : 'real-time aggregate' ,
150+ '$TIME_BUCKET_CAP' : 'Time bucket' ,
151+ '$HA_REPLICA_CAP' : 'High-availability replica' ,
152+ '$HA_REPLICA_SHORT' : 'HA replica' ,
153+ '$READ_REPLICA_CAP' : 'Read replica' ,
154+ '$JOB_CAP' : 'Job' ,
155+ '$PAR_COPY_CAP' : 'Parallel copy' ,
156+ '$PAR_COPY' : 'parallel copy' ,
157+ '$OPS_MODE_CAP' : 'Ops mode' ,
158+ '$DATA_MODE_CAP' : 'Data mode' ,
159+
160+ # Legacy Service Variables (keeping for compatibility)
161+ '$SERVICE_LONG' : 'Tiger Cloud service' ,
71162 '$SERVICE_SHORT' : 'service' ,
72163 '$MST_LONG' : 'Managed Service for TimescaleDB' ,
73164 '$MST_SHORT' : 'MST' ,
74165 '$MST_SERVICE_SHORT' : 'service' ,
75166 '$MST_SERVICE_LONG' : 'service' ,
76167 '$MST_CONSOLE_SHORT' : 'MST Console' ,
77- '$CONSOLE' : 'Console' ,
78- '$CONSOLE_LONG' : 'Tiger Console' ,
79- '$CONSOLE_SHORT' : 'Console' ,
80- '$SELF_LONG' : 'self-hosted TimescaleDB' ,
81- '$SELF_SHORT' : 'self-hosted TimescaleDB' ,
168+ '$MST_CONSOLE_LONG' : 'MST Console' ,
82169
83170 # Feature Variables
84171 '$HYPERTABLE' : 'hypertable' ,
85172 '$HYPERTABLES' : 'hypertables' ,
86- '$HYPERCORE' : 'Hypercore ' ,
173+ '$HYPERCORE' : 'hypercore ' ,
87174 '$COLUMNSTORE' : 'columnstore' ,
88175 '$ROWSTORE' : 'rowstore' ,
89176 '$CHUNK' : 'chunk' ,
@@ -123,13 +210,10 @@ def _load_comprehensive_variables(self) -> Dict[str, str]:
123210 '$DOCS' : 'docs' ,
124211
125212 # Account & Project Variables
126- '$ACCOUNT_LONG' : 'TigerData account' ,
213+ '$ACCOUNT_LONG' : 'Tiger Data account' ,
127214 '$PROJECT_SHORT' : 'project' ,
128- '$JOB' : 'job' ,
129215 '$SOURCE' : 'source' ,
130216 '$TARGET' : 'target' ,
131- '$VPC' : 'VPC' ,
132- '$DATA_MODE' : 'data mode' ,
133217
134218 # Tool Variables
135219 '$TOOLKIT_LONG' : 'TimescaleDB Toolkit' ,
@@ -141,20 +225,30 @@ def _load_comprehensive_variables(self) -> Dict[str, str]:
141225 '$TIGER_POSTGRES' : 'TimescaleDB' ,
142226 '$POSTGRESQL' : 'PostgreSQL' ,
143227
144- # Additional Variables
145- '$OPS_MODE' : 'operations mode' ,
228+ # Additional Variables (aligned with vars.js)
229+ '$OPS_MODE' : 'ops mode' ,
146230 '$SQL_EDITOR' : 'SQL editor' ,
147- '$MST_CONSOLE_LONG' : 'MST Console' ,
148231 '$POPSQL' : 'PopSQL' ,
149- '$ACCOUNT_SHORT' : 'account' ,
150- '$PROJECT_LONG' : 'TigerData project' ,
151232 '$HA_REPLICA' : 'high availability replica' ,
152- '$TIME_BUCKET' : 'time_bucket ' ,
233+ '$TIME_BUCKET' : 'time bucket ' ,
153234 '$BODY' : 'body' ,
154235 '$__' : '_' ,
155236 '$SERVICE_URL_WITH_PORT' : 'service URL with port' ,
156- '$IO_BOOST' : 'IO boost' ,
237+ '$IO_BOOST' : 'I/O boost' ,
157238 '$DB_NAME' : 'database name' ,
239+ '$VPC' : 'VPC' ,
240+
241+ # Missing variables from vars.js
242+ '$JOB' : 'job' ,
243+ '$CAGG' : 'continuous aggregate' ,
244+ '$RTAGG' : 'real-time aggregate' ,
245+ '$TIME_BUCKET' : 'time bucket' ,
246+ '$HA_REPLICA' : 'high-availability replica' ,
247+ '$READ_REPLICA' : 'read replica' ,
248+ '$PAR_COPY' : 'parallel copy' ,
249+ '$CHUNK_SKIPPING' : 'chunk skipping' ,
250+ '$MAT_HYPERTABLE' : 'materialized hypertable' ,
251+ '$DATA_MODE' : 'data mode' ,
158252
159253 # Compound Variables (mixed case patterns)
160254 'Hypercore_CAP' : 'Hypercore' ,
@@ -384,9 +478,28 @@ def process_frontmatter_and_title(self, content: str) -> str:
384478
385479 def replace_variables (self , content : str ) -> str :
386480 """Replace $VARIABLES with their actual values, including pluralized forms."""
387- # First handle exact matches
388- for var , replacement in self .variables .items ():
389- content = content .replace (var , replacement )
481+ import re
482+
483+ # First handle template literal style ${VARIABLE} patterns
484+ def replace_template_literal (match ):
485+ var_name = match .group (1 )
486+ full_var = f'${ var_name } '
487+ if full_var in self .variables :
488+ return self .variables [full_var ]
489+ return match .group (0 ) # Return original if not found
490+
491+ # Replace ${VARIABLE} patterns
492+ content = re .sub (r'\$\{([A-Z0-9_]+)\}' , replace_template_literal , content )
493+
494+ # Then handle exact $VARIABLE matches using a regex with a trailing lookahead (not \b word boundaries)
495+ def replace_exact_variable (match ):
496+ var_name = match .group (0 )
497+ if var_name in self .variables :
498+ return self .variables [var_name ]
499+ return var_name
500+
501+ # Replace $VARIABLE patterns with proper boundaries to avoid partial matches
502+ content = re .sub (r'\$[A-Z0-9_]+(?=\s|[^A-Z0-9_]|$)' , replace_exact_variable , content )
390503
391504 # Then handle pluralized variables (e.g., $HYPERTABLE_CAPs -> hypertables)
392505 import re
@@ -411,13 +524,13 @@ def replace_pluralized(match):
411524 return match .group (0 ) # Return original if no base variable found
412525
413526 # Pattern to match $VARIABLE_CAP + suffix (like s, ed, ing, etc.)
414- content = re .sub (r'\$([A-Z_ ]+)_CAP([a-z]+)' , replace_pluralized , content )
415-
527+ content = re .sub (r'\$([A-Z0-9_ ]+)_CAP([a-z]+)' , replace_pluralized , content )
528+
416529 # Also handle direct pluralization like $HYPERTABLEs
417530 def replace_direct_plural (match ):
418531 var_name = match .group (1 )
419532 suffix = match .group (2 )
420-
533+
421534 # Look up the base variable
422535 base_var = f'${ var_name } '
423536 if base_var in self .variables :
@@ -432,9 +545,9 @@ def replace_direct_plural(match):
432545 else :
433546 return base_replacement + suffix .lower ()
434547 return match .group (0 )
435-
548+
436549 # Pattern to match $VARIABLE + suffix
437- content = re .sub (r'\$([A-Z_ ]+)([a-z]+)' , replace_direct_plural , content )
550+ content = re .sub (r'\$([A-Z0-9_ ]+)([a-z]+)' , replace_direct_plural , content )
438551
439552 return content
440553
@@ -758,7 +871,7 @@ def should_exclude_file(self, file_path: Path) -> bool:
758871
759872 def generate_documentation (self ) -> str :
760873 """Generate the complete documentation."""
761- print ("Starting TigerData documentation generation..." )
874+ print ("Starting Tiger Data documentation generation..." )
762875
763876 # Get ordered sections from main page index
764877 ordered_sections = self .parse_main_page_index ()
@@ -819,7 +932,7 @@ def save_documentation(self, content: str, output_file: str = "llms-full.txt"):
819932
820933def main ():
821934 """Main function to generate the documentation."""
822- print ("TigerData Documentation LLM Generator" )
935+ print ("Tiger Data Documentation LLM Generator" )
823936 print ("=" * 50 )
824937
825938 # Create generator instance
0 commit comments