Commit 9a1c021

Introduce tokenizing options for full and partial mode (#1669)
Add a tokenizing mode to the tokenize method:
- Full: the full text to tokenize is provided
- Partial: only a portion of the text to tokenize is provided

In indentation-aware lexing, dedents are not auto-completed in partial mode.
1 parent 51d99a6 commit 9a1c021
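
For illustration, a minimal sketch of the two modes. It assumes an IndentationAwareLexer instance (see the service wiring sketch further below); the input string is arbitrary:

// Assumption: `lexer` is services.parser.Lexer from services configured
// with the indentation-aware token builder and lexer.
const text = 'if true:\n    return true';

// Full mode (the default): the input is a complete document, so the lexer
// appends synthetic DEDENT tokens for every indentation level still open at EOF.
const full = lexer.tokenize(text); // equivalent to passing { mode: 'full' }

// Partial mode: the input is only a fragment (e.g. the text up to the cursor),
// so no closing DEDENT tokens are synthesized at the end.
const partial = lexer.tokenize(text, { mode: 'partial' });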

File tree (7 files changed: +84 / -36 lines)

packages/langium/src/parser/indentation-aware.ts
packages/langium/src/parser/langium-parser.ts
packages/langium/src/parser/lexer.ts
packages/langium/src/parser/token-builder.ts
packages/langium/src/validation/document-validator.ts
packages/langium/src/validation/validation-registry.ts
packages/langium/test/parser/indentation-aware.test.ts

packages/langium/src/parser/indentation-aware.ts

Lines changed: 20 additions & 19 deletions

@@ -7,11 +7,11 @@
 import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition, TokenVocabulary } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
 import type { LexingReport, TokenBuilderOptions } from './token-builder.js';
-import type { LexerResult } from './lexer.js';
+import type { LexerResult, TokenizeOptions } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
 import { createToken, createTokenInstance, Lexer } from 'chevrotain';
 import { DefaultTokenBuilder } from './token-builder.js';
-import { DefaultLexer, isTokenTypeArray } from './lexer.js';
+import { DEFAULT_TOKENIZE_OPTIONS, DefaultLexer, isTokenTypeArray } from './lexer.js';
 
 type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName];
 
@@ -179,11 +179,11 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         }
     }
 
-    override popLexingReport(text: string): IndentationLexingReport {
-        const result = super.popLexingReport(text);
+    override flushLexingReport(text: string): IndentationLexingReport {
+        const result = super.flushLexingReport(text);
         return {
             ...result,
-            remainingDedents: this.popRemainingDedents(text),
+            remainingDedents: this.flushRemainingDedents(text),
         };
     }
 
@@ -203,9 +203,12 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      *
      * @param text The full input string.
      * @param offset The current position at which to attempt a match
+     * @param tokens Previously scanned tokens
+     * @param groups Token Groups
      * @returns The current and previous indentation levels and the matched whitespace
      */
-    protected matchWhitespace(text: string, offset: number, _tokens: IToken[], _groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    protected matchWhitespace(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
         this.whitespaceRegExp.lastIndex = offset;
         const match = this.whitespaceRegExp.exec(text);
         return {
@@ -251,12 +254,10 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      *
      * @param text The full input string.
      * @param offset The offset at which to attempt a match
-     * @param tokens Previously scanned Tokens
+     * @param tokens Previously scanned tokens
      * @param groups Token Groups
      */
     protected indentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
-        const { indentTokenName } = this.options;
-
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
@@ -274,7 +275,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         const indentToken = this.createIndentationTokenInstance(
             this.indentTokenType,
             text,
-            match?.[0] ?? indentTokenName,
+            match?.[0] ?? '',
             offset,
         );
         tokens.push(indentToken);
@@ -288,12 +289,10 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      *
      * @param text The full input string.
      * @param offset The offset at which to attempt a match
-     * @param tokens Previously scanned Tokens
+     * @param tokens Previously scanned tokens
      * @param groups Token Groups
      */
     protected dedentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
-        const { dedentTokenName } = this.options;
-
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
@@ -316,7 +315,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
             offset,
             length: match?.[0]?.length ?? 0,
             line: this.getLineNumber(text, offset),
-            column: 0
+            column: 1
         });
         return null;
     }
@@ -327,7 +326,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         const token = this.createIndentationTokenInstance(
             this.dedentTokenType,
             text,
-            match?.[0] ?? dedentTokenName,
+            match?.[0] ?? '',
             offset,
         );
         tokens.push(token);
@@ -362,7 +361,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
     * @param text Full text that was tokenized
     * @returns Remaining dedent tokens to match all previous indents at the end of the file
     */
-    popRemainingDedents(text: string): IToken[] {
+    flushRemainingDedents(text: string): IToken[] {
        const remainingDedents: IToken[] = [];
        while (this.indentationStack.length > 1) {
            remainingDedents.push(
@@ -402,13 +401,15 @@ export class IndentationAwareLexer extends DefaultLexer {
        }
    }
 
-    override tokenize(text: string): LexerResult {
+    override tokenize(text: string, options: TokenizeOptions = DEFAULT_TOKENIZE_OPTIONS): LexerResult {
        const result = super.tokenize(text);
 
        // consuming all remaining dedents and remove them as they might not be serializable
        const report = result.report as IndentationLexingReport;
-        const remainingDedents = report.remainingDedents;
-        result.tokens.push(...remainingDedents);
+        if (options?.mode === 'full') {
+            // auto-complete document with remaining dedents
+            result.tokens.push(...report.remainingDedents);
+        }
        report.remainingDedents = [];
 
        // remove any "indent-dedent" pair with an empty body as these are typically
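
For context, this is how IndentationAwareTokenBuilder and IndentationAwareLexer are typically wired into a language's services; a sketch following the Langium indentation recipe, where the grammar string is an assumed input:

import { IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import { createServicesForGrammar } from 'langium/grammar';

const services = await createServicesForGrammar({
    grammar: sampleGrammar, // assumption: an indentation-sensitive grammar string
    module: {
        parser: {
            TokenBuilder: () => new IndentationAwareTokenBuilder(),
            Lexer: (services) => new IndentationAwareLexer(services),
        },
    },
});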

packages/langium/src/parser/langium-parser.ts

Lines changed: 1 addition & 1 deletion

@@ -527,7 +527,7 @@ export class LangiumCompletionParser extends AbstractLangiumParser {
 
     parse(input: string): CompletionParserResult {
         this.resetState();
-        const tokens = this.lexer.tokenize(input);
+        const tokens = this.lexer.tokenize(input, { mode: 'partial' });
         this.tokens = tokens.tokens;
         this.wrapper.input = [...this.tokens];
         this.mainRule.call(this.wrapper, {});
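
Partial mode matters here because the completion parser only ever sees the document up to the completion offset: in full mode, an indentation-aware lexer would append synthetic dedents at that cut-off, closing blocks the user is still typing in.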

packages/langium/src/parser/lexer.ts

Lines changed: 12 additions & 4 deletions

@@ -25,9 +25,17 @@ export interface LexerResult {
     report?: LexingReport;
 }
 
+export type TokenizeMode = 'full' | 'partial';
+
+export interface TokenizeOptions {
+    mode?: TokenizeMode;
+}
+
+export const DEFAULT_TOKENIZE_OPTIONS: TokenizeOptions = { mode: 'full' };
+
 export interface Lexer {
     readonly definition: TokenTypeDictionary;
-    tokenize(text: string): LexerResult;
+    tokenize(text: string, options?: TokenizeOptions): LexerResult;
 }
 
 export class DefaultLexer implements Lexer {
@@ -36,7 +44,7 @@ export class DefaultLexer implements Lexer {
     protected tokenBuilder: TokenBuilder;
     protected tokenTypes: TokenTypeDictionary;
 
-    constructor( services: LangiumCoreServices) {
+    constructor(services: LangiumCoreServices) {
         this.tokenBuilder = services.parser.TokenBuilder;
         const tokens = this.tokenBuilder.buildTokens(services.Grammar, {
             caseInsensitive: services.LanguageMetaData.caseInsensitive
@@ -52,13 +60,13 @@ export class DefaultLexer implements Lexer {
         return this.tokenTypes;
     }
 
-    tokenize(text: string): LexerResult {
+    tokenize(text: string, _options: TokenizeOptions = DEFAULT_TOKENIZE_OPTIONS): LexerResult {
         const chevrotainResult = this.chevrotainLexer.tokenize(text);
         return {
             tokens: chevrotainResult.tokens,
             errors: chevrotainResult.errors,
             hidden: chevrotainResult.groups.hidden ?? [],
-            report: this.tokenBuilder.popLexingReport?.(text)
+            report: this.tokenBuilder.flushLexingReport?.(text)
         };
     }
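
Custom lexers can adopt the new parameter the same way. A sketch, assuming the new symbols are re-exported from the langium entry point (the subclass and its logging are illustrative, not part of this commit):

import { DefaultLexer, DEFAULT_TOKENIZE_OPTIONS } from 'langium';
import type { LexerResult, TokenizeOptions } from 'langium';

class TracingLexer extends DefaultLexer {
    override tokenize(text: string, options: TokenizeOptions = DEFAULT_TOKENIZE_OPTIONS): LexerResult {
        // Defaulting to DEFAULT_TOKENIZE_OPTIONS keeps existing call sites,
        // which pass no options, behaving as before ('full' mode).
        const result = super.tokenize(text, options);
        console.debug(`${options.mode ?? 'full'} tokenization produced ${result.tokens.length} tokens`);
        return result;
    }
}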

packages/langium/src/parser/token-builder.ts

Lines changed: 6 additions & 3 deletions

@@ -25,7 +25,7 @@ export interface TokenBuilder {
     *
     * @param text The text that was tokenized.
     */
-    popLexingReport?(text: string): LexingReport;
+    flushLexingReport?(text: string): LexingReport;
 }
 
 /**
@@ -36,8 +36,10 @@ export interface LexingReport {
     diagnostics: LexingDiagnostic[];
 }
 
+export type LexingDiagnosticSeverity = 'error' | 'warning' | 'info' | 'hint';
+
 export interface LexingDiagnostic extends ILexingError {
-    severity?: 'error' | 'warning' | 'info' | 'hint';
+    severity?: LexingDiagnosticSeverity;
 }
 
 export class DefaultTokenBuilder implements TokenBuilder {
@@ -64,7 +66,8 @@ export class DefaultTokenBuilder implements TokenBuilder {
         return tokens;
     }
 
-    popLexingReport(_text: string): LexingReport {
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    flushLexingReport(text: string): LexingReport {
         return { diagnostics: this.popDiagnostics() };
     }
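
A sketch of a custom token builder hooking into the renamed flushLexingReport and the new LexingDiagnosticSeverity values; the tab check is purely illustrative:

import { DefaultTokenBuilder } from 'langium';
import type { LexingReport } from 'langium';

class TabWarningTokenBuilder extends DefaultTokenBuilder {
    override flushLexingReport(text: string): LexingReport {
        const report = super.flushLexingReport(text);
        const offset = text.indexOf('\t');
        if (offset >= 0) {
            report.diagnostics.push({
                severity: 'warning', // a LexingDiagnosticSeverity value
                message: 'Tab character found; consider using spaces.',
                offset,
                length: 1,
                line: 1,             // simplified; real code would derive line/column from the offset
                column: offset + 1,
            });
        }
        return report;
    }
}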

packages/langium/src/validation/document-validator.ts

Lines changed: 7 additions & 7 deletions

@@ -11,14 +11,14 @@ import type { ParseResult } from '../parser/langium-parser.js';
 import type { LangiumCoreServices } from '../services.js';
 import type { AstNode, CstNode } from '../syntax-tree.js';
 import type { LangiumDocument } from '../workspace/documents.js';
-import type { DiagnosticData, DiagnosticInfo, ValidationAcceptor, ValidationCategory, ValidationRegistry } from './validation-registry.js';
+import type { DiagnosticData, DiagnosticInfo, ValidationAcceptor, ValidationCategory, ValidationRegistry, ValidationSeverity } from './validation-registry.js';
 import { CancellationToken } from '../utils/cancellation.js';
 import { findNodeForKeyword, findNodeForProperty } from '../utils/grammar-utils.js';
 import { streamAst } from '../utils/ast-utils.js';
 import { tokenToRange } from '../utils/cst-utils.js';
 import { interruptAndCheck, isOperationCancelled } from '../utils/promise-utils.js';
 import { diagnosticData } from './validation-registry.js';
-import type { LexingDiagnostic } from '../parser/token-builder.js';
+import type { LexingDiagnostic, LexingDiagnosticSeverity } from '../parser/token-builder.js';
 
 export interface ValidationOptions {
     /**
@@ -100,7 +100,7 @@ export class DefaultDocumentValidator implements DocumentValidator {
     protected processLexingErrors(parseResult: ParseResult, diagnostics: Diagnostic[], _options: ValidationOptions): void {
         const lexerDiagnostics = [...parseResult.lexerErrors, ...parseResult.lexerReport?.diagnostics ?? []] as LexingDiagnostic[];
         for (const lexerDiagnostic of lexerDiagnostics) {
-            const severity = lexerDiagnostic?.severity ?? 'error';
+            const severity = lexerDiagnostic.severity ?? 'error';
             const diagnostic: Diagnostic = {
                 severity: toDiagnosticSeverity(severity),
                 range: {
@@ -180,7 +180,7 @@ export class DefaultDocumentValidator implements DocumentValidator {
 
     protected async validateAst(rootNode: AstNode, options: ValidationOptions, cancelToken = CancellationToken.None): Promise<Diagnostic[]> {
         const validationItems: Diagnostic[] = [];
-        const acceptor: ValidationAcceptor = <N extends AstNode>(severity: 'error' | 'warning' | 'info' | 'hint', message: string, info: DiagnosticInfo<N>) => {
+        const acceptor: ValidationAcceptor = <N extends AstNode>(severity: ValidationSeverity, message: string, info: DiagnosticInfo<N>) => {
             validationItems.push(this.toDiagnostic(severity, message, info));
         };
 
@@ -194,7 +194,7 @@ export class DefaultDocumentValidator implements DocumentValidator {
         return validationItems;
     }
 
-    protected toDiagnostic<N extends AstNode>(severity: 'error' | 'warning' | 'info' | 'hint', message: string, info: DiagnosticInfo<N, string>): Diagnostic {
+    protected toDiagnostic<N extends AstNode>(severity: ValidationSeverity, message: string, info: DiagnosticInfo<N, string>): Diagnostic {
         return {
             message,
             range: getDiagnosticRange(info),
@@ -233,7 +233,7 @@ export function getDiagnosticRange<N extends AstNode>(info: DiagnosticInfo<N, st
     return cstNode.range;
 }
 
-export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticSeverity {
+export function toDiagnosticSeverity(severity: LexingDiagnosticSeverity): DiagnosticSeverity {
     switch (severity) {
         case 'error':
             return 1; // according to vscode-languageserver-types/lib/esm/main.js#DiagnosticSeverity.Error
@@ -248,7 +248,7 @@ export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'h
     }
 }
 
-export function toDiagnosticData(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticData {
+export function toDiagnosticData(severity: LexingDiagnosticSeverity): DiagnosticData {
     switch (severity) {
         case 'error':
             return diagnosticData(DocumentValidator.LexingError);
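
For reference, toDiagnosticSeverity maps onto the LSP DiagnosticSeverity enum. A sketch; the 'error' case is visible in the diff above, and the remaining values follow the vscode-languageserver-types constants:

toDiagnosticSeverity('error');   // 1 (Error)
toDiagnosticSeverity('warning'); // 2 (Warning)
toDiagnosticSeverity('info');    // 3 (Information)
toDiagnosticSeverity('hint');    // 4 (Hint)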

packages/langium/src/validation/validation-registry.ts

Lines changed: 3 additions & 1 deletion

@@ -57,7 +57,9 @@ export function diagnosticData(code: string): DiagnosticData {
     return { code };
 }
 
-export type ValidationAcceptor = <N extends AstNode>(severity: 'error' | 'warning' | 'info' | 'hint', message: string, info: DiagnosticInfo<N>) => void
+export type ValidationSeverity = 'error' | 'warning' | 'info' | 'hint';
+
+export type ValidationAcceptor = <N extends AstNode>(severity: ValidationSeverity, message: string, info: DiagnosticInfo<N>) => void
 
 export type ValidationCheck<T extends AstNode = AstNode> = (node: T, accept: ValidationAcceptor, cancelToken: CancellationToken) => MaybePromise<void>;
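
Extracting ValidationSeverity into a named type changes no behavior; validation checks pass it to the acceptor as before. A sketch with a hypothetical Person AST node:

import type { ValidationAcceptor } from 'langium';

// Person (with a string property `name`) is a hypothetical generated AST type.
function checkPersonStartsWithCapital(person: Person, accept: ValidationAcceptor): void {
    const first = person.name.charAt(0);
    if (first !== first.toUpperCase()) {
        accept('warning', 'Person name should start with a capital.', { node: person, property: 'name' });
    }
}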

packages/langium/test/parser/indentation-aware.test.ts

Lines changed: 35 additions & 1 deletion

@@ -11,7 +11,7 @@ import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder }
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
 import type { LangiumServices, PartialLangiumServices } from 'langium/lsp';
 import { expandToString } from 'langium/generate';
-import { parseHelper } from 'langium/test';
+import { expectCompletion, parseHelper } from 'langium/test';
 import type { IMultiModeLexerDefinition } from 'chevrotain';
 
 const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar;
@@ -193,6 +193,18 @@ describe('IndentationAwareLexer', () => {
         expect(dedent.tokenType.name).toBe('DEDENT');
     });
 
+    test('should NOT add remaining dedents to the end if partial tokenizing', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens } = lexer.tokenize(expandToString`
+            // single-line comment
+            {
+                name`, { mode: 'partial' });
+        expect(tokens).toHaveLength(3);
+
+        const [/* L_BRAC */, indent, /* id */] = tokens;
+        expect(indent.tokenType.name).toBe('INDENT');
+    });
+
     test('should not return any tokens for empty input', async () => {
         const lexer = await getLexer(sampleGrammar);
         const { tokens } = lexer.tokenize('');
@@ -389,6 +401,28 @@ describe('IndentationAware parsing', () => {
         expect(return2.value).toBe(true);
     });
 
+    test.fails('should offer correct auto-completion parsing', async () => {
+        const text = expandToString`
+            <|>if true:
+                <|>return true
+            <|>else:
+                <|>if false:
+                    <|>return true
+                <|>return false
+            <|>return true
+        `;
+
+        const services = await createIndentationAwareServices(sampleGrammar);
+        const completion = expectCompletion(services);
+        await completion({ text, index: 0, expectedItems: ['if', 'return'] });
+        // PR 1669: the lines below currently fail as the completion provider may wrongly assume that all whitespace tokens are hidden
+        await completion({ text, index: 1, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 2, expectedItems: ['else'] });
+        await completion({ text, index: 3, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 4, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 5, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 6, expectedItems: ['if', 'return'] });
+    });
 });
 
 type Statement = If | Return;
