Skip to content

Commit 1d28e8e

Browse files
authored
Merge pull request #1208 from ywang1110/fuzzySharp
Add FuzzySharp-based text analysis plugin for synonym detection, typo correction and entity extraction
2 parents ff08a39 + 3162be4 commit 1d28e8e

27 files changed

+1131
-1
lines changed

BotSharp.sln

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ExcelHandle
149149
EndProject
150150
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ImageHandler", "src\Plugins\BotSharp.Plugin.ImageHandler\BotSharp.Plugin.ImageHandler.csproj", "{242F2D93-FCCE-4982-8075-F3052ECCA92C}"
151151
EndProject
152+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.FuzzySharp", "src\Plugins\BotSharp.Plugin.FuzzySharp\BotSharp.Plugin.FuzzySharp.csproj", "{E7C243B9-E751-B3B4-8F16-95C76CA90D31}"
153+
EndProject
152154
Global
153155
GlobalSection(SolutionConfigurationPlatforms) = preSolution
154156
Debug|Any CPU = Debug|Any CPU
@@ -629,6 +631,14 @@ Global
629631
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|Any CPU.Build.0 = Release|Any CPU
630632
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.ActiveCfg = Release|Any CPU
631633
{242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.Build.0 = Release|Any CPU
634+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
635+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.Build.0 = Debug|Any CPU
636+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.ActiveCfg = Debug|Any CPU
637+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.Build.0 = Debug|Any CPU
638+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.ActiveCfg = Release|Any CPU
639+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.Build.0 = Release|Any CPU
640+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.ActiveCfg = Release|Any CPU
641+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.Build.0 = Release|Any CPU
632642
EndGlobalSection
633643
GlobalSection(SolutionProperties) = preSolution
634644
HideSolutionNode = FALSE
@@ -701,6 +711,7 @@ Global
701711
{0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
702712
{FC63C875-E880-D8BB-B8B5-978AB7B62983} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
703713
{242F2D93-FCCE-4982-8075-F3052ECCA92C} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
714+
{E7C243B9-E751-B3B4-8F16-95C76CA90D31} = {51AFE054-AE99-497D-A593-69BAEFB5106F}
704715
EndGlobalSection
705716
GlobalSection(ExtensibilityGlobals) = postSolution
706717
SolutionGuid = {A9969D89-C98B-40A5-A12B-FC87E55B3A19}

Directory.Packages.props

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
66
</PropertyGroup>
77
<ItemGroup>
8+
<PackageVersion Include="CsvHelper" Version="33.1.0" />
9+
<PackageVersion Include="FuzzySharp" Version="2.0.2" />
810
<PackageVersion Include="EntityFramework" Version="6.4.4" />
911
<PackageVersion Include="Google_GenerativeAI" Version="3.4.1" />
1012
<PackageVersion Include="Google_GenerativeAI.Live" Version="3.4.1" />
@@ -18,6 +20,7 @@
1820
<PackageVersion Include="Microsoft.Extensions.Logging" Version="10.0.0" />
1921
<PackageVersion Include="Microsoft.Extensions.Caching.Memory" Version="8.0.1" />
2022
<PackageVersion Include="Newtonsoft.Json" Version="13.0.3" />
23+
<PackageVersion Include="SharpFuzz" Version="2.2.0" />
2124
<PackageVersion Include="SharpHook" Version="5.3.9" />
2225
<PackageVersion Include="SixLabors.ImageSharp" Version="3.1.12" />
2326
<PackageVersion Include="System.ClientModel" Version="1.3.0" />
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
namespace BotSharp.Abstraction.Knowledges;
2+
3+
/// <summary>
/// Contract for a provider of phrase data: a vocabulary and a synonym mapping,
/// each loaded asynchronously.
/// </summary>
public interface IPhraseCollection
4+
{
5+
/// <summary>
/// Loads the vocabulary as a dictionary that maps a string key to a set of strings.
/// NOTE(review): the key presumably names a vocabulary source/category and the set
/// holds its terms; confirm against an implementation.
/// </summary>
/// <returns>A task producing the vocabulary dictionary.</returns>
Task<Dictionary<string, HashSet<string>>> LoadVocabularyAsync();
6+
/// <summary>
/// Loads the synonym mapping as a dictionary that maps a string key to a
/// (DbPath, CanonicalForm) pair.
/// NOTE(review): the key is presumably the synonym term itself; confirm against an implementation.
/// </summary>
/// <returns>A task producing the synonym mapping dictionary.</returns>
Task<Dictionary<string, (string DbPath, string CanonicalForm)>> LoadSynonymMappingAsync();
7+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace BotSharp.Abstraction.Knowledges;
2+
3+
/// <summary>
/// Contract for a service that searches phrases for a given term.
/// </summary>
public interface IPhraseService
4+
{
5+
/// <summary>
/// Searches phrases for the supplied term.
/// </summary>
/// <param name="term">Text to search with.</param>
/// <returns>A task producing the list of <see cref="SearchPhrasesResult"/> items found.</returns>
Task<List<SearchPhrasesResult>> SearchPhrasesAsync(string term);
6+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
namespace BotSharp.Abstraction.Knowledges.Models;
3+
4+
/// <summary>
/// One item in the list returned by a phrase search.
/// String properties default to the empty string and <see cref="Sources"/> to an empty list.
/// </summary>
public class SearchPhrasesResult
5+
{
6+
/// <summary>Token this result refers to. NOTE(review): presumably taken from the searched text; confirm.</summary>
public string Token { get; set; } = string.Empty;
7+
/// <summary>Sources associated with the token. NOTE(review): presumably the vocabulary sources it was found in; confirm.</summary>
public List<string> Sources { get; set; } = new();
8+
/// <summary>Canonical form associated with the token. NOTE(review): presumably the normalized or corrected term; confirm.</summary>
public string CanonicalForm { get; set; } = string.Empty;
9+
/// <summary>
/// Kind of match. NOTE(review): presumably one of "synonym_match", "exact_match"
/// or "typo_correction"; confirm against the producing service.
/// </summary>
public string MatchType { get; set; } = string.Empty;
10+
/// <summary>Confidence of the match. NOTE(review): scale is not visible here (presumably 0.0 to 1.0); confirm.</summary>
public double Confidence { get; set; }
11+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<!-- Most properties below are set from same-named or repository-level MSBuild properties,
     so their actual values must be supplied from outside this file.
     NOTE(review): presumably a shared Directory.Build.props; confirm. -->
<PropertyGroup>
4+
<TargetFramework>$(TargetFramework)</TargetFramework>
5+
<Nullable>enable</Nullable>
6+
<LangVersion>$(LangVersion)</LangVersion>
7+
<VersionPrefix>$(BotSharpVersion)</VersionPrefix>
8+
<GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild>
9+
<GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile>
10+
<!-- Build output goes to the "packages" folder under the solution directory. -->
<OutputPath>$(SolutionDir)packages</OutputPath>
11+
</PropertyGroup>
12+
13+
<!-- No Version attributes here: package versions are expected to come from
     central package management (Directory.Packages.props). -->
<ItemGroup>
14+
<PackageReference Include="CsvHelper" />
15+
<PackageReference Include="FuzzySharp" />
16+
</ItemGroup>
17+
18+
<ItemGroup>
19+
<ProjectReference Include="..\..\Infrastructure\BotSharp.Abstraction\BotSharp.Abstraction.csproj" />
20+
</ItemGroup>
21+
</Project>
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
2+
namespace BotSharp.Plugin.FuzzySharp.Constants;
3+
4+
/// <summary>
/// String constants naming the reasons a token can be matched:
/// synonym match, exact vocabulary match, or typo correction.
/// </summary>
public static class MatchReason
5+
{
6+
/// <summary>
7+
/// Token matched a synonym term (e.g., HVAC -> Air Conditioning/Heating)
8+
/// </summary>
9+
public const string SynonymMatch = "synonym_match";
10+
11+
/// <summary>
12+
/// Token exactly matched a vocabulary entry
13+
/// </summary>
14+
public const string ExactMatch = "exact_match";
15+
16+
/// <summary>
17+
/// Token was flagged as a potential typo and a correction was suggested
18+
/// </summary>
19+
public const string TypoCorrection = "typo_correction";
20+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
namespace BotSharp.Plugin.FuzzySharp.Constants;
3+
4+
/// <summary>
/// Character sets used during tokenization: symbols to be split off as their own tokens,
/// and whitespace characters that separate tokens.
/// Both arrays are exposed as mutable <c>char[]</c>; <c>readonly</c> protects only the
/// reference, so callers should treat the contents as read-only.
/// </summary>
public static class TextConstants
5+
{
6+
/// <summary>
7+
/// Characters that need to be separated during tokenization (by adding spaces before and after)
8+
/// Includes: parentheses, brackets, braces, punctuation marks, special symbols, etc.
9+
/// This ensures "(IH)" is split into "(", "IH", ")"
10+
/// </summary>
11+
public static readonly char[] SeparatorChars =
12+
{
13+
// Parentheses and brackets
14+
'(', ')', '[', ']', '{', '}',
15+
// Punctuation marks
16+
',', '.', ';', ':', '!', '?',
17+
// Special symbols (the '\\' literal is a single backslash; '/', quotes and '_' are not in this list)
18+
'=', '@', '#', '$', '%', '^', '&', '*', '+', '-', '\\', '|', '<', '>', '~', '`'
19+
};
20+
21+
/// <summary>
22+
/// Whitespace characters used as token separators during tokenization.
23+
/// Includes: space, tab, newline, and carriage return.
24+
/// </summary>
25+
public static readonly char[] TokenSeparators =
26+
{
27+
' ', '\t', '\n', '\r'
28+
};
29+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
using BotSharp.Abstraction.Knowledges;
2+
using BotSharp.Abstraction.Knowledges.Models;
3+
using Microsoft.AspNetCore.Http;
4+
using Microsoft.AspNetCore.Mvc;
5+
using Microsoft.Extensions.Logging;
6+
7+
namespace BotSharp.Plugin.FuzzySharp.Controllers;
8+
9+
/// <summary>
/// API controller exposing a single POST endpoint, <c>fuzzy-sharp/analyze-text</c>,
/// which forwards the request text to <see cref="IPhraseService"/>.
/// </summary>
[ApiController]
10+
public class FuzzySharpController : ControllerBase
11+
{
12+
private readonly IPhraseService _phraseService;
13+
private readonly ILogger<FuzzySharpController> _logger;
14+
15+
/// <summary>
/// Stores the supplied phrase service and logger for use by the action below.
/// </summary>
/// <param name="phraseService">Service used to search phrases for the submitted text.</param>
/// <param name="logger">Logger used to record failures.</param>
public FuzzySharpController(
16+
IPhraseService phraseService,
17+
ILogger<FuzzySharpController> logger)
18+
{
19+
_phraseService = phraseService;
20+
_logger = logger;
21+
}
22+
23+
/// <summary>
24+
/// Analyzes text for typos and entities by calling <see cref="IPhraseService.SearchPhrasesAsync"/>.
25+
///
26+
/// Responses:
27+
/// - 200: the list of <see cref="SearchPhrasesResult"/> items returned by the phrase service
28+
/// - 400: <paramref name="text"/> is null, empty, or whitespace
29+
/// - 500: an exception was thrown; it is logged and an error object is returned
30+
/// NOTE(review): each result's match type is presumably one of the following; confirm against the service:
31+
/// - `synonym_match` - Business abbreviations (confidence=1.0)
32+
/// - `exact_match` - Exact vocabulary matches (confidence=1.0)
33+
/// - `typo_correction` - Spelling corrections (confidence less than 1.0)
34+
/// </summary>
35+
/// <param name="text">Raw text to analyze, bound from the request body</param>
36+
/// <returns>An <see cref="IActionResult"/> carrying the search results or an error object</returns>
37+
[HttpPost("fuzzy-sharp/analyze-text")]
38+
[ProducesResponseType(typeof(List<SearchPhrasesResult>), StatusCodes.Status200OK)]
39+
[ProducesResponseType(StatusCodes.Status400BadRequest)]
40+
[ProducesResponseType(StatusCodes.Status500InternalServerError)]
41+
public async Task<IActionResult> AnalyzeText([FromBody] string text)
42+
{
43+
try
44+
{
45+
// Reject missing or blank input before calling the service.
if (string.IsNullOrWhiteSpace(text))
46+
{
47+
return BadRequest(new { error = "Text is required" });
48+
}
49+
50+
var result = await _phraseService.SearchPhrasesAsync(text);
51+
return Ok(result);
52+
}
53+
catch (Exception ex)
54+
{
55+
_logger.LogError(ex, "Error analyzing and searching entities");
56+
// NOTE(review): ex.Message is echoed to the client in the 500 body; confirm that
// exposing internal exception text is acceptable, since the full exception is already logged above.
return StatusCode(500, new { error = $"Error analyzing and searching entities: {ex.Message}" });
57+
}
58+
}
59+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
2+
namespace BotSharp.Plugin.FuzzySharp.FuzzSharp.Arguments;
3+
4+
/// <summary>
/// Request arguments for text analysis.
/// NOTE(review): nothing in the visible part of this commit consumes this type (the
/// controller action takes a raw string); confirm where it is used.
/// </summary>
public class TextAnalysisRequest
5+
{
6+
/// <summary>Text to analyze. Defaults to the empty string.</summary>
public string Text { get; set; } = string.Empty;
7+
/// <summary>Optional vocabulary folder name. NOTE(review): presumably selects which vocabulary to load; confirm.</summary>
public string? VocabularyFolderName { get; set; }
8+
/// <summary>Optional synonym mapping file. NOTE(review): presumably a file name or path for the synonym mapping; confirm.</summary>
public string? SynonymMappingFile { get; set; }
9+
/// <summary>Cutoff value, default 0.82. NOTE(review): presumably the minimum similarity for a fuzzy match; confirm scale.</summary>
public double Cutoff { get; set; } = 0.82;
10+
/// <summary>Top-K value, default 5. NOTE(review): presumably the maximum number of candidates kept; confirm.</summary>
public int TopK { get; set; } = 5;
11+
/// <summary>Maximum n-gram value, default 5. NOTE(review): presumably the longest token window considered; confirm.</summary>
public int MaxNgram { get; set; } = 5;
12+
/// <summary>Whether tokens should be included, default false. NOTE(review): presumably controls token output in the response; confirm.</summary>
public bool IncludeTokens { get; set; } = false;
13+
}

0 commit comments

Comments
 (0)