Skip to content

Commit d9ef940

Browse files
authored
Port all Speech-to-Text code to TypeScript (#225)
* Clean up * Port all STT to TypeScript * Add entry * Fix ESLint * Fix ESLint
1 parent ccf35da commit d9ef940

File tree

9 files changed

+184
-110
lines changed

9 files changed

+184
-110
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1515

1616
### Changed
1717

18-
- 💥 Modernized some code with TypeScript, more type-aligned to W3C Speech API, and moved to official Event Target API, in PR [#220](https://github.com/compulim/web-speech-cognitive-services/pull/220) and [#224](https://github.com/compulim/web-speech-cognitive-services/pull/224)
18+
- 💥 Modernized some code with TypeScript, more type-aligned to W3C Speech API, and moved to official Event Target API, in PR [#220](https://github.com/compulim/web-speech-cognitive-services/pull/220), [#224](https://github.com/compulim/web-speech-cognitive-services/pull/224) and [#225](https://github.com/compulim/web-speech-cognitive-services/pull/225)
1919
- `SpeechRecognitionResult` and `SpeechRecognitionResultList` are now array-like objects; use `Array.from()` to convert them into an array
2020
- Updated build tools and added named exports via CJS/ESM
2121
- Bumped dependencies, in PR [#216](https://github.com/compulim/web-speech-cognitive-services/pull/216) and [#218](https://github.com/compulim/web-speech-cognitive-services/issues/218)
Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
/* eslint no-empty-function: "off" */
55
/* eslint no-magic-numbers: ["error", { "ignore": [0, 100, 150] }] */
66

7-
import patchOptions from '../patchOptions';
7+
import patchOptions, { type PatchOptionsInit } from '../patchOptions';
88
import SpeechSDK from '../SpeechSDK';
99
import createSpeechRecognitionPonyfillFromRecognizer from './createSpeechRecognitionPonyfillFromRecognizer';
1010

1111
const { AudioConfig, OutputFormat, SpeechConfig, SpeechRecognizer } = SpeechSDK;
1212

13-
export default function createSpeechRecognitionPonyfill(options) {
13+
export default function createSpeechRecognitionPonyfill(options: PatchOptionsInit) {
1414
const {
1515
audioConfig = AudioConfig.fromDefaultMicrophoneInput(),
1616

@@ -33,23 +33,26 @@ export default function createSpeechRecognitionPonyfill(options) {
3333
return {};
3434
}
3535

36-
const createRecognizer = async lang => {
37-
const { authorizationToken, region, speechRecognitionHostname, subscriptionKey } = await fetchCredentials();
36+
const createRecognizer = async (lang: string) => {
37+
const credentials = await fetchCredentials();
3838
let speechConfig;
3939

40-
if (speechRecognitionHostname) {
41-
const host = { hostname: speechRecognitionHostname, port: 443, protocol: 'wss:' };
40+
if (typeof credentials.speechRecognitionHostname !== 'undefined') {
41+
const host = new URL('wss://hostname:443');
4242

43-
if (authorizationToken) {
43+
host.hostname = credentials.speechRecognitionHostname;
44+
45+
if (credentials.authorizationToken) {
4446
speechConfig = SpeechConfig.fromHost(host);
45-
speechConfig.authorizationToken = authorizationToken;
47+
speechConfig.authorizationToken = credentials.authorizationToken;
4648
} else {
47-
speechConfig = SpeechConfig.fromHost(host, subscriptionKey);
49+
speechConfig = SpeechConfig.fromHost(host, credentials.subscriptionKey);
4850
}
4951
} else {
50-
speechConfig = authorizationToken
51-
? SpeechConfig.fromAuthorizationToken(authorizationToken, region)
52-
: SpeechConfig.fromSubscription(subscriptionKey, region);
52+
speechConfig =
53+
typeof credentials.authorizationToken !== 'undefined'
54+
? SpeechConfig.fromAuthorizationToken(credentials.authorizationToken, credentials.region)
55+
: SpeechConfig.fromSubscription(credentials.subscriptionKey, credentials.region);
5356
}
5457

5558
if (speechRecognitionEndpointId) {
@@ -66,7 +69,6 @@ export default function createSpeechRecognitionPonyfill(options) {
6669
};
6770

6871
return createSpeechRecognitionPonyfillFromRecognizer({
69-
audioConfig,
7072
createRecognizer,
7173
enableTelemetry,
7274
looseEvents,

packages/web-speech-cognitive-services/src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfillFromRecognizer.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ import createPromiseQueue from '../../Util/createPromiseQueue';
1616
import SpeechSDK from '../SpeechSDK';
1717
import cognitiveServiceEventResultToWebSpeechRecognitionResult from './cognitiveServiceEventResultToWebSpeechRecognitionResult';
1818
import cognitiveServicesAsyncToPromise from './cognitiveServicesAsyncToPromise';
19+
import EventListenerMap from './private/EventListenerMap';
20+
import prepareAudioConfig from './private/prepareAudioConfig';
21+
import serializeRecognitionResult from './private/serializeRecognitionResult';
1922
import SpeechGrammarList from './SpeechGrammarList';
2023
import SpeechRecognitionErrorEvent from './SpeechRecognitionErrorEvent';
2124
import SpeechRecognitionEvent from './SpeechRecognitionEvent';
2225
import { type SpeechRecognitionEventListenerMap } from './SpeechRecognitionEventListenerMap';
2326
import type SpeechRecognitionResult from './SpeechRecognitionResult';
2427
import SpeechRecognitionResultList from './SpeechRecognitionResultList';
25-
import EventListenerMap from './private/EventListenerMap';
26-
import prepareAudioConfig from './private/prepareAudioConfig';
27-
import serializeRecognitionResult from './private/serializeRecognitionResult';
2828

2929
// https://docs.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/speechconfig?view=azure-node-latest#outputformat
3030
// {
@@ -54,7 +54,7 @@ type CreateSpeechRecognitionPonyfillFromRecognizerInit = {
5454
createRecognizer: (lang: string) => Promise<SpeechRecognizerType>;
5555
enableTelemetry: boolean;
5656
looseEvents: boolean;
57-
referenceGrammars: [];
57+
referenceGrammars?: readonly string[] | undefined;
5858
textNormalization: 'display' | 'itn' | 'lexical' | 'maskeditn';
5959
};
6060

@@ -355,7 +355,7 @@ export default function createSpeechRecognitionPonyfillFromRecognizer({
355355
// HACK: We are using the internal of SpeechRecognizer because they did not expose it
356356
const { dynamicGrammar } = recognizer['privReco'];
357357

358-
referenceGrammars && referenceGrammars.length && dynamicGrammar.addReferenceGrammar(referenceGrammars);
358+
referenceGrammars && referenceGrammars.length && dynamicGrammar.addReferenceGrammar([...referenceGrammars]);
359359
phrases && phrases.length && dynamicGrammar.addPhrase([...phrases]);
360360

361361
await cognitiveServicesAsyncToPromise<void>(recognizer.startContinuousRecognitionAsync, recognizer)();

packages/web-speech-cognitive-services/src/SpeechServices/SpeechToText/private/prepareAudioConfig.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ import {
66
import averageAmplitude from './averageAmplitude';
77

88
export default function prepareAudioConfig(audioConfig: AudioConfig) {
9-
const audioConfigImpl = audioConfig as AudioConfigImpl; // HACK: Need internals of AudioConfig.
9+
// Speech SDK also force cast AudioConfig to AudioConfigImpl and pass it to ServiceRecognizerBase to use attach() and other methods.
10+
// https://github.com/microsoft/cognitive-services-speech-sdk-js/blob/a6e9d2a202534565ccc97650861a6b296de48ecf/src/sdk/SpeechRecognizer.ts#L291C27-L291C43
11+
const audioConfigImpl = audioConfig as AudioConfigImpl;
1012
const originalAttach = audioConfigImpl.attach;
1113
const boundOriginalAttach = audioConfigImpl.attach.bind(audioConfigImpl);
1214
let firstChunk = false;

packages/web-speech-cognitive-services/src/SpeechServices/fetchAuthorizationToken.js renamed to packages/web-speech-cognitive-services/src/SpeechServices/fetchAuthorizationToken.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
const TOKEN_URL_TEMPLATE = 'https://{region}.api.cognitive.microsoft.com/sts/v1.0/issueToken';
22

3-
export default async function ({ region, subscriptionKey }) {
3+
type FetchAuthorizationTokenInit = {
4+
region: string;
5+
subscriptionKey: string;
6+
};
7+
8+
export default async function fetchAuthorizationToken({
9+
region,
10+
subscriptionKey
11+
}: FetchAuthorizationTokenInit): Promise<string> {
412
const res = await fetch(TOKEN_URL_TEMPLATE.replace(/\{region\}/u, region), {
513
headers: {
614
'Ocp-Apim-Subscription-Key': subscriptionKey

packages/web-speech-cognitive-services/src/SpeechServices/patchOptions.js

Lines changed: 0 additions & 89 deletions
This file was deleted.
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
import { type AudioConfig } from 'microsoft-cognitiveservices-speech-sdk';
2+
import resolveFunctionOrReturnValue from './resolveFunctionOrReturnValue';
3+
4+
let shouldWarnOnSubscriptionKey = true;
5+
6+
export type Credentials = Readonly<
7+
(
8+
| { authorizationToken: string; subscriptionKey?: undefined }
9+
| { authorizationToken?: undefined; subscriptionKey: string }
10+
) &
11+
(
12+
| {
13+
customVoiceHostname?: undefined;
14+
region: string;
15+
speechRecognitionHostname?: undefined;
16+
speechSynthesisHostname?: undefined;
17+
}
18+
| {
19+
customVoiceHostname: string;
20+
region?: undefined;
21+
speechRecognitionHostname: string;
22+
speechSynthesisHostname: string;
23+
}
24+
)
25+
>;
26+
27+
export type PatchOptionsInit = {
28+
audioConfig: AudioConfig;
29+
credentials?: (() => Credentials | Promise<Credentials>) | Credentials | Promise<Credentials>;
30+
enableTelemetry: boolean;
31+
looseEvent?: boolean | undefined;
32+
looseEvents?: boolean | undefined;
33+
referenceGrammars?: readonly string[] | undefined;
34+
region?: string | undefined;
35+
speechRecognitionEndpointId: string;
36+
textNormalization: 'display' | 'itn' | 'lexical' | 'maskeditn';
37+
} & (
38+
| {
39+
authorizationToken: string;
40+
subscriptionKey?: undefined;
41+
}
42+
| {
43+
authorizationToken?: undefined;
44+
subscriptionKey: string;
45+
}
46+
);
47+
48+
type PatchedOptions = Readonly<{
49+
audioConfig: AudioConfig;
50+
enableTelemetry: boolean;
51+
fetchCredentials: () => Promise<Credentials>;
52+
looseEvents: boolean;
53+
referenceGrammars: readonly string[] | undefined;
54+
speechRecognitionEndpointId: string | undefined;
55+
textNormalization: 'display' | 'itn' | 'lexical' | 'maskeditn';
56+
}>;
57+
58+
export default function patchOptions(init: PatchOptionsInit): PatchedOptions {
59+
const {
60+
audioConfig,
61+
authorizationToken,
62+
enableTelemetry,
63+
looseEvent,
64+
referenceGrammars,
65+
region = 'westus',
66+
speechRecognitionEndpointId,
67+
subscriptionKey,
68+
textNormalization
69+
} = init;
70+
71+
let { credentials, looseEvents } = init;
72+
73+
if (typeof looseEvent !== 'undefined') {
74+
console.warn('web-speech-cognitive-services: The option "looseEvent" should be named as "looseEvents".');
75+
76+
looseEvents = looseEvent;
77+
}
78+
79+
if (!credentials) {
80+
if (!authorizationToken && !subscriptionKey) {
81+
throw new Error('web-speech-cognitive-services: Credentials must be specified.');
82+
} else {
83+
console.warn(
84+
'web-speech-cognitive-services: We are deprecating authorizationToken, region, and subscriptionKey. Please use credentials instead. The deprecated option will be removed on or after 2020-11-14.'
85+
);
86+
87+
credentials = async () =>
88+
typeof init.authorizationToken !== 'undefined'
89+
? { authorizationToken: await resolveFunctionOrReturnValue<string>(init.authorizationToken), region }
90+
: { region, subscriptionKey: await resolveFunctionOrReturnValue<string>(init.subscriptionKey) };
91+
}
92+
}
93+
94+
return Object.freeze({
95+
audioConfig,
96+
enableTelemetry,
97+
fetchCredentials: async () => {
98+
const {
99+
authorizationToken,
100+
customVoiceHostname,
101+
region,
102+
speechRecognitionHostname,
103+
speechSynthesisHostname,
104+
subscriptionKey
105+
} = await resolveFunctionOrReturnValue<Credentials>(credentials);
106+
107+
if ((!authorizationToken && !subscriptionKey) || (authorizationToken && subscriptionKey)) {
108+
throw new Error(
109+
'web-speech-cognitive-services: Either "authorizationToken" or "subscriptionKey" must be provided.'
110+
);
111+
} else if (!region && !(speechRecognitionHostname && speechSynthesisHostname)) {
112+
throw new Error(
113+
'web-speech-cognitive-services: Either "region" or "speechRecognitionHostname" and "speechSynthesisHostname" must be set.'
114+
);
115+
} else if (region && (customVoiceHostname || speechRecognitionHostname || speechSynthesisHostname)) {
116+
throw new Error(
117+
'web-speech-cognitive-services: Only either "region" or "customVoiceHostname", "speechRecognitionHostname" and "speechSynthesisHostname" can be set.'
118+
);
119+
} else if (authorizationToken) {
120+
if (typeof authorizationToken !== 'string') {
121+
throw new Error('web-speech-cognitive-services: "authorizationToken" must be a string.');
122+
}
123+
} else if (typeof subscriptionKey !== 'string') {
124+
throw new Error('web-speech-cognitive-services: "subscriptionKey" must be a string.');
125+
}
126+
127+
if (shouldWarnOnSubscriptionKey && subscriptionKey) {
128+
console.warn(
129+
'web-speech-cognitive-services: In production environment, subscription key should not be used, authorization token should be used instead.'
130+
);
131+
132+
shouldWarnOnSubscriptionKey = false;
133+
}
134+
135+
return {
136+
...(typeof authorizationToken !== 'undefined' ? { authorizationToken } : { subscriptionKey }),
137+
...(typeof region !== 'undefined'
138+
? { region }
139+
: {
140+
customVoiceHostname,
141+
speechRecognitionHostname,
142+
speechSynthesisHostname
143+
})
144+
} satisfies Credentials;
145+
},
146+
looseEvents: !!looseEvents,
147+
referenceGrammars: referenceGrammars && Object.freeze([...referenceGrammars]),
148+
speechRecognitionEndpointId,
149+
textNormalization
150+
});
151+
}

0 commit comments

Comments
 (0)