hans00 · yocontra · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026
diff --git a/__tests__/index.test.ts b/__tests__/index.test.ts
@@ -6,7 +6,7 @@ describe('Index', function() {
     expect(phonemize('this is an apple.')).toEqual('ˈðɪs ˈɪz ˈæn ˈæpəɫ.')
     expect(phonemize('John\'s package', true)).toEqual([
       {
-        "phoneme": "ˈdʒɑnz",
+        "phoneme": "ˈdʒɑːnz",
         "position": 0,
         "word": "John's"
       },
@@ -33,12 +33,12 @@ describe('Index', function() {
 
   it('rule based or compound word', function() {
     expect(phonemize('buggie')).toEqual('ˈbʌɡi')
-    expect(phonemize('supercar')).toEqual('ˈsupɝˈkɑɹ')
-    expect(phonemize('pneumonoultramicroscopicsilicovolcanoconiosis')).toEqual('ˈnumoʊˈnoʊˈəɫtɹəˈˌmaɪkɹəskɑpɪkˈsiˈɫikoʊˈvɑɫkeɪnoʊˈkɑnˈaɪoʊˈsɪs')
+    expect(phonemize('supercar')).toEqual('ˈsuːpɝˈkɑːɹ')
+    expect(phonemize('pneumonoultramicroscopicsilicovolcanoconiosis')).toEqual('ˈnuːmoʊˈnoʊˈəɫtɹəˈˌmaɪkɹəskɑːpɪkˈsiːˈɫiːkoʊˈvɑːɫkeɪnoʊˈkɑːnˈaɪoʊˈsɪs')
   })
 
   it('chinese', function() {
-    expect(phonemize('中文 TTS')).toEqual('ʈʂʊŋ˥˥ wən˧˥ ˈtiˈtiˈɛs')
+    expect(phonemize('中文 TTS')).toEqual('ʈʂʊŋ˥˥ wən˧˥ ˈtiːˈtiːˈɛs')
     expect(phonemize('中文的抑揚頓挫')).toEqual('ʈʂʊŋ˥˥ wən˧˥ tə˧ i˥˩ jɑŋ˧˥ tuən˥˩ tsʰuɔ˥˩')
     expect(phonemize('還原 還你 還是 還不是')).toEqual('xuan˧˥ juan˧˥ xuan˧˥ ni˧˩˧ xaɪ˧˥ ʂɨ˥˩ xaɪ˧˥ pu˥˩ ʂɨ˥˩')
   })
@@ -102,13 +102,13 @@ describe('Index', function() {
   it('Number processing', function() {
     // Basic number expansion tests
     expect(phonemize('5')).toEqual('ˈfaɪv')
-    expect(phonemize('123')).toEqual('ˈwən ˈhəndɝd ˈtwɛni ˈθɹi')
+    expect(phonemize('123')).toEqual('ˈwən ˈhəndɝd ˈtwɛni ˈθɹiː')
   })
 
   it('Abbreviation expansion', function() {
     // Basic abbreviation tests
     expect(phonemize('Mr. Smith')).toContain('ˈmɪstɝ')
-    expect(phonemize('Dr. Johnson')).toContain('ˈdɑktɝ')
+    expect(phonemize('Dr. Johnson')).toContain('ˈdɑːktɝ')
   })
 
   it('Custom tokenizer creation', function() {
@@ -125,7 +125,7 @@ describe('Index', function() {
   })
 
   it('Uppercase acronym processing', function() {
-    expect(phonemize('TTS')).toEqual('ˈtiˈtiˈɛs')
+    expect(phonemize('TTS')).toEqual('ˈtiːˈtiːˈɛs')
     expect(phonemize('AI')).toEqual('ˈeɪaɪ')
 
     expect(phonemize('Xyz')).not.toContain('ˌɛks')

diff --git a/scripts/build-dict.ts b/scripts/build-dict.ts
@@ -21,8 +21,15 @@ function parseDict(content: string): DictEntry {
 
     let [, word, phonesStr] = match;
 
-    const ipa = phonesStr.match(/^\/([^\/]+)\//)?.[1];
-    if (!ipa) continue;
+    // Parse all pronunciation variants (format: /variant1/, /variant2/, ...)
+    const variants = [...phonesStr.matchAll(/\/([^\/]+)\//g)].map(m => m[1]);
+    if (variants.length === 0) continue;
+
+    // When multiple variants exist, prefer the one containing ɔ (THOUGHT vowel)
+    // over ɑ (LOT vowel). The ipa-dict source lists ɑ variants first for words
+    // like "caught", "bought", "law", "fall", "walk", etc., but ɔ better
+    // represents standard American English pronunciation for these words.
+    const ipa = variants.find(v => v.includes("ɔ")) ?? variants[0];
     dict[word.toLowerCase()] = ipa;
   }
 

diff --git a/src/tokenizer.ts b/src/tokenizer.ts
@@ -15,6 +15,35 @@ import type ChineseG2P from "./zh-g2p";
 // Tokenization regex patterns
 const TOKEN_REGEX = /([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+|\w+['']?\w*|[^\w\s\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])/g;
 
+// IPA vowel symbols and the rhotic-vowel patterns built from them (compiled once).
+const IPA_VOWELS = 'ɑɒæəɔɛɜɪʊʌaeioɚuɝ';
+const RE_STRESSED_RHOTIC = new RegExp(`([ˈˌ][^${IPA_VOWELS}\\sˈˌ]*)ɝ`, 'g');
+const RE_LINKING_R = new RegExp(`(ɚ|ɜː)(?=[${IPA_VOWELS}])`, 'g');
+
+/**
+ * Fix rhotacized vowel distinctions for English IPA output.
+ *
+ * The G2P dictionary uses ɝ for all rhotacized vowels, but standard IPA
+ * distinguishes:
+ *   - Stressed NURSE vowel (ɜː): "bird" = bɜːd, "word" = wɜːd
+ *   - Unstressed rhotacized schwa (ɚ): "doctor" = dɑːktɚ, "letter" = lɛtɚ
+ *
+ * When either vowel is directly followed by another vowel, a linking ɹ is
+ * inserted so the r is not lost (ˈhɝi → ˈhɜːɹi, ˈbætɝiz → ˈbætɚɹiz).
+ *
+ * @param ipa - IPA string that may contain ɝ.
+ * @returns The string with every ɝ rewritten and linking ɹ inserted.
+ */
+function fixRhotacizedVowels(ipa: string): string {
+  return ipa
+    // 1. ɝ as the first vowel after a stress mark → ɜː
+    .replace(RE_STRESSED_RHOTIC, '$1ɜː')
+    // 2. Every remaining (unstressed) ɝ → ɚ
+    .replace(/ɝ/g, 'ɚ')
+    // 3. Linking ɹ after ɚ or ɜː when a vowel follows immediately
+    .replace(RE_LINKING_R, '$1ɹ');
+}
+
 /**
  * Configuration options for tokenizer behavior
  */
@@ -266,6 +295,9 @@ export class Tokenizer {
     } else {
       // IPA format processing
 
+      // Fix rhotacized vowel distinctions (ɝ → ɜː stressed, ɚ unstressed)
+      phonemes = fixRhotacizedVowels(phonemes);
+
       // Convert Chinese tone format if requested
       if (this.options.toneFormat === "arrow") {
         phonemes = convertChineseTonesToArrows(phonemes);