diff --git a/__tests__/index.test.ts b/__tests__/index.test.ts index 8cedb2d..13aff35 100644 --- a/__tests__/index.test.ts +++ b/__tests__/index.test.ts @@ -6,7 +6,7 @@ describe('Index', function() { expect(phonemize('this is an apple.')).toEqual('ˈðɪs ˈɪz ˈæn ˈæpəɫ.') expect(phonemize('John\'s package', true)).toEqual([ { - "phoneme": "ˈdʒɑnz", + "phoneme": "ˈdʒɑːnz", "position": 0, "word": "John's" }, @@ -33,12 +33,12 @@ describe('Index', function() { it('rule based or compound word', function() { expect(phonemize('buggie')).toEqual('ˈbʌɡi') - expect(phonemize('supercar')).toEqual('ˈsupɝˈkɑɹ') - expect(phonemize('pneumonoultramicroscopicsilicovolcanoconiosis')).toEqual('ˈnumoʊˈnoʊˈəɫtɹəˈˌmaɪkɹəskɑpɪkˈsiˈɫikoʊˈvɑɫkeɪnoʊˈkɑnˈaɪoʊˈsɪs') + expect(phonemize('supercar')).toEqual('ˈsuːpɝˈkɑːɹ') + expect(phonemize('pneumonoultramicroscopicsilicovolcanoconiosis')).toEqual('ˈnuːmoʊˈnoʊˈəɫtɹəˈˌmaɪkɹəskɑːpɪkˈsiːˈɫiːkoʊˈvɑːɫkeɪnoʊˈkɑːnˈaɪoʊˈsɪs') }) it('chinese', function() { - expect(phonemize('中文 TTS')).toEqual('ʈʂʊŋ˥˥ wən˧˥ ˈtiˈtiˈɛs') + expect(phonemize('中文 TTS')).toEqual('ʈʂʊŋ˥˥ wən˧˥ ˈtiːˈtiːˈɛs') expect(phonemize('中文的抑揚頓挫')).toEqual('ʈʂʊŋ˥˥ wən˧˥ tə˧ i˥˩ jɑŋ˧˥ tuən˥˩ tsʰuɔ˥˩') expect(phonemize('還原 還你 還是 還不是')).toEqual('xuan˧˥ juan˧˥ xuan˧˥ ni˧˩˧ xaɪ˧˥ ʂɨ˥˩ xaɪ˧˥ pu˥˩ ʂɨ˥˩') }) @@ -102,13 +102,13 @@ describe('Index', function() { it('Number processing', function() { // Basic number expansion tests expect(phonemize('5')).toEqual('ˈfaɪv') - expect(phonemize('123')).toEqual('ˈwən ˈhəndɝd ˈtwɛni ˈθɹi') + expect(phonemize('123')).toEqual('ˈwən ˈhəndɝd ˈtwɛni ˈθɹiː') }) it('Abbreviation expansion', function() { // Basic abbreviation tests expect(phonemize('Mr. Smith')).toContain('ˈmɪstɝ') - expect(phonemize('Dr. Johnson')).toContain('ˈdɑktɝ') + expect(phonemize('Dr. 
/**
 * IPA vowel symbols recognised when looking for rhotacized vowels.
 * Includes the rhotacized vowels ɚ and ɝ themselves so that they count as
 * vowels both when scanning a syllable onset and when deciding whether a
 * linking ɹ is needed.
 */
const IPA_VOWELS = 'ɑɒæəɔɛɜɪʊʌaeioɚuɝ';

// The patterns below depend only on IPA_VOWELS, so they are compiled once at
// module load instead of on every call. String.prototype.replace resets
// lastIndex on global regexes, so sharing them between calls is safe.

/** A rhotacized vowel (ɝ or ɚ) immediately followed by another vowel. */
const RHOTIC_BEFORE_VOWEL_RE = new RegExp(`[ɝɚ](?=[${IPA_VOWELS}])`, 'g');

/**
 * A stress mark, then any run of non-vowel, non-space, non-stress symbols
 * (the syllable onset), then ɝ — i.e. ɝ is the first vowel after the stress.
 */
const STRESSED_RHOTIC_RE = new RegExp(`([ˈˌ][^${IPA_VOWELS}\\sˈˌ]*)ɝ`, 'g');

/** Any ɝ left over once the stressed ones have been rewritten. */
const UNSTRESSED_RHOTIC_RE = /ɝ/g;

/**
 * Fix rhotacized vowel distinctions for English IPA output.
 *
 * The input uses ɝ for every rhotacized vowel; this rewrites it as:
 * - stressed NURSE vowel ɜː when ɝ is the first vowel after a stress mark
 *   (e.g. `ˈbɝd` → `ˈbɜːd`);
 * - unstressed rhotacized schwa ɚ everywhere else
 *   (e.g. `ˈdɑːktɝ` → `ˈdɑːktɚ`).
 *
 * When a rhotacized vowel is directly followed by another vowel, a linking ɹ
 * is inserted between them. This applies to the unstressed case
 * (`ˈsɛntʃɝiz` → `ˈsɛntʃɚɹiz`) and to the stressed case
 * (`ˈhɝi` → `ˈhɜːɹi`); without it the stressed form would lose its r sound
 * entirely once ɝ is replaced by ɜː.
 *
 * @param ipa - IPA string, possibly containing several space-separated words.
 * @returns The IPA string with ɝ replaced and linking ɹ inserted.
 */
function fixRhotacizedVowels(ipa: string): string {
  return (
    ipa
      // 1. Linking ɹ first, while stressed and unstressed ɝ are still the
      //    same symbol, so both variants receive it. `$&` is the matched vowel.
      .replace(RHOTIC_BEFORE_VOWEL_RE, '$&ɹ')
      // 2. Stressed ɝ → ɜː (first vowel after a stress mark).
      .replace(STRESSED_RHOTIC_RE, '$1ɜː')
      // 3. Every remaining ɝ is unstressed → ɚ.
      .replace(UNSTRESSED_RHOTIC_RE, 'ɚ')
  );
}