Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions __tests__/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ describe('Index', function() {
expect(phonemize('this is an apple.')).toEqual('ˈðɪs ˈɪz ˈæn ˈæpəɫ.')
expect(phonemize('John\'s package', true)).toEqual([
{
"phoneme": "ˈdʒɑnz",
"phoneme": "ˈdʒɑːnz",
"position": 0,
"word": "John's"
},
Expand All @@ -33,12 +33,12 @@ describe('Index', function() {

it('rule based or compound word', function() {
expect(phonemize('buggie')).toEqual('ˈbʌɡi')
expect(phonemize('supercar')).toEqual('ˈsupɝˈkɑɹ')
expect(phonemize('pneumonoultramicroscopicsilicovolcanoconiosis')).toEqual('ˈnumoʊˈnoʊˈəɫtɹəˈˌmaɪkɹəskɑpɪkˈsiˈɫikoʊˈvɑɫkeɪnoʊˈkɑnˈaɪoʊˈsɪs')
expect(phonemize('supercar')).toEqual('ˈsuːpɝˈkɑːɹ')
expect(phonemize('pneumonoultramicroscopicsilicovolcanoconiosis')).toEqual('ˈnuːmoʊˈnoʊˈəɫtɹəˈˌmaɪkɹəskɑːpɪkˈsiːˈɫiːkoʊˈvɑːɫkeɪnoʊˈkɑːnˈaɪoʊˈsɪs')
})

it('chinese', function() {
expect(phonemize('中文 TTS')).toEqual('ʈʂʊŋ˥˥ wən˧˥ ˈtiˈtiˈɛs')
expect(phonemize('中文 TTS')).toEqual('ʈʂʊŋ˥˥ wən˧˥ ˈtiːˈtiːˈɛs')
expect(phonemize('中文的抑揚頓挫')).toEqual('ʈʂʊŋ˥˥ wən˧˥ tə˧ i˥˩ jɑŋ˧˥ tuən˥˩ tsʰuɔ˥˩')
expect(phonemize('還原 還你 還是 還不是')).toEqual('xuan˧˥ juan˧˥ xuan˧˥ ni˧˩˧ xaɪ˧˥ ʂɨ˥˩ xaɪ˧˥ pu˥˩ ʂɨ˥˩')
})
Expand Down Expand Up @@ -102,13 +102,13 @@ describe('Index', function() {
it('Number processing', function() {
// Basic number expansion tests
expect(phonemize('5')).toEqual('ˈfaɪv')
expect(phonemize('123')).toEqual('ˈwən ˈhəndɝd ˈtwɛni ˈθɹi')
expect(phonemize('123')).toEqual('ˈwən ˈhəndɝd ˈtwɛni ˈθɹiː')
})

it('Abbreviation expansion', function() {
// Basic abbreviation tests
expect(phonemize('Mr. Smith')).toContain('ˈmɪstɝ')
expect(phonemize('Dr. Johnson')).toContain('ˈdɑktɝ')
expect(phonemize('Dr. Johnson')).toContain('ˈdɑːktɝ')
})

it('Custom tokenizer creation', function() {
Expand All @@ -125,7 +125,7 @@ describe('Index', function() {
})

it('Uppercase acronym processing', function() {
expect(phonemize('TTS')).toEqual('ˈtiˈtiˈɛs')
expect(phonemize('TTS')).toEqual('ˈtiːˈtiːˈɛs')
expect(phonemize('AI')).toEqual('ˈeɪaɪ')

expect(phonemize('Xyz')).not.toContain('ˌɛks')
Expand Down
11 changes: 9 additions & 2 deletions scripts/build-dict.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,15 @@ function parseDict(content: string): DictEntry {

let [, word, phonesStr] = match;

const ipa = phonesStr.match(/^\/([^\/]+)\//)?.[1];
if (!ipa) continue;
// Parse all pronunciation variants (format: /vɑr1/, /vɔr2/, ...)
const variants = [...phonesStr.matchAll(/\/([^\/]+)\//g)].map(m => m[1]);
if (variants.length === 0) continue;

// When multiple variants exist, prefer the one containing ɔ (THOUGHT vowel)
// over ɑ (LOT vowel). The ipa-dict source lists ɑ variants first for words
// like "caught", "bought", "law", "fall", "walk", etc., but ɔ better
// represents standard American English pronunciation for these words.
const ipa = variants.find(v => v.includes("ɔ")) ?? variants[0];
dict[word.toLowerCase()] = ipa;
}

Expand Down
32 changes: 32 additions & 0 deletions src/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,35 @@ import type ChineseG2P from "./zh-g2p";
// Tokenization regex (global). Each match is one token, taken from the first
// alternative that applies:
//   1. a run of characters in the ranges U+4E00–U+9FFF, U+3400–U+4DBF or
//      U+F900–U+FAFF (CJK ideograph blocks), kept together as a single token;
//   2. a run of `\w` characters, optionally followed by one apostrophe and
//      further `\w` characters (e.g. "John's");
//   3. any single character that is neither `\w`, whitespace, nor in the
//      ranges above (punctuation and other symbols).
// Whitespace is never captured, so it only acts as a separator.
const TOKEN_REGEX = /([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+|\w+['']?\w*|[^\w\s\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])/g;

// Vowel symbols recognised by the rhotacized-vowel fix-up below. This is the
// working set for that fix-up, not the complete IPA vowel inventory (symbols
// such as ɐ, y or ɨ are deliberately absent). Both r-colored vowels (ɚ, ɝ)
// are included so that they count as vowels when scanning a syllable.
const IPA_VOWELS = 'ɑɒæəɔɛɜɪʊʌaeioɚuɝ';

// A stress mark, then any run of characters that are not vowels, whitespace
// or another stress mark, then ɝ — i.e. ɝ is the first vowel after the mark.
// Built once at module load instead of on every call.
const STRESSED_RHOTIC_RE = new RegExp(`([ˈˌ][^${IPA_VOWELS}\\sˈˌ]*)ɝ`, 'g');

// An r-colored vowel (ɝ or ɚ) immediately followed by another vowel symbol.
const LINKING_R_RE = new RegExp(`[ɝɚ](?=[${IPA_VOWELS}])`, 'g');

/**
 * Fix rhotacized vowel distinctions for English IPA output.
 *
 * The G2P dictionary uses ɝ for all rhotacized vowels, but standard IPA
 * distinguishes:
 *   - Stressed NURSE vowel (ɜː): "bird" = bɜːd, "word" = wɜːd
 *   - Unstressed rhotacized schwa (ɚ): "doctor" = dɑːktɚ, "letter" = lɛtɚ
 *
 * Additionally, when a rhotacized vowel is directly followed by another
 * vowel, a linking ɹ consonant surfaces. This applies to the unstressed case
 * ("centuries" = sɛntʃɚɹiz) and to the stressed case ("furry" = fɜːɹi);
 * without it the stressed form would lose its /r/ entirely once ɝ has been
 * rewritten to the non-rhotic symbol ɜː.
 *
 * A rhotacized vowel is treated as stressed when it is the first vowel after
 * a primary (ˈ) or secondary (ˌ) stress mark, with no whitespace in between.
 *
 * @param ipa - IPA string, possibly containing several space-separated words
 * @returns the string with ɝ rewritten to ɜː / ɚ and linking ɹ inserted;
 *          input without ɝ or ɚ is returned unchanged
 */
function fixRhotacizedVowels(ipa: string): string {
  return (
    ipa
      // 1. Linking ɹ: done first, while ɝ is still present, so that stressed
      //    and unstressed rhotacized vowels are handled uniformly. The
      //    inserted ɹ always follows a vowel, so it cannot affect step 2.
      .replace(LINKING_R_RE, '$&ɹ')
      // 2. Stressed ɝ → ɜː (NURSE vowel: first vowel after a stress mark)
      .replace(STRESSED_RHOTIC_RE, '$1ɜː')
      // 3. Remaining (unstressed) ɝ → ɚ
      .replace(/ɝ/g, 'ɚ')
  );
}

/**
* Configuration options for tokenizer behavior
*/
Expand Down Expand Up @@ -266,6 +295,9 @@ export class Tokenizer {
} else {
// IPA format processing

// Fix rhotacized vowel distinctions (ɝ → ɜː stressed, ɚ unstressed)
phonemes = fixRhotacizedVowels(phonemes);

// Convert Chinese tone format if requested
if (this.options.toneFormat === "arrow") {
phonemes = convertChineseTonesToArrows(phonemes);
Expand Down
Loading