#!/usr/bin/env python3
"""
Example: Train a morphological analysis model and export dictionary files

This example demonstrates how to:
1. Train a model from a corpus using lindera.train()
2. Export dictionary files from the trained model using lindera.export()

Note: This requires the 'train' feature to be enabled when building lindera-python:
    maturin develop --features train
"""

import tempfile
from pathlib import Path

import lindera


def create_training_data(tmpdir: Path):
    """Create minimal training data based on lindera/resources/training format"""

    # Create seed lexicon (vocabulary with initial costs)
    # Format: surface,left_id,right_id,cost,features...
    seed_file = tmpdir / "seed.csv"
    seed_file.write_text(
        "外国,0,0,0,名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人,0,0,0,名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政,0,0,0,名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権,0,0,0,名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "これ,0,0,0,名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は,0,0,0,助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト,0,0,0,名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です,0,0,0,助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。,0,0,0,記号,句点,*,*,*,*,。,。,。\n"
        "形態,0,0,0,名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素,0,0,0,名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析,0,0,0,名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を,0,0,0,助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う,0,0,0,動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
    )
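
    # Optional sanity check, an addition of this example rather than anything
    # lindera requires: each seed row should have at least the four leading
    # columns (surface, left_id, right_id, cost) before the feature columns.
    for row in seed_file.read_text().splitlines():
        assert len(row.split(",")) >= 4, f"malformed seed row: {row}"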

    # Create character definition (defines character types)
    char_def_file = tmpdir / "char.def"
    char_def_file.write_text(
        "# Character definition for training\n"
        "DEFAULT 0 1 0\n"
        "HIRAGANA 1 1 0\n"
        "KATAKANA 1 1 0\n"
        "KANJI 0 0 2\n"
        "ALPHA 1 1 0\n"
        "NUMERIC 1 1 0\n"
        "\n"
        "# Character mappings (simplified)\n"
        "0x3041..0x3096 HIRAGANA\n"
        "0x30A1..0x30F6 KATAKANA\n"
        "0x4E00..0x9FAF KANJI\n"
        "0x0030..0x0039 NUMERIC\n"
        "0x0041..0x005A ALPHA\n"
        "0x0061..0x007A ALPHA\n"
    )
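
    # The category lines above follow the MeCab-style char.def layout
    # (CATEGORY INVOKE GROUP LENGTH), and the code-point ranges assign
    # characters to those categories. A quick self-check of one mapping:
    assert 0x3041 <= ord("あ") <= 0x3096  # "あ" falls in the HIRAGANA range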

    # Create unknown word definition (for out-of-vocabulary words)
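    # Each row is CATEGORY,left_id,right_id,cost,features..., with one entry
    # per character category declared in char.def (MeCab-style convention).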
    unk_def_file = tmpdir / "unk.def"
    unk_def_file.write_text(
        "# Unknown word definitions\n"
        "DEFAULT,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "HIRAGANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KATAKANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KANJI,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "ALPHA,0,0,0,名詞,固有名詞,一般,*,*,*,*,*,*\n"
        "NUMERIC,0,0,0,名詞,数,*,*,*,*,*,*,*\n"
    )

    # Create feature definition (defines features for CRF training)
    feature_def_file = tmpdir / "feature.def"
    feature_def_file.write_text(
        "# Feature template definitions for training\n"
        "# These define how features are extracted from the morphological data\n"
        "\n"
        "# Unigram features (word-level features)\n"
        "UNIGRAM U00:%F[0] # Part of speech\n"
        "UNIGRAM U01:%F[0],%F?[1] # POS + sub-category\n"
        "UNIGRAM U02:%F[0],%F[1],%F?[2] # POS hierarchy\n"
        "\n"
        "# Bigram features (transition features between words)\n"
        "# Format: BIGRAM label:%L[index]/%R[index]\n"
        "# %L = left context (previous word), %R = right context (next word)\n"
        "BIGRAM B00:%L[0]/%R[0] # POS-to-POS transition\n"
        "BIGRAM B01:%L[0],%L?[1]/%R[0] # Left POS hierarchy to right POS\n"
        "BIGRAM B02:%L[0]/%R[0],%R?[1] # Left POS to right POS hierarchy\n"
        "BIGRAM B03:%L[0],%L[1],%L?[2]/%R[0] # Detailed left to simple right\n"
    )
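
    # A note on the template syntax above (this reading assumes MeCab's
    # feature template conventions, which lindera's trainer format resembles):
    # %F[i] expands to the i-th feature column of a lexicon entry, and the
    # "?" variants such as %F?[1] skip the template when that column is
    # empty ("*"), so sparse fields do not emit spurious features.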

    # Create rewrite definition (for feature rewriting)
    rewrite_def_file = tmpdir / "rewrite.def"
    rewrite_def_file.write_text(
        "# Rewrite rules for feature normalization\n"
        "# Format: original_pattern\treplacement_pattern\n"
        "\n"
        '# Test rewrite: convert "名詞,一般" to "NOUN,GENERAL"\n'
        "名詞,一般\tNOUN,GENERAL\n"
        "\n"
        '# Test rewrite: convert "助詞,係助詞" to "PARTICLE,KAKUJOSHI"\n'
        "助詞,係助詞\tPARTICLE,KAKUJOSHI\n"
        "\n"
        "# Normalize numeric expressions\n"
        "数\tNUM\n"
    )

    # Create training corpus (annotated text)
    # Format: surface\tfeatures (tab-separated)
    # Each sentence ends with "EOS"
    corpus_file = tmpdir / "corpus.txt"
    corpus_file.write_text(
        "外国\t名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人\t名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政\t名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権\t名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "EOS\n"
        "\n"
        "これ\t名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト\t名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。\t記号,句点,*,*,*,*,。,。,。\n"
        "EOS\n"
        "\n"
        "形態\t名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素\t名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析\t名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を\t助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う\t動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
        "EOS\n"
    )
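
    # Optional sanity check: every training sentence must be terminated by an
    # "EOS" line; counting them catches truncated corpora early.
    n_sentences = corpus_file.read_text().count("EOS\n")
    assert n_sentences == 3, f"expected 3 sentences, found {n_sentences}"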

    # Create metadata for dictionary export
    metadata_file = tmpdir / "metadata.json"
    metadata_file.write_text(
        "{\n"
        '  "name": "custom-dict",\n'
        '  "version": "1.0.0",\n'
        '  "encoding": "utf-8"\n'
        "}\n"
    )

    return {
        "seed": seed_file,
        "char_def": char_def_file,
        "unk_def": unk_def_file,
        "feature_def": feature_def_file,
        "rewrite_def": rewrite_def_file,
        "corpus": corpus_file,
        "metadata": metadata_file,
    }


def main():
    """Main training and export workflow"""
    print("=== Lindera Training and Export Example ===\n")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        print(f"Working directory: {tmpdir}\n")

        # Step 1: Create training data
        print("Step 1: Creating training data...")
        files = create_training_data(tmpdir)
        print("✓ Training data created\n")

        # Step 2: Train model
        print("Step 2: Training model...")
        model_file = tmpdir / "model.dat"

        lindera.train(
            seed=str(files["seed"]),
            corpus=str(files["corpus"]),
            char_def=str(files["char_def"]),
            unk_def=str(files["unk_def"]),
            feature_def=str(files["feature_def"]),
            rewrite_def=str(files["rewrite_def"]),
            output=str(model_file),
            lambda_=0.01,  # L1 regularization
            max_iter=10,  # Number of training iterations
            max_threads=None,  # Auto-detect CPU cores
        )
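
        # Optional sanity check: confirm training produced a non-empty model
        # file before attempting to export it.
        assert model_file.exists() and model_file.stat().st_size > 0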

        print(f"✓ Model trained and saved to: {model_file}\n")

        # Step 3: Export dictionary files
        print("Step 3: Exporting dictionary files...")
        export_dir = tmpdir / "exported_dict"

        lindera.export(
            model=str(model_file),
            output=str(export_dir),
            metadata=str(files["metadata"]),
        )

        print(f"✓ Dictionary files exported to: {export_dir}\n")

        # Step 4: List exported files
        print("Step 4: Exported files:")
        exported_files = sorted(export_dir.glob("*"))
        for file in exported_files:
            size = file.stat().st_size
            print(f"  - {file.name} ({size:,} bytes)")
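
        # From here the exported files can be assembled into a usable
        # dictionary. The exact loading API depends on the lindera-python
        # build; the calls below are a hypothetical sketch, so verify them
        # against the bindings' documentation before use:
        #   dictionary = lindera.load_dictionary(str(export_dir))
        #   tokenizer = lindera.Tokenizer(lindera.Segmenter("normal", dictionary))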

        print("\n✓ Training and export completed successfully!")


if __name__ == "__main__":
    main()