Commit 6d6225a

Add training and export functionality with Python bindings
1 parent 3c97bcd commit 6d6225a

File tree: 10 files changed, +1318 −129 lines

Cargo.lock

Lines changed: 475 additions & 48 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 5 additions & 3 deletions
@@ -1,6 +1,6 @@
 [package]
 name = "lindera-python"
-version = "1.1.0"
+version = "1.2.0"
 edition = "2024"
 description = "Python binding for Lindera."
 documentation = "https://docs.rs/lindera-python"
@@ -35,11 +35,13 @@ embedded-cc-cedict = [
 embedded-cjk = [
     "lindera/embedded-cjk",
 ] # Include CJK dictionary (CC-CEDICT, IPADIC, ko-dic)
+train = ["lindera/train"] # Enable training functionality
 default = [] # No directories included

 [dependencies]
 pyo3 = { version = "0.26.0", features = ["extension-module"] }
-serde = { version = "1.0.223", features = ["derive"] }
+serde = { version = "1.0.228", features = ["derive"] }
 serde_json = "1.0.145"
+num_cpus = "1.17.0"

-lindera = "1.2.0"
+lindera = { path = "../lindera/lindera" }
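
The new `train` feature is optional, so wheels built without it do not expose the training API. A minimal sketch (not part of this commit) of guarding for that at runtime, assuming the feature-gated `train()`/`export()` functions are simply absent from the module when the feature is disabled:

```python
import lindera

# Assumption: when lindera-python is built without the `train` feature,
# the feature-gated functions are not registered on the module at all.
if hasattr(lindera, "train") and hasattr(lindera, "export"):
    print("training support available")
else:
    print("rebuild with: maturin develop --features train")
```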

Makefile

Lines changed: 7 additions & 6 deletions
@@ -25,32 +25,33 @@ format: ## Format the project
 	poetry run black ./examples ./tests

 lint: ## Lint the project
-	cargo clippy --features=embedded-ipadic
+	cargo clippy --features=embedded-ipadic,train
 	poetry run isort --check-only --diff ./examples ./tests
 	poetry run black --check ./examples ./tests
 	poetry run flake8 ./examples ./tests
 	poetry run mypy ./examples ./tests

 develop: ## Build Python module in development mode and install it into the current Python environment
-	poetry run maturin develop --features=embedded-ipadic
+	poetry run maturin develop --features=embedded-ipadic,train

 build: ## Build the project
-	poetry run maturin build -i python --release --features=embedded-ipadic
+	poetry run maturin build -i python --release --features=embedded-ipadic,train

 .PHONY: tests
 test: ## Test the project
-	cargo test --features=embedded-ipadic
-	poetry run maturin develop --features=embedded-ipadic
+	cargo test --features=embedded-ipadic,train
+	poetry run maturin develop --features=embedded-ipadic,train
 	poetry run pytest -v ./tests

 .PHONY: run-examples
 run-examples: ## Run examples
-	poetry run maturin develop --features=embedded-ipadic
+	poetry run maturin develop --features=embedded-ipadic,train
 	poetry run python ./examples/build_ipadic.py
 	poetry run python ./examples/tokenize.py
 	poetry run python ./examples/tokenize_with_userdict.py
 	poetry run python ./examples/tokenize_with_decompose.py
 	poetry run python ./examples/tokenize_with_filters.py
+	poetry run python ./examples/train_and_export.py

 publish: ## Publish package to crates.io
 ifeq ($(shell curl -s -XGET -H "User-Agent: $(USER_AGENT) ($(USER)@$(HOSTNAME))" https://crates.io/api/v1/crates/lindera-python | jq -r '.versions[].num' | grep $(LINDERA_PYTHON_VERSION)),)

README.md

Lines changed: 58 additions & 0 deletions
@@ -21,6 +21,7 @@ lindera-python provides a comprehensive Python interface to the Lindera 1.1.1 mo
 - **CharacterFilter**: Pre-processing filters for text normalization
 - **TokenFilter**: Post-processing filters for token refinement
 - **Metadata & Schema**: Dictionary structure and configuration management
+- **Training & Export** (optional): Train custom morphological analysis models from corpus data

 ### Supported Dictionaries

@@ -228,6 +229,7 @@ See `examples/` directory for comprehensive examples including:
 - `tokenize.py`: Basic tokenization
 - `tokenize_with_filters.py`: Using character and token filters
 - `tokenize_with_userdict.py`: Custom user dictionary
+- `train_and_export.py`: Train and export custom dictionaries (requires `train` feature)
 - Multi-language tokenization
 - Advanced configuration options

@@ -251,6 +253,57 @@ See `examples/` directory for comprehensive examples including:
 - User dictionary support for domain-specific terms
 - CSV format for easy customization

+## Dictionary Training (Experimental)
+
+lindera-python supports training custom morphological analysis models from annotated corpus data when built with the `train` feature.
+
+### Building with Training Support
+
+```shell
+# Install with training support
+(.venv) % maturin develop --features train
+```
+
+### Training a Model
+
+```python
+import lindera
+
+# Train a model from corpus
+lindera.train(
+    seed="path/to/seed.csv",            # Seed lexicon
+    corpus="path/to/corpus.txt",        # Training corpus
+    char_def="path/to/char.def",        # Character definitions
+    unk_def="path/to/unk.def",          # Unknown word definitions
+    feature_def="path/to/feature.def",  # Feature templates
+    rewrite_def="path/to/rewrite.def",  # Rewrite rules
+    output="model.dat",                 # Output model file
+    lambda_=0.01,                       # L1 regularization
+    max_iter=100,                       # Max iterations
+    max_threads=None                    # Auto-detect CPU cores
+)
+```
+
+### Exporting Dictionary Files
+
+```python
+# Export trained model to dictionary files
+lindera.export(
+    model="model.dat",         # Trained model
+    output="exported_dict/",   # Output directory
+    metadata="metadata.json"   # Optional metadata file
+)
+```
+
+This will create:
+- `lex.csv`: Lexicon file
+- `matrix.def`: Connection cost matrix
+- `unk.def`: Unknown word definitions
+- `char.def`: Character definitions
+- `metadata.json`: Dictionary metadata (if provided)
+
+See `examples/train_and_export.py` for a complete example.
+
 ## API Reference

 ### Core Classes
@@ -263,4 +316,9 @@ See `examples/` directory for comprehensive examples including:
 - `Metadata`: Dictionary metadata and configuration
 - `Schema`: Dictionary schema definition

+### Training Functions (requires `train` feature)
+
+- `train()`: Train a morphological analysis model from corpus
+- `export()`: Export trained model to dictionary files
+
 See the `test_basic.py` file for comprehensive API usage examples.
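
The exported `lex.csv` can be sanity-checked with the standard library alone. A small sketch (not part of this commit), assuming the exported lexicon follows the same `surface,left_id,right_id,cost,features...` layout as the seed lexicon used for training:

```python
import csv
from pathlib import Path

export_dir = Path("exported_dict")  # directory passed to lindera.export()

# Assumption: each row is surface,left_id,right_id,cost followed by feature columns.
with (export_dir / "lex.csv").open(encoding="utf-8") as f:
    rows = list(csv.reader(f))

print(f"{len(rows)} lexicon entries")
for surface, _left_id, _right_id, cost, *features in rows[:5]:
    print(f"{surface}: cost={cost}, features={','.join(features)}")
```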

examples/train_and_export.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
Example: Train a morphological analysis model and export dictionary files

This example demonstrates how to:
1. Train a model from a corpus using lindera.train()
2. Export dictionary files from the trained model using lindera.export()

Note: This requires the 'train' feature to be enabled when building lindera-python:
    maturin develop --features train
"""

import tempfile
from pathlib import Path

import lindera


def create_training_data(tmpdir: Path):
    """Create minimal training data based on lindera/resources/training format"""

    # Create seed lexicon (vocabulary with initial costs)
    # Format: surface,left_id,right_id,cost,features...
    seed_file = tmpdir / "seed.csv"
    seed_file.write_text(
        "外国,0,0,0,名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人,0,0,0,名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政,0,0,0,名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権,0,0,0,名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "これ,0,0,0,名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は,0,0,0,助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト,0,0,0,名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です,0,0,0,助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。,0,0,0,記号,句点,*,*,*,*,。,。,。\n"
        "形態,0,0,0,名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素,0,0,0,名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析,0,0,0,名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を,0,0,0,助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う,0,0,0,動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
    )

    # Create character definition (defines character types)
    char_def_file = tmpdir / "char.def"
    char_def_file.write_text(
        "# Character definition for training\n"
        "DEFAULT 0 1 0\n"
        "HIRAGANA 1 1 0\n"
        "KATAKANA 1 1 0\n"
        "KANJI 0 0 2\n"
        "ALPHA 1 1 0\n"
        "NUMERIC 1 1 0\n"
        "\n"
        "# Character mappings (simplified)\n"
        "0x3041..0x3096 HIRAGANA\n"
        "0x30A1..0x30F6 KATAKANA\n"
        "0x4E00..0x9FAF KANJI\n"
        "0x0030..0x0039 NUMERIC\n"
        "0x0041..0x005A ALPHA\n"
        "0x0061..0x007A ALPHA\n"
    )

    # Create unknown word definition (for out-of-vocabulary words)
    unk_def_file = tmpdir / "unk.def"
    unk_def_file.write_text(
        "# Unknown word definitions\n"
        "DEFAULT,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "HIRAGANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KATAKANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KANJI,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "ALPHA,0,0,0,名詞,固有名詞,一般,*,*,*,*,*,*\n"
        "NUMERIC,0,0,0,名詞,数,*,*,*,*,*,*,*\n"
    )

    # Create feature definition (defines features for CRF training)
    feature_def_file = tmpdir / "feature.def"
    feature_def_file.write_text(
        "# Feature template definitions for training\n"
        "# These define how features are extracted from the morphological data\n"
        "\n"
        "# Unigram features (word-level features)\n"
        "UNIGRAM U00:%F[0] # Part of speech\n"
        "UNIGRAM U01:%F[0],%F?[1] # POS + sub-category\n"
        "UNIGRAM U02:%F[0],%F[1],%F?[2] # POS hierarchy\n"
        "\n"
        "# Bigram features (transition features between words)\n"
        "# Format: BIGRAM label:%L[index]/%R[index]\n"
        "# %L = left context (previous word), %R = right context (next word)\n"
        "BIGRAM B00:%L[0]/%R[0] # POS-to-POS transition\n"
        "BIGRAM B01:%L[0],%L?[1]/%R[0] # Left POS hierarchy to right POS\n"
        "BIGRAM B02:%L[0]/%R[0],%R?[1] # Left POS to right POS hierarchy\n"
        "BIGRAM B03:%L[0],%L[1],%L?[2]/%R[0] # Detailed left to simple right\n"
    )

    # Create rewrite definition (for feature rewriting)
    rewrite_def_file = tmpdir / "rewrite.def"
    rewrite_def_file.write_text(
        "# Rewrite rules for feature normalization\n"
        "# Format: original_pattern\treplacement_pattern\n"
        "\n"
        '# Test rewrite: convert "名詞,一般" to "NOUN,GENERAL"\n'
        "名詞,一般\tNOUN,GENERAL\n"
        "\n"
        '# Test rewrite: convert "助詞,係助詞" to "PARTICLE,KAKUJOSHI"\n'
        "助詞,係助詞\tPARTICLE,KAKUJOSHI\n"
        "\n"
        "# Normalize numeric expressions\n"
        "数\tNUM\n"
    )

    # Create training corpus (annotated text)
    # Format: surface\tfeatures (tab-separated)
    # Each sentence ends with "EOS"
    corpus_file = tmpdir / "corpus.txt"
    corpus_file.write_text(
        "外国\t名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人\t名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政\t名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権\t名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "EOS\n"
        "\n"
        "これ\t名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト\t名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。\t記号,句点,*,*,*,*,。,。,。\n"
        "EOS\n"
        "\n"
        "形態\t名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素\t名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析\t名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を\t助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う\t動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
        "EOS\n"
    )

    # Create metadata for dictionary export
    metadata_file = tmpdir / "metadata.json"
    metadata_file.write_text(
        "{\n"
        ' "name": "custom-dict",\n'
        ' "version": "1.0.0",\n'
        ' "encoding": "utf-8"\n'
        "}\n"
    )

    return {
        "seed": seed_file,
        "char_def": char_def_file,
        "unk_def": unk_def_file,
        "feature_def": feature_def_file,
        "rewrite_def": rewrite_def_file,
        "corpus": corpus_file,
        "metadata": metadata_file,
    }


def main():
    """Main training and export workflow"""
    print("=== Lindera Training and Export Example ===\n")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        print(f"Working directory: {tmpdir}\n")

        # Step 1: Create training data
        print("Step 1: Creating training data...")
        files = create_training_data(tmpdir)
        print("✓ Training data created\n")

        # Step 2: Train model
        print("Step 2: Training model...")
        model_file = tmpdir / "model.dat"

        lindera.train(
            seed=str(files["seed"]),
            corpus=str(files["corpus"]),
            char_def=str(files["char_def"]),
            unk_def=str(files["unk_def"]),
            feature_def=str(files["feature_def"]),
            rewrite_def=str(files["rewrite_def"]),
            output=str(model_file),
            lambda_=0.01,  # L1 regularization
            max_iter=10,  # Number of training iterations
            max_threads=None,  # Auto-detect CPU cores
        )

        print(f"✓ Model trained and saved to: {model_file}\n")

        # Step 3: Export dictionary files
        print("Step 3: Exporting dictionary files...")
        export_dir = tmpdir / "exported_dict"

        lindera.export(
            model=str(model_file),
            output=str(export_dir),
            metadata=str(files["metadata"]),
        )

        print(f"✓ Dictionary files exported to: {export_dir}\n")

        # Step 4: List exported files
        print("Step 4: Exported files:")
        exported_files = sorted(export_dir.glob("*"))
        for file in exported_files:
            size = file.stat().st_size
            print(f"  - {file.name} ({size:,} bytes)")

        print("\n✓ Training and export completed successfully!")


if __name__ == "__main__":
    main()
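
Because the Makefile already runs `pytest` in its test target, the same workflow could be covered by a smoke test that skips cleanly on wheels built without the `train` feature. A hypothetical sketch (the file name, repository layout, and skip condition are assumptions, not part of this commit):

```python
# tests/test_train_and_export.py -- hypothetical smoke test
import sys
from pathlib import Path

import pytest

import lindera

# Assumption: train()/export() are absent when the wheel was built without
# the `train` feature, so the whole module is skipped in that case.
pytestmark = pytest.mark.skipif(
    not hasattr(lindera, "train"), reason="train feature not enabled"
)

# Make the example's helper importable; assumes the repository layout above.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "examples"))
from train_and_export import create_training_data  # noqa: E402


def test_train_and_export(tmp_path):
    files = create_training_data(tmp_path)
    model = tmp_path / "model.dat"
    lindera.train(
        seed=str(files["seed"]),
        corpus=str(files["corpus"]),
        char_def=str(files["char_def"]),
        unk_def=str(files["unk_def"]),
        feature_def=str(files["feature_def"]),
        rewrite_def=str(files["rewrite_def"]),
        output=str(model),
        lambda_=0.01,
        max_iter=2,  # keep the smoke test fast
        max_threads=None,
    )
    export_dir = tmp_path / "exported_dict"
    lindera.export(
        model=str(model),
        output=str(export_dir),
        metadata=str(files["metadata"]),
    )
    assert (export_dir / "lex.csv").exists()
```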
