Commit 6d6225a

Add training and export functionality with Python bindings
1 parent 3c97bcd commit 6d6225a

File tree: 10 files changed, +1318 −129 lines

Cargo.lock

Lines changed: 475 additions & 48 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 5 additions & 3 deletions
@@ -1,6 +1,6 @@
 [package]
 name = "lindera-python"
-version = "1.1.0"
+version = "1.2.0"
 edition = "2024"
 description = "Python binding for Lindera."
 documentation = "https://docs.rs/lindera-python"
@@ -35,11 +35,13 @@ embedded-cc-cedict = [
 embedded-cjk = [
     "lindera/embedded-cjk",
 ] # Include CJK dictionary (CC-CEDICT, IPADIC, ko-dic)
+train = ["lindera/train"] # Enable training functionality
 default = [] # No directories included

 [dependencies]
 pyo3 = { version = "0.26.0", features = ["extension-module"] }
-serde = { version = "1.0.223", features = ["derive"] }
+serde = { version = "1.0.228", features = ["derive"] }
 serde_json = "1.0.145"
+num_cpus = "1.17.0"

-lindera = "1.2.0"
+lindera = { path = "../lindera/lindera" }
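
The new `train` feature is optional, so wheels built without it do not expose the training API. A minimal sketch (not part of this commit) of guarding for that at runtime, assuming the feature-gated `train()`/`export()` functions are simply absent from the module when the feature is disabled:

```python
import lindera

# Assumption: when lindera-python is built without the `train` feature,
# the feature-gated functions are not registered on the module at all.
if hasattr(lindera, "train") and hasattr(lindera, "export"):
    print("training support available")
else:
    print("rebuild with: maturin develop --features train")
```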

Makefile

Lines changed: 7 additions & 6 deletions
@@ -25,32 +25,33 @@ format: ## Format the project
 	poetry run black ./examples ./tests

 lint: ## Lint the project
-	cargo clippy --features=embedded-ipadic
+	cargo clippy --features=embedded-ipadic,train
 	poetry run isort --check-only --diff ./examples ./tests
 	poetry run black --check ./examples ./tests
 	poetry run flake8 ./examples ./tests
 	poetry run mypy ./examples ./tests

 develop: ## Build Python module in development mode and install it into the current Python environment
-	poetry run maturin develop --features=embedded-ipadic
+	poetry run maturin develop --features=embedded-ipadic,train

 build: ## Build the project
-	poetry run maturin build -i python --release --features=embedded-ipadic
+	poetry run maturin build -i python --release --features=embedded-ipadic,train

 .PHONY: tests
 test: ## Test the project
-	cargo test --features=embedded-ipadic
-	poetry run maturin develop --features=embedded-ipadic
+	cargo test --features=embedded-ipadic,train
+	poetry run maturin develop --features=embedded-ipadic,train
 	poetry run pytest -v ./tests

 .PHONY: run-examples
 run-examples: ## Run examples
-	poetry run maturin develop --features=embedded-ipadic
+	poetry run maturin develop --features=embedded-ipadic,train
 	poetry run python ./examples/build_ipadic.py
 	poetry run python ./examples/tokenize.py
 	poetry run python ./examples/tokenize_with_userdict.py
 	poetry run python ./examples/tokenize_with_decompose.py
 	poetry run python ./examples/tokenize_with_filters.py
+	poetry run python ./examples/train_and_export.py

 publish: ## Publish package to crates.io
 ifeq ($(shell curl -s -XGET -H "User-Agent: $(USER_AGENT) ($(USER)@$(HOSTNAME))" https://crates.io/api/v1/crates/lindera-python | jq -r '.versions[].num' | grep $(LINDERA_PYTHON_VERSION)),)

README.md

Lines changed: 58 additions & 0 deletions
@@ -21,6 +21,7 @@ lindera-python provides a comprehensive Python interface to the Lindera 1.1.1 mo
 - **CharacterFilter**: Pre-processing filters for text normalization
 - **TokenFilter**: Post-processing filters for token refinement
 - **Metadata & Schema**: Dictionary structure and configuration management
+- **Training & Export** (optional): Train custom morphological analysis models from corpus data

 ### Supported Dictionaries

@@ -228,6 +229,7 @@ See `examples/` directory for comprehensive examples including:
 - `tokenize.py`: Basic tokenization
 - `tokenize_with_filters.py`: Using character and token filters
 - `tokenize_with_userdict.py`: Custom user dictionary
+- `train_and_export.py`: Train and export custom dictionaries (requires `train` feature)
 - Multi-language tokenization
 - Advanced configuration options

@@ -251,6 +253,57 @@ See `examples/` directory for comprehensive examples including:
 - User dictionary support for domain-specific terms
 - CSV format for easy customization

+## Dictionary Training (Experimental)
+
+lindera-python supports training custom morphological analysis models from annotated corpus data when built with the `train` feature.
+
+### Building with Training Support
+
+```shell
+# Install with training support
+(.venv) % maturin develop --features train
+```
+
+### Training a Model
+
+```python
+import lindera
+
+# Train a model from corpus
+lindera.train(
+    seed="path/to/seed.csv",            # Seed lexicon
+    corpus="path/to/corpus.txt",        # Training corpus
+    char_def="path/to/char.def",        # Character definitions
+    unk_def="path/to/unk.def",          # Unknown word definitions
+    feature_def="path/to/feature.def",  # Feature templates
+    rewrite_def="path/to/rewrite.def",  # Rewrite rules
+    output="model.dat",                 # Output model file
+    lambda_=0.01,                       # L1 regularization
+    max_iter=100,                       # Max iterations
+    max_threads=None                    # Auto-detect CPU cores
+)
+```
+
+### Exporting Dictionary Files
+
+```python
+# Export trained model to dictionary files
+lindera.export(
+    model="model.dat",         # Trained model
+    output="exported_dict/",   # Output directory
+    metadata="metadata.json"   # Optional metadata file
+)
+```
+
+This will create:
+- `lex.csv`: Lexicon file
+- `matrix.def`: Connection cost matrix
+- `unk.def`: Unknown word definitions
+- `char.def`: Character definitions
+- `metadata.json`: Dictionary metadata (if provided)
+
+See `examples/train_and_export.py` for a complete example.
+
 ## API Reference

 ### Core Classes
@@ -263,4 +316,9 @@ See `examples/` directory for comprehensive examples including:
 - `Metadata`: Dictionary metadata and configuration
 - `Schema`: Dictionary schema definition

+### Training Functions (requires `train` feature)
+
+- `train()`: Train a morphological analysis model from corpus
+- `export()`: Export trained model to dictionary files
+
 See the `test_basic.py` file for comprehensive API usage examples.
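
The exported `lex.csv` can be sanity-checked with the standard library alone. A small sketch (not part of this commit), assuming the exported lexicon follows the same `surface,left_id,right_id,cost,features...` layout as the seed lexicon used for training:

```python
import csv
from pathlib import Path

export_dir = Path("exported_dict")  # directory passed to lindera.export()

# Assumption: each row is surface,left_id,right_id,cost followed by feature columns.
with (export_dir / "lex.csv").open(encoding="utf-8") as f:
    rows = list(csv.reader(f))

print(f"{len(rows)} lexicon entries")
for surface, _left_id, _right_id, cost, *features in rows[:5]:
    print(f"{surface}: cost={cost}, features={','.join(features)}")
```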

examples/train_and_export.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
Example: Train a morphological analysis model and export dictionary files

This example demonstrates how to:
1. Train a model from a corpus using lindera.train()
2. Export dictionary files from the trained model using lindera.export()

Note: This requires the 'train' feature to be enabled when building lindera-python:
    maturin develop --features train
"""

import tempfile
from pathlib import Path

import lindera


def create_training_data(tmpdir: Path):
    """Create minimal training data based on lindera/resources/training format"""

    # Create seed lexicon (vocabulary with initial costs)
    # Format: surface,left_id,right_id,cost,features...
    seed_file = tmpdir / "seed.csv"
    seed_file.write_text(
        "外国,0,0,0,名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人,0,0,0,名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政,0,0,0,名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権,0,0,0,名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "これ,0,0,0,名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は,0,0,0,助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト,0,0,0,名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です,0,0,0,助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。,0,0,0,記号,句点,*,*,*,*,。,。,。\n"
        "形態,0,0,0,名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素,0,0,0,名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析,0,0,0,名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を,0,0,0,助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う,0,0,0,動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
    )

    # Create character definition (defines character types)
    char_def_file = tmpdir / "char.def"
    char_def_file.write_text(
        "# Character definition for training\n"
        "DEFAULT 0 1 0\n"
        "HIRAGANA 1 1 0\n"
        "KATAKANA 1 1 0\n"
        "KANJI 0 0 2\n"
        "ALPHA 1 1 0\n"
        "NUMERIC 1 1 0\n"
        "\n"
        "# Character mappings (simplified)\n"
        "0x3041..0x3096 HIRAGANA\n"
        "0x30A1..0x30F6 KATAKANA\n"
        "0x4E00..0x9FAF KANJI\n"
        "0x0030..0x0039 NUMERIC\n"
        "0x0041..0x005A ALPHA\n"
        "0x0061..0x007A ALPHA\n"
    )

    # Create unknown word definition (for out-of-vocabulary words)
    unk_def_file = tmpdir / "unk.def"
    unk_def_file.write_text(
        "# Unknown word definitions\n"
        "DEFAULT,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "HIRAGANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KATAKANA,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "KANJI,0,0,0,名詞,一般,*,*,*,*,*,*,*\n"
        "ALPHA,0,0,0,名詞,固有名詞,一般,*,*,*,*,*,*\n"
        "NUMERIC,0,0,0,名詞,数,*,*,*,*,*,*,*\n"
    )

    # Create feature definition (defines features for CRF training)
    feature_def_file = tmpdir / "feature.def"
    feature_def_file.write_text(
        "# Feature template definitions for training\n"
        "# These define how features are extracted from the morphological data\n"
        "\n"
        "# Unigram features (word-level features)\n"
        "UNIGRAM U00:%F[0] # Part of speech\n"
        "UNIGRAM U01:%F[0],%F?[1] # POS + sub-category\n"
        "UNIGRAM U02:%F[0],%F[1],%F?[2] # POS hierarchy\n"
        "\n"
        "# Bigram features (transition features between words)\n"
        "# Format: BIGRAM label:%L[index]/%R[index]\n"
        "# %L = left context (previous word), %R = right context (next word)\n"
        "BIGRAM B00:%L[0]/%R[0] # POS-to-POS transition\n"
        "BIGRAM B01:%L[0],%L?[1]/%R[0] # Left POS hierarchy to right POS\n"
        "BIGRAM B02:%L[0]/%R[0],%R?[1] # Left POS to right POS hierarchy\n"
        "BIGRAM B03:%L[0],%L[1],%L?[2]/%R[0] # Detailed left to simple right\n"
    )

    # Create rewrite definition (for feature rewriting)
    rewrite_def_file = tmpdir / "rewrite.def"
    rewrite_def_file.write_text(
        "# Rewrite rules for feature normalization\n"
        "# Format: original_pattern\treplacement_pattern\n"
        "\n"
        '# Test rewrite: convert "名詞,一般" to "NOUN,GENERAL"\n'
        "名詞,一般\tNOUN,GENERAL\n"
        "\n"
        '# Test rewrite: convert "助詞,係助詞" to "PARTICLE,KAKUJOSHI"\n'
        "助詞,係助詞\tPARTICLE,KAKUJOSHI\n"
        "\n"
        "# Normalize numeric expressions\n"
        "数\tNUM\n"
    )

    # Create training corpus (annotated text)
    # Format: surface\tfeatures (tab-separated)
    # Each sentence ends with "EOS"
    corpus_file = tmpdir / "corpus.txt"
    corpus_file.write_text(
        "外国\t名詞,一般,*,*,*,*,外国,ガイコク,ガイコク\n"
        "人\t名詞,接尾,一般,*,*,*,人,ジン,ジン\n"
        "参政\t名詞,サ変接続,*,*,*,*,参政,サンセイ,サンセイ\n"
        "権\t名詞,接尾,一般,*,*,*,権,ケン,ケン\n"
        "EOS\n"
        "\n"
        "これ\t名詞,代名詞,一般,*,*,*,これ,コレ,コレ\n"
        "は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
        "テスト\t名詞,サ変接続,*,*,*,*,テスト,テスト,テスト\n"
        "です\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\n"
        "。\t記号,句点,*,*,*,*,。,。,。\n"
        "EOS\n"
        "\n"
        "形態\t名詞,一般,*,*,*,*,形態,ケイタイ,ケイタイ\n"
        "素\t名詞,接尾,一般,*,*,*,素,ソ,ソ\n"
        "解析\t名詞,サ変接続,*,*,*,*,解析,カイセキ,カイセキ\n"
        "を\t助詞,格助詞,一般,*,*,*,を,ヲ,ヲ\n"
        "行う\t動詞,自立,*,*,五段・ワ行促音便,基本形,行う,オコナウ,オコナウ\n"
        "EOS\n"
    )

    # Create metadata for dictionary export
    metadata_file = tmpdir / "metadata.json"
    metadata_file.write_text(
        "{\n"
        ' "name": "custom-dict",\n'
        ' "version": "1.0.0",\n'
        ' "encoding": "utf-8"\n'
        "}\n"
    )

    return {
        "seed": seed_file,
        "char_def": char_def_file,
        "unk_def": unk_def_file,
        "feature_def": feature_def_file,
        "rewrite_def": rewrite_def_file,
        "corpus": corpus_file,
        "metadata": metadata_file,
    }


def main():
    """Main training and export workflow"""
    print("=== Lindera Training and Export Example ===\n")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        print(f"Working directory: {tmpdir}\n")

        # Step 1: Create training data
        print("Step 1: Creating training data...")
        files = create_training_data(tmpdir)
        print("✓ Training data created\n")

        # Step 2: Train model
        print("Step 2: Training model...")
        model_file = tmpdir / "model.dat"

        lindera.train(
            seed=str(files["seed"]),
            corpus=str(files["corpus"]),
            char_def=str(files["char_def"]),
            unk_def=str(files["unk_def"]),
            feature_def=str(files["feature_def"]),
            rewrite_def=str(files["rewrite_def"]),
            output=str(model_file),
            lambda_=0.01,  # L1 regularization
            max_iter=10,  # Number of training iterations
            max_threads=None,  # Auto-detect CPU cores
        )

        print(f"✓ Model trained and saved to: {model_file}\n")

        # Step 3: Export dictionary files
        print("Step 3: Exporting dictionary files...")
        export_dir = tmpdir / "exported_dict"

        lindera.export(
            model=str(model_file),
            output=str(export_dir),
            metadata=str(files["metadata"]),
        )

        print(f"✓ Dictionary files exported to: {export_dir}\n")

        # Step 4: List exported files
        print("Step 4: Exported files:")
        exported_files = sorted(export_dir.glob("*"))
        for file in exported_files:
            size = file.stat().st_size
            print(f"  - {file.name} ({size:,} bytes)")

        print("\n✓ Training and export completed successfully!")


if __name__ == "__main__":
    main()
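
Because the Makefile already runs `pytest` in its test target, the same workflow could be covered by a smoke test that skips cleanly on wheels built without the `train` feature. A hypothetical sketch (the file name, repository layout, and skip condition are assumptions, not part of this commit):

```python
# tests/test_train_and_export.py -- hypothetical smoke test
import sys
from pathlib import Path

import pytest

import lindera

# Assumption: train()/export() are absent when the wheel was built without
# the `train` feature, so the whole module is skipped in that case.
pytestmark = pytest.mark.skipif(
    not hasattr(lindera, "train"), reason="train feature not enabled"
)

# Make the example's helper importable; assumes the repository layout above.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "examples"))
from train_and_export import create_training_data  # noqa: E402


def test_train_and_export(tmp_path):
    files = create_training_data(tmp_path)
    model = tmp_path / "model.dat"
    lindera.train(
        seed=str(files["seed"]),
        corpus=str(files["corpus"]),
        char_def=str(files["char_def"]),
        unk_def=str(files["unk_def"]),
        feature_def=str(files["feature_def"]),
        rewrite_def=str(files["rewrite_def"]),
        output=str(model),
        lambda_=0.01,
        max_iter=2,  # keep the smoke test fast
        max_threads=None,
    )
    export_dir = tmp_path / "exported_dict"
    lindera.export(
        model=str(model),
        output=str(export_dir),
        metadata=str(files["metadata"]),
    )
    assert (export_dir / "lex.csv").exists()
```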
