Skip to content

Commit c59a79f

Browse files
committed
Add example
1 parent fa3142a commit c59a79f

File tree

3 files changed

+57
-1
lines changed

3 files changed

+57
-1
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
.idea/
22

33
# Byte-compiled / optimized / DLL files
4-
__pycache__/
4+
**/__pycache__/
55
*.py[cod]
66
*$py.class
77

examples/build_ipadic.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import tarfile
2+
import urllib.request
3+
4+
from lindera_py import Segmenter, Tokenizer, build_dictionary, load_dictionary, version
5+
6+
7+
def main():
    """Build a Lindera IPADIC dictionary from source and tokenize a sample sentence.

    The steps are: download the mecab-ipadic source archive into /tmp, unpack
    it there, build a Lindera dictionary from the unpacked sources, load that
    dictionary, and print the ``text`` of every token produced for a sample
    Japanese sentence.
    """
    # Every location this example reads from or writes to.
    archive_url = "https://lindera.dev/mecab-ipadic-2.7.0-20070801.tar.gz"
    archive_path = "/tmp/mecab-ipadic-2.7.0-20070801.tar.gz"
    source_path = "/tmp/mecab-ipadic-2.7.0-20070801"
    destination_path = "/tmp/lindera-ipadic-2.7.0-20070801"

    # urlretrieve() goes through the globally installed opener, so the
    # User-Agent header (sent to avoid a 403 error) is registered there.
    http_opener = urllib.request.build_opener()
    http_opener.addheaders = [("User-Agent", f"lindera-py/{version()}")]
    urllib.request.install_opener(http_opener)

    # Fetch the dictionary source archive.
    urllib.request.urlretrieve(archive_url, archive_path)

    # Unpack it under /tmp/ using tarfile's "data" extraction filter.
    with tarfile.open(archive_path, "r:gz") as archive:
        archive.extractall("/tmp/", filter="data")

    # Compile the unpacked sources into a Lindera dictionary.
    build_dictionary("ipadic", source_path, destination_path)

    # Load the freshly built dictionary and wire up segmenter -> tokenizer.
    tokenizer = Tokenizer(Segmenter("normal", load_dictionary(path=destination_path)))

    text = "関西国際空港限定トートバッグを東京スカイツリーの最寄り駅であるとうきょうスカイツリー駅で買う"
    print(f"text: {text}\n")

    # Emit one token per line.
    for token in tokenizer.tokenize(text):
        print(token.text)


if __name__ == "__main__":
    main()

src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ use crate::segmenter::PySegmenter;
1414
use crate::token::PyToken;
1515
use crate::tokenizer::{PyTokenizer, PyTokenizerBuilder};
1616

17+
#[pyfunction]
18+
pub fn version() -> String {
19+
env!("CARGO_PKG_VERSION").to_string()
20+
}
21+
1722
#[pymodule]
1823
fn lindera_py(module: &Bound<'_, PyModule>) -> PyResult<()> {
1924
module.add_class::<PyToken>()?;
@@ -28,5 +33,6 @@ fn lindera_py(module: &Bound<'_, PyModule>) -> PyResult<()> {
2833
module.add_function(wrap_pyfunction!(load_dictionary, module)?)?;
2934
module.add_function(wrap_pyfunction!(load_user_dictionary, module)?)?;
3035

36+
module.add_function(wrap_pyfunction!(version, module)?)?;
3137
Ok(())
3238
}

0 commit comments

Comments
 (0)