Skip to content

Commit feaf125

Browse files
committed
Add Ukrainian stemmer
1 parent 940a1fe commit feaf125

File tree

3 files changed

+286
-1
lines changed

3 files changed

+286
-1
lines changed

bin/readable_sbl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,17 @@
2121

2222
def build_chars_map(sbl_file)
2323
chars_hash = {}
24+
exceptions = ["'"] # apostrophe
25+
2426
File.readlines(sbl_file).each do |line|
2527
char = line.match(/stringdef\s+(\S+)\s+'\{U\+(\S+)\}'/) # E.g.: stringdef zh '{U+0436}'
2628
next if char.nil?
2729

28-
chars_hash[char[1]] = '' << char[2].to_i(16) # extracts { 'zh': '0436' } => { 'zh': 'ж' }
30+
latin = char[1]
31+
utf = '' << char[2].to_i(16)
32+
next unless exceptions.index(utf).nil?
33+
34+
chars_hash[latin] = utf # extracts { 'zh': '0436' } => { 'zh': 'ж' }
2935
end
3036

3137
chars_hash

bin/utf_to_sbl

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/env ruby
2+
3+
# Replaces Unicode (Cyrillic) letters with their Latin `{name}` escapes and prints the resulting SBL file:
4+
# $ bin/utf_to_sbl ./explanations/ukrainian.sbl.utf
5+
6+
# Converts Unicode chars into Latin letters:
7+
#
8+
# define perfective_gerund as (
9+
# [substring] among (
10+
# 'вши'
11+
# 'вшись'
12+
# ...
13+
# =>
14+
# define perfective_gerund as (
15+
# [substring] among (
16+
# '{v}{sh}{i}'
17+
# '{v}{sh}{i}{s}{'}'
18+
# ...
19+
20+
21+
# Builds a lookup table from real (Unicode) letters to their Latin Snowball
# names by scanning the `stringdef` declarations of a Snowball source file.
#
# Only declarations whose code point is written as hexadecimal digits are
# accepted; anything else is ignored instead of being silently mapped to U+0000.
#
# @param utf_file [String] path to the Snowball source to scan
# @return [Hash{String => String}] real letter => Latin name, e.g. { 'ж' => 'zh' }
def build_chars_map(utf_file)
  # The apostrophe is never mapped; presumably because it also delimits
  # Snowball string literals, so rewriting it would corrupt them.
  exceptions = ["'"]

  # Read explicitly as UTF-8 so the result does not depend on the user's locale.
  File.foreach(utf_file, encoding: 'UTF-8').each_with_object({}) do |line, chars_hash|
    char = line.match(/stringdef\s+(\S+)\s+'\{U\+(\h+)\}'/) # E.g.: stringdef zh '{U+0436}'
    next if char.nil?

    latin = char[1]
    utf = [char[2].to_i(16)].pack('U') # hex code point => one-character UTF-8 string
    next if exceptions.include?(utf)

    chars_hash[utf] = latin # turns "stringdef zh '{U+0436}'" into { 'ж' => 'zh' }
  end
end
38+
39+
# Returns the contents of +utf_file+ with every letter found in +chars_map+
# replaced by its Latin name wrapped in braces (e.g. 'ж' => '{zh}').
#
# All letters are substituted in a single pass, so text inserted by one
# replacement is never rewritten by another one.
#
# @param utf_file [String] path to the Snowball source written with real letters
# @param chars_map [Hash{String => String}] real letter => Latin name
# @return [String] the whole converted file
def readable_sbl_file(utf_file, chars_map)
  # Longest keys first, so a longer key always wins over one of its prefixes.
  # With an empty map the union matches nothing and the text is returned as-is.
  letters = Regexp.union(chars_map.keys.sort_by { |key| -key.length })

  # Read explicitly as UTF-8 so the result does not depend on the user's locale.
  File.foreach(utf_file, encoding: 'UTF-8').map do |line|
    line.gsub(letters) { |real_letter| "{#{chars_map[real_letter]}}" }
  end.join
end
48+
49+
# Script entry point: takes the path of the '.utf' Snowball source as the first
# command-line argument and prints the converted file to stdout.
utf_file = ARGV.first.to_s

# `abort` prints the message to stderr and terminates with a failure status.
# (`puts(...) and exit` never exits: `puts` returns nil, so `exit` is skipped.)
abort("Run script with 'utf' file, e.g.: 'bin/utf_to_sbl ./explanations/ukrainian.sbl.utf'") if utf_file.empty?
abort("File '#{utf_file}' doesn't exist") unless File.exist?(utf_file)

chars_map = build_chars_map(utf_file)
puts readable_sbl_file(utf_file, chars_map)

explanations/ukrainian.sbl.utf

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
stringescapes {}
2+
3+
/* the 33 Ukrainian letters and apostrophe represented by single quote */
4+
5+
stringdef a '{U+0430}'
6+
stringdef b '{U+0431}'
7+
stringdef v '{U+0432}'
8+
stringdef gh '{U+0433}'
9+
stringdef g '{U+0491}'
10+
stringdef d '{U+0434}'
11+
stringdef e '{U+0435}'
12+
stringdef ye '{U+0454}'
13+
stringdef zh '{U+0436}'
14+
stringdef z '{U+0437}'
15+
stringdef y '{U+0438}'
16+
stringdef i '{U+0456}'
17+
stringdef yi '{U+0457}'
18+
stringdef i` '{U+0439}'
19+
stringdef k '{U+043A}'
20+
stringdef l '{U+043B}'
21+
stringdef m '{U+043C}'
22+
stringdef n '{U+043D}'
23+
stringdef o '{U+043E}'
24+
stringdef p '{U+043F}'
25+
stringdef r '{U+0440}'
26+
stringdef s '{U+0441}'
27+
stringdef t '{U+0442}'
28+
stringdef u '{U+0443}'
29+
stringdef f '{U+0444}'
30+
stringdef kh '{U+0445}'
31+
stringdef ts '{U+0446}'
32+
stringdef ch '{U+0447}'
33+
stringdef sh '{U+0448}'
34+
stringdef shch '{U+0449}'
35+
stringdef soft '{U+044C}'
36+
stringdef iu '{U+044E}'
37+
stringdef ia '{U+044F}'
38+
stringdef apostrophe '{U+0027}'
39+
40+
routines ( mark_regions R2
41+
// perfective_gerund
42+
adjective
43+
adjectival
44+
// reflexive
45+
// verb
46+
// noun
47+
// derivational
48+
// tidy_up
49+
)
50+
51+
externals ( stem )
52+
53+
integers ( pV p2 )
54+
55+
groupings ( v )
56+
57+
define v 'аеєиіїоуюя'
58+
59+
// Sets the marks pV and p2 used by the rest of the stemmer.
// Both start at `limit`; inside the `do`, pV is set right after the first
// vowel (grouping v), and p2 is set after the two following
// "up to a non-vowel, then vowel ... non-vowel" steps. If a `gopast` fails,
// the `do` swallows the failure and any mark not yet set stays at `limit`.
define mark_regions as (
60+
61+
$pV = limit
62+
$p2 = limit
63+
do (
64+
gopast v setmark pV gopast non-v
65+
gopast v gopast non-v setmark p2
66+
)
67+
)
68+
69+
backwardmode (
70+
71+
// R2 succeeds when the cursor is at or after the mark p2.
define R2 as $p2 <= cursor
72+
73+
// define perfective_gerund as (
74+
// [substring] among (
75+
// // 'в'
76+
// // 'вші'
77+
// // 'вшіс{'}'
78+
// // ('а' or 'я' delete)
79+
// // 'ів'
80+
// // 'івші'
81+
// // 'івшіс{'}'
82+
// // 'ив'
83+
// // 'ивші'
84+
// // 'ившіс{'}'
85+
// // (delete)
86+
// )
87+
// )
88+
89+
// Removes an adjective ending. Runs in backward mode (it is declared inside
// `backwardmode`), so `substring` matches at the end of the word.
// Only the ending 'ього' is active for now; the other endings below are
// still commented out.
define adjective as (
90+
[substring] among (
91+
'ього'
92+
(delete)
93+
// 'ее' 'іе' 'ие' 'ое' 'імі' 'имі'
94+
// 'ей' 'ій' 'ий' 'ой' 'ем' 'ім'
95+
// 'им' 'ом' 'еґо' 'оґо' 'ему'
96+
// 'ому' 'іх' 'их' 'ую' 'юю' 'ая'
97+
// 'яя'
98+
// // and -
99+
// 'ою' // - which is somewhat archaic
100+
// 'ею' // - soft form of ою
101+
// (delete)
102+
)
103+
)
104+
105+
// Removes an adjectival ending. At present this only calls `adjective`;
// the optional (`try`) participle-suffix removal below is commented out.
define adjectival as (
106+
adjective
107+
108+
/* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
109+
nn, юshch, shch, uюshch can be removed, with a small proportion of
110+
errors. Removing im, uem, enn creates too many errors.
111+
*/
112+
113+
// try (
114+
// [substring] among (
115+
// // 'ем' // present passive participle
116+
// // 'нн' // adjective from past passive participle
117+
// // 'вш' // past active participle
118+
// // 'ющ' 'щ' // present active participle
119+
// // ('а' or 'я' delete)
120+
121+
// // //but not 'ім' 'уем' // present passive participle
122+
// // //or 'енн' // adjective from past passive participle
123+
124+
// // 'івш' 'ивш'// past active participle
125+
// // 'ующ' // present active participle
126+
// // (delete)
127+
// )
128+
// )
129+
130+
)
131+
132+
// define reflexive as (
133+
// [substring] among (
134+
// // 'ся'
135+
// // 'с{'}'
136+
// // (delete)
137+
// )
138+
// )
139+
140+
// define verb as (
141+
// [substring] among (
142+
// // 'ла' 'на' 'ете' 'йте' 'лі' 'й'
143+
// // 'л' 'ем' 'н' 'ло' 'но' 'ет' 'ют'
144+
// // 'ни' 'т{'}' 'еш{'}'
145+
146+
// // 'нно'
147+
// // ('а' or 'я' delete)
148+
149+
// // 'іла' 'ила' 'ена' 'ейте'
150+
// // 'уйте' 'іте' 'ілі' 'илі' 'ей'
151+
// // 'уй' 'іл' 'ил' 'ім' 'им' 'ен'
152+
// // 'іло' 'ило' 'ено' 'ят' 'ует'
153+
// // 'уют' 'іт' 'ит' 'ени' 'іт{'}'
154+
// // 'ит{'}' 'іш{'}' 'ую' 'ю'
155+
// // (delete)
156+
// /* note the short passive participle tests:
157+
// 'на' 'н' 'но' 'ни'
158+
// 'ена' 'ен' 'ено' 'ени'
159+
// */
160+
// )
161+
// )
162+
163+
// define noun as (
164+
// [substring] among (
165+
// // 'а' 'ев' 'ов' 'іе' '{'}е' 'е'
166+
// // 'іямі' 'ямі' 'амі' 'еі' 'іі'
167+
// // 'і' 'іей' 'ей' 'ой' 'ій' 'й'
168+
// // 'іям' 'ям' 'іем' 'ем' 'ам' 'ом'
169+
// // 'о' 'у' 'ах' 'іях' 'ях' 'и' '{'}'
170+
// // 'ію' '{'}ю' 'ю' 'ія' '{'}я' 'я'
171+
// // (delete)
172+
// /* the small class of neuter forms 'ені' 'енем'
173+
// 'ена' 'ен' 'енам' 'енамі' 'ена{x}'
174+
// omitted - they only occur on 12 words.
175+
// */
176+
// )
177+
// )
178+
179+
// define derivational as (
180+
// [substring] R2 among (
181+
// // 'ост'
182+
// // 'ост{'}'
183+
// // (delete)
184+
// )
185+
// )
186+
187+
// define tidy_up as (
188+
// [substring] among (
189+
// // 'ейш'
190+
// // 'ейше' // superlative forms
191+
// // (delete
192+
// // ['н'] 'н' delete
193+
// // )
194+
// // 'н'
195+
// // ('н' delete) // e.g. -nno endings
196+
// // '{'}'
197+
// // (delete) // with some slight false conflations
198+
// )
199+
// )
200+
)
201+
202+
// Entry point of the stemmer (declared in `externals`).
// Computes the marks with mark_regions, then works backwards from the end of
// the word, limited to the part after pV, and tries `adjectival`.
// Every other step (perfective_gerund, reflexive, verb, noun, derivational,
// tidy_up) is commented out.
// NOTE(review): the "Normalise {e"}" comment below looks like a leftover from
// another stemmer this file was adapted from; confirm whether it applies here.
define stem as (
203+
204+
// Normalise {e"} to е. The documentation has long suggested the user
205+
// should do this before calling the stemmer - we now do it for them.
206+
// do repeat ( goto (['{e"}']) <- 'е' )
207+
208+
do mark_regions
209+
backwards setlimit tomark pV for (
210+
do (
211+
adjectival
212+
// perfective_gerund or
213+
// ( try reflexive
214+
// adjectival or verb or noun
215+
// )
216+
)
217+
// try([ 'і' ] delete)
218+
// because noun ending -iю is being treated as verb ending -ю
219+
220+
// do derivational
221+
// do tidy_up
222+
)
223+
)

0 commit comments

Comments
 (0)