Skip to content

Commit 4684fa7

Browse files
phddy and heedocpark authored
update Kiwi to v0.21.0 (#36)
* update Kiwi to v0.21.0
* fix: CI 빌드 및 테스트 이슈 해결
  - 모델 경로 변경
  - 테스트 실패 수정

---------

Co-authored-by: Heedoc Park <[email protected]>
1 parent ef73aeb commit 4684fa7

File tree

7 files changed

+176
-99
lines changed

7 files changed

+176
-99
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,7 @@
1717

1818
__debug_bin
1919

20-
ModelGenerator/
20+
base/
2121
.idea/
22+
models/
23+
include/

Makefile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
KIWI_VERSION := "v0.10.3"
1+
KIWI_VERSION := v0.21.0
22

33
.PHONY: test
4-
test: ModelGenerator/default.dict
4+
test: base/default.dict
55
go test ./...
66

7-
ModelGenerator/default.dict:
8-
curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION).tgz --output model.tgz
9-
tar -xzvf model.tgz
7+
base/default.dict:
8+
curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION)_base.tgz --output model.tgz
9+
tar --no-same-owner -xzvf model.tgz
1010
rm -f model.tgz
1111

1212

@@ -17,7 +17,7 @@ install-kiwi:
1717
.PHONY: clean
1818
clean:
1919
rm -f model.tgz
20-
rm -rf ./ModelGenerator
20+
rm -rf ./base
2121

2222
.PHONY: format
2323
format:

kiwi.go

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
package kiwi
33

44
/*
5-
#cgo LDFLAGS: -l kiwi
5+
#cgo CFLAGS: -I/usr/local/include
6+
#cgo LDFLAGS: -Wl,-rpath,/usr/local/lib
7+
68
#include <stdlib.h>
79
#include <string.h>
810
#include <stdint.h> // for uintptr_t
@@ -14,6 +16,7 @@ extern int KiwiReaderBridge(int lineNumber, char *buffer, void *userData);
1416
import "C"
1517

1618
import (
19+
"fmt"
1720
"io"
1821
"runtime/cgo"
1922
"unsafe"
@@ -91,17 +94,36 @@ type TokenResult struct {
9194

9295
// Analyze returns the result of the analysis.
9396
func (k *Kiwi) Analyze(text string, topN int, options AnalyzeOption) ([]TokenResult, error) {
94-
kiwiResH := C.kiwi_analyze(k.handler, C.CString(text), C.int(topN), C.int(options))
97+
var (
98+
blocklist C.kiwi_morphset_h
99+
pretokenized C.kiwi_pretokenized_h
100+
cText = C.CString(text)
101+
)
102+
103+
defer C.free(unsafe.Pointer(cText))
95104

105+
kiwiResH := C.kiwi_analyze(k.handler, cText, C.int(topN), C.int(options), blocklist, pretokenized)
106+
if kiwiResH == nil {
107+
return nil, fmt.Errorf("failed to analyze text")
108+
}
96109
defer C.kiwi_res_close(kiwiResH)
97110

98111
resSize := int(C.kiwi_res_size(kiwiResH))
112+
if resSize < 0 {
113+
return nil, fmt.Errorf("invalid result size: %d", resSize)
114+
}
115+
99116
res := make([]TokenResult, resSize)
100117

101118
for i := 0; i < resSize; i++ {
102-
tokens := make([]TokenInfo, int(C.kiwi_res_word_num(kiwiResH, C.int(i))))
119+
wordNum := int(C.kiwi_res_word_num(kiwiResH, C.int(i)))
120+
if wordNum < 0 {
121+
return nil, fmt.Errorf("invalid word number: %d", wordNum)
122+
}
103123

104-
for j := 0; j < len(tokens); j++ {
124+
tokens := make([]TokenInfo, wordNum)
125+
126+
for j := 0; j < wordNum; j++ {
105127
pos, err := ParsePOSType(C.GoString(C.kiwi_res_tag(kiwiResH, C.int(i), C.int(j))))
106128
if err != nil {
107129
return nil, err
@@ -131,15 +153,30 @@ type SplitResult struct {
131153

132154
// SplitSentence returns the line of sentences.
133155
func (k *Kiwi) SplitSentence(text string, options AnalyzeOption) ([]SplitResult, error) {
134-
kiwiSsH := C.kiwi_split_into_sents(k.handler, C.CString(text), C.int(options), nil)
156+
var cText = C.CString(text)
157+
defer C.free(unsafe.Pointer(cText))
158+
159+
kiwiSsH := C.kiwi_split_into_sents(k.handler, cText, C.int(options), nil)
160+
if kiwiSsH == nil {
161+
return nil, fmt.Errorf("failed to split sentences")
162+
}
135163
defer C.kiwi_ss_close(kiwiSsH)
136164

137165
resSize := int(C.kiwi_ss_size(kiwiSsH))
166+
if resSize < 0 {
167+
return nil, fmt.Errorf("invalid result size: %d", resSize)
168+
}
169+
138170
res := make([]SplitResult, resSize)
139171

140172
for i := 0; i < resSize; i++ {
141173
begin := int(C.kiwi_ss_begin_position(kiwiSsH, C.int(i)))
142174
end := int(C.kiwi_ss_end_position(kiwiSsH, C.int(i)))
175+
176+
if begin < 0 || end < begin || end > len(text) {
177+
return nil, fmt.Errorf("invalid position range: begin=%d, end=%d", begin, end)
178+
}
179+
143180
res[i] = SplitResult{
144181
Text: text[begin:end],
145182
Begin: begin,
@@ -188,7 +225,12 @@ func (kb *KiwiBuilder) LoadDict(dictPath string) int {
188225

189226
// Build creates kiwi instance with user word etc.
190227
func (kb *KiwiBuilder) Build() *Kiwi {
191-
h := C.kiwi_builder_build(kb.handler)
228+
var (
229+
typos C.kiwi_typo_h
230+
typoCostThreshold = C.float(1.0)
231+
)
232+
233+
h := C.kiwi_builder_build(kb.handler, typos, typoCostThreshold)
192234
defer kb.Close()
193235
return &Kiwi{
194236
handler: h,

kiwi_example_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
)
88

99
func Example() {
10-
kb := kiwi.NewBuilder("./ModelGenerator", 1 /*=numThread*/, kiwi.KIWI_BUILD_INTEGRATE_ALLOMORPH /*=options*/)
10+
kb := kiwi.NewBuilder("./base", 1 /*=numThread*/, kiwi.KIWI_BUILD_INTEGRATE_ALLOMORPH /*=options*/)
1111
kb.AddWord("코딩냄비", "NNP", 0)
1212

1313
k := kb.Build()
@@ -16,5 +16,5 @@ func Example() {
1616
results, _ := k.Analyze("안녕하세요 코딩냄비입니다. 부글부글.", 1 /*=topN*/, kiwi.KIWI_MATCH_ALL)
1717
fmt.Println(results)
1818
// Output:
19-
// [{[{0 NNG 안녕} {2 XSA 하} {4 EP 시} {3 EC 어요} {6 NNP 코딩냄비} {10 VCP 이} {11 EF ᆸ니다} {13 SF .} {15 NNP 부글부} {18 NNG 글} {19 SF .}] -69.74997}]
19+
// [{[{0 NNG 안녕} {2 XSA 하} {3 EF 세요} {6 NNP 코딩냄비} {10 VCP 이} {10 EF ᆸ니다} {13 SF .} {15 MAG 부글부글} {19 SF .}] -55.869953}]
2020
}

kiwi_test.go

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ import (
99
)
1010

1111
func TestKiwiVersion(t *testing.T) {
12-
assert.Equal(t, KiwiVersion(), "0.10.3")
12+
assert.Equal(t, KiwiVersion(), "0.21.0")
1313
}
1414

1515
func TestAnalyze(t *testing.T) {
16-
kiwi := New("./ModelGenerator", 1, KIWI_BUILD_DEFAULT)
16+
kiwi := New("./base", 1, KIWI_BUILD_DEFAULT)
1717
res, _ := kiwi.Analyze("아버지가 방에 들어가신다", 1, KIWI_MATCH_ALL)
1818

1919
expected := []TokenResult{
@@ -50,12 +50,12 @@ func TestAnalyze(t *testing.T) {
5050
Form: "시",
5151
},
5252
{
53-
Position: 12,
53+
Position: 11,
5454
Tag: POS_EF,
5555
Form: "ᆫ다",
5656
},
5757
},
58-
Score: -38.967132568359375,
58+
Score: -34.55623,
5959
},
6060
}
6161

@@ -64,7 +64,7 @@ func TestAnalyze(t *testing.T) {
6464
}
6565

6666
func TestSplitSentence(t *testing.T) {
67-
kiwi := New("./ModelGenerator", 1, KIWI_BUILD_DEFAULT)
67+
kiwi := New("./base", 1, KIWI_BUILD_DEFAULT)
6868
res, _ := kiwi.SplitSentence("여러 문장으로 구성된 텍스트네 이걸 분리해줘", KIWI_MATCH_ALL)
6969

7070
expected := []SplitResult{
@@ -85,14 +85,16 @@ func TestSplitSentence(t *testing.T) {
8585
}
8686

8787
func TestAddWordFail(t *testing.T) {
88-
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
88+
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
8989
add := kb.AddWord("아버지가", "SKO", 0)
90-
assert.Equal(t, 0, add)
90+
assert.Equal(t, -1, add)
9191
assert.Equal(t, 0, kb.Close())
92+
93+
KiwiClearError()
9294
}
9395

9496
func TestAddWord(t *testing.T) {
95-
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
97+
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
9698
add := kb.AddWord("아버지가", "NNG", 0)
9799

98100
assert.Equal(t, 0, add)
@@ -132,12 +134,12 @@ func TestAddWord(t *testing.T) {
132134
Form: "시",
133135
},
134136
{
135-
Position: 12,
137+
Position: 11,
136138
Tag: "EF",
137139
Form: "ᆫ다",
138140
},
139141
},
140-
Score: -36.959194,
142+
Score: -32.80881,
141143
},
142144
}
143145

@@ -146,7 +148,7 @@ func TestAddWord(t *testing.T) {
146148
}
147149

148150
func TestLoadDict(t *testing.T) {
149-
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
151+
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
150152
add := kb.LoadDict("./example/user_dict.tsv")
151153

152154
assert.Equal(t, 1, add)
@@ -191,12 +193,12 @@ func TestLoadDict(t *testing.T) {
191193
Form: "시",
192194
},
193195
{
194-
Position: 12,
196+
Position: 11,
195197
Tag: "EF",
196198
Form: "ᆫ다",
197199
},
198200
},
199-
Score: -36.959194,
201+
Score: -32.80881,
200202
},
201203
}
202204

@@ -205,7 +207,7 @@ func TestLoadDict(t *testing.T) {
205207
}
206208

207209
func TestLoadDict2(t *testing.T) {
208-
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
210+
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
209211
add := kb.LoadDict("./example/user_dict2.tsv")
210212

211213
assert.Equal(t, 3, add)
@@ -236,7 +238,7 @@ func TestLoadDict2(t *testing.T) {
236238
Form: "들어가신다",
237239
},
238240
},
239-
Score: -13.669565,
241+
Score: -12.538677,
240242
},
241243
}
242244

@@ -245,7 +247,7 @@ func TestLoadDict2(t *testing.T) {
245247
}
246248

247249
func TestExtractWord(t *testing.T) {
248-
kb := NewBuilder("./ModelGenerator", 0, KIWI_BUILD_DEFAULT)
250+
kb := NewBuilder("./base", 1, KIWI_BUILD_DEFAULT)
249251
rs := strings.NewReader(`2008년에는 애국가의 작곡자 안익태가 1930년대에 독일 유학 기간 중 친일 활동을 했다는 사실이 밝혀졌다. 이후 안익태가 나치 독일 하의
250252
베를린에서 만주국 10주년 건국 기념음악회를 지휘하는 동영상까지 발굴되어 관련 학계나 사회에 큰 충격을 주었다. 안익태가 친일 행적을 한 바
251253
있다는 빼도박도 못할 증거가 나왔으니까. 영상물의 '만주환상곡'에는 우리가 현재 알고있는 '한국환상곡'의 두 선율("무궁화 삼천리 나의 사랑아,
@@ -263,12 +265,18 @@ func TestExtractWord(t *testing.T) {
263265
POSScore: -1.92593,
264266
Score: 0,
265267
},
268+
{
269+
Form: "익태",
270+
Freq: 4,
271+
POSScore: -0.23702252,
272+
Score: 0,
273+
},
266274
}, wordInfos)
267275
assert.Equal(t, 0, kb.Close())
268276
}
269277

270278
func TestExtractWordwithFile(t *testing.T) {
271-
kb := NewBuilder("./ModelGenerator", 0, KIWI_BUILD_DEFAULT)
279+
kb := NewBuilder("./base", 1, KIWI_BUILD_DEFAULT) // Use single thread for deterministic results
272280
file, _ := os.Open("./example/test.txt")
273281

274282
wordInfos, _ := kb.ExtractWords(file, 10 /*=minCnt*/, 5 /*=maxWordLen*/, 0.0 /*=minScore*/, -25.0 /*=posThreshold*/)

0 commit comments

Comments
 (0)