Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@

__debug_bin

ModelGenerator/
base/
.idea/
models/
include/
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
KIWI_VERSION := "v0.10.3"
KIWI_VERSION := v0.21.0

.PHONY: test
test: ModelGenerator/default.dict
test: base/default.dict
go test ./...

ModelGenerator/default.dict:
curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION).tgz --output model.tgz
tar -xzvf model.tgz
base/default.dict:
curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION)_base.tgz --output model.tgz
tar --no-same-owner -xzvf model.tgz
rm -f model.tgz


Expand All @@ -17,7 +17,7 @@ install-kiwi:
.PHONY: clean
clean:
rm -f model.tgz
rm -rf ./ModelGenerator
rm -rf ./base

.PHONY: format
format:
Expand Down
54 changes: 48 additions & 6 deletions kiwi.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
package kiwi

/*
#cgo LDFLAGS: -l kiwi
#cgo CFLAGS: -I/usr/local/include
#cgo LDFLAGS: -Wl,-rpath,/usr/local/lib

#include <stdlib.h>
#include <string.h>
#include <stdint.h> // for uintptr_t
Expand All @@ -14,6 +16,7 @@ extern int KiwiReaderBridge(int lineNumber, char *buffer, void *userData);
import "C"

import (
"fmt"
"io"
"runtime/cgo"
"unsafe"
Expand Down Expand Up @@ -91,17 +94,36 @@ type TokenResult struct {

// Analyze returns the result of the analysis.
func (k *Kiwi) Analyze(text string, topN int, options AnalyzeOption) ([]TokenResult, error) {
kiwiResH := C.kiwi_analyze(k.handler, C.CString(text), C.int(topN), C.int(options))
var (
blocklist C.kiwi_morphset_h
pretokenized C.kiwi_pretokenized_h
cText = C.CString(text)
)

defer C.free(unsafe.Pointer(cText))

kiwiResH := C.kiwi_analyze(k.handler, cText, C.int(topN), C.int(options), blocklist, pretokenized)
if kiwiResH == nil {
return nil, fmt.Errorf("failed to analyze text")
}
defer C.kiwi_res_close(kiwiResH)

resSize := int(C.kiwi_res_size(kiwiResH))
if resSize < 0 {
return nil, fmt.Errorf("invalid result size: %d", resSize)
}

res := make([]TokenResult, resSize)

for i := 0; i < resSize; i++ {
tokens := make([]TokenInfo, int(C.kiwi_res_word_num(kiwiResH, C.int(i))))
wordNum := int(C.kiwi_res_word_num(kiwiResH, C.int(i)))
if wordNum < 0 {
return nil, fmt.Errorf("invalid word number: %d", wordNum)
}

for j := 0; j < len(tokens); j++ {
tokens := make([]TokenInfo, wordNum)

for j := 0; j < wordNum; j++ {
pos, err := ParsePOSType(C.GoString(C.kiwi_res_tag(kiwiResH, C.int(i), C.int(j))))
if err != nil {
return nil, err
Expand Down Expand Up @@ -131,15 +153,30 @@ type SplitResult struct {

// SplitSentence returns the line of sentences.
func (k *Kiwi) SplitSentence(text string, options AnalyzeOption) ([]SplitResult, error) {
kiwiSsH := C.kiwi_split_into_sents(k.handler, C.CString(text), C.int(options), nil)
var cText = C.CString(text)
defer C.free(unsafe.Pointer(cText))

kiwiSsH := C.kiwi_split_into_sents(k.handler, cText, C.int(options), nil)
if kiwiSsH == nil {
return nil, fmt.Errorf("failed to split sentences")
}
defer C.kiwi_ss_close(kiwiSsH)

resSize := int(C.kiwi_ss_size(kiwiSsH))
if resSize < 0 {
return nil, fmt.Errorf("invalid result size: %d", resSize)
}

res := make([]SplitResult, resSize)

for i := 0; i < resSize; i++ {
begin := int(C.kiwi_ss_begin_position(kiwiSsH, C.int(i)))
end := int(C.kiwi_ss_end_position(kiwiSsH, C.int(i)))

if begin < 0 || end < begin || end > len(text) {
return nil, fmt.Errorf("invalid position range: begin=%d, end=%d", begin, end)
}

res[i] = SplitResult{
Text: text[begin:end],
Begin: begin,
Expand Down Expand Up @@ -188,7 +225,12 @@ func (kb *KiwiBuilder) LoadDict(dictPath string) int {

// Build creates kiwi instance with user word etc.
func (kb *KiwiBuilder) Build() *Kiwi {
h := C.kiwi_builder_build(kb.handler)
var (
typos C.kiwi_typo_h
typoCostThreshold = C.float(1.0)
)

h := C.kiwi_builder_build(kb.handler, typos, typoCostThreshold)
defer kb.Close()
return &Kiwi{
handler: h,
Expand Down
4 changes: 2 additions & 2 deletions kiwi_example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
)

func Example() {
kb := kiwi.NewBuilder("./ModelGenerator", 1 /*=numThread*/, kiwi.KIWI_BUILD_INTEGRATE_ALLOMORPH /*=options*/)
kb := kiwi.NewBuilder("./base", 1 /*=numThread*/, kiwi.KIWI_BUILD_INTEGRATE_ALLOMORPH /*=options*/)
kb.AddWord("코딩냄비", "NNP", 0)

k := kb.Build()
Expand All @@ -16,5 +16,5 @@ func Example() {
results, _ := k.Analyze("안녕하세요 코딩냄비입니다. 부글부글.", 1 /*=topN*/, kiwi.KIWI_MATCH_ALL)
fmt.Println(results)
// Output:
// [{[{0 NNG 안녕} {2 XSA 하} {4 EP 시} {3 EC 어요} {6 NNP 코딩냄비} {10 VCP 이} {11 EF ᆸ니다} {13 SF .} {15 NNP 부글부} {18 NNG 글} {19 SF .}] -69.74997}]
// [{[{0 NNG 안녕} {2 XSA 하} {3 EF 세요} {6 NNP 코딩냄비} {10 VCP 이} {10 EF ᆸ니다} {13 SF .} {15 MAG 부글부글} {19 SF .}] -55.869953}]
}
42 changes: 25 additions & 17 deletions kiwi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ import (
)

func TestKiwiVersion(t *testing.T) {
assert.Equal(t, KiwiVersion(), "0.10.3")
assert.Equal(t, KiwiVersion(), "0.21.0")
}

func TestAnalyze(t *testing.T) {
kiwi := New("./ModelGenerator", 1, KIWI_BUILD_DEFAULT)
kiwi := New("./base", 1, KIWI_BUILD_DEFAULT)
res, _ := kiwi.Analyze("아버지가 방에 들어가신다", 1, KIWI_MATCH_ALL)

expected := []TokenResult{
Expand Down Expand Up @@ -50,12 +50,12 @@ func TestAnalyze(t *testing.T) {
Form: "시",
},
{
Position: 12,
Position: 11,
Tag: POS_EF,
Form: "ᆫ다",
},
},
Score: -38.967132568359375,
Score: -34.55623,
},
}

Expand All @@ -64,7 +64,7 @@ func TestAnalyze(t *testing.T) {
}

func TestSplitSentence(t *testing.T) {
kiwi := New("./ModelGenerator", 1, KIWI_BUILD_DEFAULT)
kiwi := New("./base", 1, KIWI_BUILD_DEFAULT)
res, _ := kiwi.SplitSentence("여러 문장으로 구성된 텍스트네 이걸 분리해줘", KIWI_MATCH_ALL)

expected := []SplitResult{
Expand All @@ -85,14 +85,16 @@ func TestSplitSentence(t *testing.T) {
}

func TestAddWordFail(t *testing.T) {
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
add := kb.AddWord("아버지가", "SKO", 0)
assert.Equal(t, 0, add)
assert.Equal(t, -1, add)
assert.Equal(t, 0, kb.Close())

KiwiClearError()
}

func TestAddWord(t *testing.T) {
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
add := kb.AddWord("아버지가", "NNG", 0)

assert.Equal(t, 0, add)
Expand Down Expand Up @@ -132,12 +134,12 @@ func TestAddWord(t *testing.T) {
Form: "시",
},
{
Position: 12,
Position: 11,
Tag: "EF",
Form: "ᆫ다",
},
},
Score: -36.959194,
Score: -32.80881,
},
}

Expand All @@ -146,7 +148,7 @@ func TestAddWord(t *testing.T) {
}

func TestLoadDict(t *testing.T) {
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
add := kb.LoadDict("./example/user_dict.tsv")

assert.Equal(t, 1, add)
Expand Down Expand Up @@ -191,12 +193,12 @@ func TestLoadDict(t *testing.T) {
Form: "시",
},
{
Position: 12,
Position: 11,
Tag: "EF",
Form: "ᆫ다",
},
},
Score: -36.959194,
Score: -32.80881,
},
}

Expand All @@ -205,7 +207,7 @@ func TestLoadDict(t *testing.T) {
}

func TestLoadDict2(t *testing.T) {
kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH)
add := kb.LoadDict("./example/user_dict2.tsv")

assert.Equal(t, 3, add)
Expand Down Expand Up @@ -236,7 +238,7 @@ func TestLoadDict2(t *testing.T) {
Form: "들어가신다",
},
},
Score: -13.669565,
Score: -12.538677,
},
}

Expand All @@ -245,7 +247,7 @@ func TestLoadDict2(t *testing.T) {
}

func TestExtractWord(t *testing.T) {
kb := NewBuilder("./ModelGenerator", 0, KIWI_BUILD_DEFAULT)
kb := NewBuilder("./base", 1, KIWI_BUILD_DEFAULT)
rs := strings.NewReader(`2008년에는 애국가의 작곡자 안익태가 1930년대에 독일 유학 기간 중 친일 활동을 했다는 사실이 밝혀졌다. 이후 안익태가 나치 독일 하의
베를린에서 만주국 10주년 건국 기념음악회를 지휘하는 동영상까지 발굴되어 관련 학계나 사회에 큰 충격을 주었다. 안익태가 친일 행적을 한 바
있다는 빼도박도 못할 증거가 나왔으니까. 영상물의 '만주환상곡'에는 우리가 현재 알고있는 '한국환상곡'의 두 선율("무궁화 삼천리 나의 사랑아,
Expand All @@ -263,12 +265,18 @@ func TestExtractWord(t *testing.T) {
POSScore: -1.92593,
Score: 0,
},
{
Form: "익태",
Freq: 4,
POSScore: -0.23702252,
Score: 0,
},
}, wordInfos)
assert.Equal(t, 0, kb.Close())
}

func TestExtractWordwithFile(t *testing.T) {
kb := NewBuilder("./ModelGenerator", 0, KIWI_BUILD_DEFAULT)
kb := NewBuilder("./base", 1, KIWI_BUILD_DEFAULT) // Use single thread for deterministic results
file, _ := os.Open("./example/test.txt")

wordInfos, _ := kb.ExtractWords(file, 10 /*=minCnt*/, 5 /*=maxWordLen*/, 0.0 /*=minScore*/, -25.0 /*=posThreshold*/)
Expand Down
Loading
Loading