diff --git a/.gitignore b/.gitignore index aa5ae68..4e098b4 100644 --- a/.gitignore +++ b/.gitignore @@ -17,5 +17,7 @@ __debug_bin -ModelGenerator/ +base/ .idea/ +models/ +include/ \ No newline at end of file diff --git a/Makefile b/Makefile index 81f7f59..49b6795 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ -KIWI_VERSION := "v0.10.3" +KIWI_VERSION := v0.21.0 .PHONY: test -test: ModelGenerator/default.dict +test: base/default.dict go test ./... -ModelGenerator/default.dict: - curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION).tgz --output model.tgz - tar -xzvf model.tgz +base/default.dict: + curl -L https://github.com/bab2min/Kiwi/releases/download/$(KIWI_VERSION)/kiwi_model_$(KIWI_VERSION)_base.tgz --output model.tgz + tar --no-same-owner -xzvf model.tgz rm -f model.tgz @@ -17,7 +17,7 @@ install-kiwi: .PHONY: clean clean: rm -f model.tgz - rm -rf ./ModelGenerator + rm -rf ./base .PHONY: format format: diff --git a/kiwi.go b/kiwi.go index 94d327e..fba4a7d 100644 --- a/kiwi.go +++ b/kiwi.go @@ -2,7 +2,9 @@ package kiwi /* -#cgo LDFLAGS: -l kiwi +#cgo CFLAGS: -I/usr/local/include +#cgo LDFLAGS: -Wl,-rpath,/usr/local/lib + #include #include #include // for uintptr_t @@ -14,6 +16,7 @@ extern int KiwiReaderBridge(int lineNumber, char *buffer, void *userData); import "C" import ( + "fmt" "io" "runtime/cgo" "unsafe" @@ -91,17 +94,36 @@ type TokenResult struct { // Analyze returns the result of the analysis. func (k *Kiwi) Analyze(text string, topN int, options AnalyzeOption) ([]TokenResult, error) { - kiwiResH := C.kiwi_analyze(k.handler, C.CString(text), C.int(topN), C.int(options)) + var ( + blocklist C.kiwi_morphset_h + pretokenized C.kiwi_pretokenized_h + cText = C.CString(text) + ) + + defer C.free(unsafe.Pointer(cText)) + kiwiResH := C.kiwi_analyze(k.handler, cText, C.int(topN), C.int(options), blocklist, pretokenized) + if kiwiResH == nil { + return nil, fmt.Errorf("failed to analyze text") + } defer C.kiwi_res_close(kiwiResH) resSize := int(C.kiwi_res_size(kiwiResH)) + if resSize < 0 { + return nil, fmt.Errorf("invalid result size: %d", resSize) + } + res := make([]TokenResult, resSize) for i := 0; i < resSize; i++ { - tokens := make([]TokenInfo, int(C.kiwi_res_word_num(kiwiResH, C.int(i)))) + wordNum := int(C.kiwi_res_word_num(kiwiResH, C.int(i))) + if wordNum < 0 { + return nil, fmt.Errorf("invalid word number: %d", wordNum) + } - for j := 0; j < len(tokens); j++ { + tokens := make([]TokenInfo, wordNum) + + for j := 0; j < wordNum; j++ { pos, err := ParsePOSType(C.GoString(C.kiwi_res_tag(kiwiResH, C.int(i), C.int(j)))) if err != nil { return nil, err @@ -131,15 +153,30 @@ type SplitResult struct { // SplitSentence returns the line of sentences. func (k *Kiwi) SplitSentence(text string, options AnalyzeOption) ([]SplitResult, error) { - kiwiSsH := C.kiwi_split_into_sents(k.handler, C.CString(text), C.int(options), nil) + var cText = C.CString(text) + defer C.free(unsafe.Pointer(cText)) + + kiwiSsH := C.kiwi_split_into_sents(k.handler, cText, C.int(options), nil) + if kiwiSsH == nil { + return nil, fmt.Errorf("failed to split sentences") + } defer C.kiwi_ss_close(kiwiSsH) resSize := int(C.kiwi_ss_size(kiwiSsH)) + if resSize < 0 { + return nil, fmt.Errorf("invalid result size: %d", resSize) + } + res := make([]SplitResult, resSize) for i := 0; i < resSize; i++ { begin := int(C.kiwi_ss_begin_position(kiwiSsH, C.int(i))) end := int(C.kiwi_ss_end_position(kiwiSsH, C.int(i))) + + if begin < 0 || end < begin || end > len(text) { + return nil, fmt.Errorf("invalid position range: begin=%d, end=%d", begin, end) + } + res[i] = SplitResult{ Text: text[begin:end], Begin: begin, @@ -188,7 +225,12 @@ func (kb *KiwiBuilder) LoadDict(dictPath string) int { // Build creates kiwi instance with user word etc. func (kb *KiwiBuilder) Build() *Kiwi { - h := C.kiwi_builder_build(kb.handler) + var ( + typos C.kiwi_typo_h + typoCostThreshold = C.float(1.0) + ) + + h := C.kiwi_builder_build(kb.handler, typos, typoCostThreshold) defer kb.Close() return &Kiwi{ handler: h, diff --git a/kiwi_example_test.go b/kiwi_example_test.go index 8ae0b74..74a805e 100644 --- a/kiwi_example_test.go +++ b/kiwi_example_test.go @@ -7,7 +7,7 @@ import ( ) func Example() { - kb := kiwi.NewBuilder("./ModelGenerator", 1 /*=numThread*/, kiwi.KIWI_BUILD_INTEGRATE_ALLOMORPH /*=options*/) + kb := kiwi.NewBuilder("./base", 1 /*=numThread*/, kiwi.KIWI_BUILD_INTEGRATE_ALLOMORPH /*=options*/) kb.AddWord("코딩냄비", "NNP", 0) k := kb.Build() @@ -16,5 +16,5 @@ func Example() { results, _ := k.Analyze("안녕하세요 코딩냄비입니다. 부글부글.", 1 /*=topN*/, kiwi.KIWI_MATCH_ALL) fmt.Println(results) // Output: - // [{[{0 NNG 안녕} {2 XSA 하} {4 EP 시} {3 EC 어요} {6 NNP 코딩냄비} {10 VCP 이} {11 EF ᆸ니다} {13 SF .} {15 NNP 부글부} {18 NNG 글} {19 SF .}] -69.74997}] + // [{[{0 NNG 안녕} {2 XSA 하} {3 EF 세요} {6 NNP 코딩냄비} {10 VCP 이} {10 EF ᆸ니다} {13 SF .} {15 MAG 부글부글} {19 SF .}] -55.869953}] } diff --git a/kiwi_test.go b/kiwi_test.go index 970c121..06d52f9 100644 --- a/kiwi_test.go +++ b/kiwi_test.go @@ -9,11 +9,11 @@ import ( ) func TestKiwiVersion(t *testing.T) { - assert.Equal(t, KiwiVersion(), "0.10.3") + assert.Equal(t, KiwiVersion(), "0.21.0") } func TestAnalyze(t *testing.T) { - kiwi := New("./ModelGenerator", 1, KIWI_BUILD_DEFAULT) + kiwi := New("./base", 1, KIWI_BUILD_DEFAULT) res, _ := kiwi.Analyze("아버지가 방에 들어가신다", 1, KIWI_MATCH_ALL) expected := []TokenResult{ @@ -50,12 +50,12 @@ func TestAnalyze(t *testing.T) { Form: "시", }, { - Position: 12, + Position: 11, Tag: POS_EF, Form: "ᆫ다", }, }, - Score: -38.967132568359375, + Score: -34.55623, }, } @@ -64,7 +64,7 @@ func TestAnalyze(t *testing.T) { } func TestSplitSentence(t *testing.T) { - kiwi := New("./ModelGenerator", 1, KIWI_BUILD_DEFAULT) + kiwi := New("./base", 1, KIWI_BUILD_DEFAULT) res, _ := kiwi.SplitSentence("여러 문장으로 구성된 텍스트네 이걸 분리해줘", KIWI_MATCH_ALL) expected := []SplitResult{ @@ -85,14 +85,16 @@ func TestSplitSentence(t *testing.T) { } func TestAddWordFail(t *testing.T) { - kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) + kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) add := kb.AddWord("아버지가", "SKO", 0) - assert.Equal(t, 0, add) + assert.Equal(t, -1, add) assert.Equal(t, 0, kb.Close()) + + KiwiClearError() } func TestAddWord(t *testing.T) { - kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) + kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) add := kb.AddWord("아버지가", "NNG", 0) assert.Equal(t, 0, add) @@ -132,12 +134,12 @@ func TestAddWord(t *testing.T) { Form: "시", }, { - Position: 12, + Position: 11, Tag: "EF", Form: "ᆫ다", }, }, - Score: -36.959194, + Score: -32.80881, }, } @@ -146,7 +148,7 @@ func TestAddWord(t *testing.T) { } func TestLoadDict(t *testing.T) { - kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) + kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) add := kb.LoadDict("./example/user_dict.tsv") assert.Equal(t, 1, add) @@ -191,12 +193,12 @@ func TestLoadDict(t *testing.T) { Form: "시", }, { - Position: 12, + Position: 11, Tag: "EF", Form: "ᆫ다", }, }, - Score: -36.959194, + Score: -32.80881, }, } @@ -205,7 +207,7 @@ func TestLoadDict(t *testing.T) { } func TestLoadDict2(t *testing.T) { - kb := NewBuilder("./ModelGenerator", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) + kb := NewBuilder("./base", 1, KIWI_BUILD_INTEGRATE_ALLOMORPH) add := kb.LoadDict("./example/user_dict2.tsv") assert.Equal(t, 3, add) @@ -236,7 +238,7 @@ func TestLoadDict2(t *testing.T) { Form: "들어가신다", }, }, - Score: -13.669565, + Score: -12.538677, }, } @@ -245,7 +247,7 @@ func TestLoadDict2(t *testing.T) { } func TestExtractWord(t *testing.T) { - kb := NewBuilder("./ModelGenerator", 0, KIWI_BUILD_DEFAULT) + kb := NewBuilder("./base", 1, KIWI_BUILD_DEFAULT) rs := strings.NewReader(`2008년에는 애국가의 작곡자 안익태가 1930년대에 독일 유학 기간 중 친일 활동을 했다는 사실이 밝혀졌다. 이후 안익태가 나치 독일 하의 베를린에서 만주국 10주년 건국 기념음악회를 지휘하는 동영상까지 발굴되어 관련 학계나 사회에 큰 충격을 주었다. 안익태가 친일 행적을 한 바 있다는 빼도박도 못할 증거가 나왔으니까. 영상물의 '만주환상곡'에는 우리가 현재 알고있는 '한국환상곡'의 두 선율("무궁화 삼천리 나의 사랑아, @@ -263,12 +265,18 @@ func TestExtractWord(t *testing.T) { POSScore: -1.92593, Score: 0, }, + { + Form: "익태", + Freq: 4, + POSScore: -0.23702252, + Score: 0, + }, }, wordInfos) assert.Equal(t, 0, kb.Close()) } func TestExtractWordwithFile(t *testing.T) { - kb := NewBuilder("./ModelGenerator", 0, KIWI_BUILD_DEFAULT) + kb := NewBuilder("./base", 1, KIWI_BUILD_DEFAULT) // Use single thread for deterministic results file, _ := os.Open("./example/test.txt") wordInfos, _ := kb.ExtractWords(file, 10 /*=minCnt*/, 5 /*=maxWordLen*/, 0.0 /*=minScore*/, -25.0 /*=posThreshold*/) diff --git a/postype.go b/postype.go index 3b96ad1..83a25bb 100644 --- a/postype.go +++ b/postype.go @@ -11,46 +11,21 @@ const ( POS_NNG POSType = "NNG" POS_NNP POSType = "NNP" POS_NNB POSType = "NNB" + POS_NR POSType = "NR" + POS_NP POSType = "NP" - POS_VV POSType = "VV" - POS_VA POSType = "VA" - - POS_MAG POSType = "MAG" - - POS_NR POSType = "NR" - POS_NP POSType = "NP" - - POS_VX POSType = "VX" - - POS_MM POSType = "MM" - POS_MAJ POSType = "MAJ" - - POS_IC POSType = "IC" - - POS_XPN POSType = "XPN" - POS_XSN POSType = "XSN" - POS_XSV POSType = "XSV" - POS_XSA POSType = "XSA" - POS_XR POSType = "XR" - + POS_VV POSType = "VV" + POS_VA POSType = "VA" + POS_VX POSType = "VX" POS_VCP POSType = "VCP" POS_VCN POSType = "VCN" - POS_SF POSType = "SF" - POS_SP POSType = "SP" - POS_SS POSType = "SS" - POS_SE POSType = "SE" - POS_SO POSType = "SO" - POS_SW POSType = "SW" + POS_MM POSType = "MM" - POS_SL POSType = "SL" - POS_SH POSType = "SH" - POS_SN POSType = "SN" + POS_MAG POSType = "MAG" + POS_MAJ POSType = "MAJ" - POS_W_URL POSType = "W_URL" - POS_W_EMAIL POSType = "W_EMAIL" - POS_W_MENTION POSType = "W_MENTION" - POS_W_HASHTAG POSType = "W_HASHTAG" + POS_IC POSType = "IC" POS_JKS POSType = "JKS" POS_JKC POSType = "JKC" @@ -68,62 +43,111 @@ const ( POS_ETN POSType = "ETN" POS_ETM POSType = "ETM" - POS_V POSType = "V" + POS_XPN POSType = "XPN" + + POS_XSN POSType = "XSN" + POS_XSV POSType = "XSV" + POS_XSA POSType = "XSA" + POS_XSM POSType = "XSM" + + POS_XR POSType = "XR" + + POS_SF POSType = "SF" + POS_SP POSType = "SP" + POS_SS POSType = "SS" + POS_SSO POSType = "SSO" + POS_SSC POSType = "SSC" + POS_SE POSType = "SE" + POS_SO POSType = "SO" + POS_SW POSType = "SW" + POS_SL POSType = "SL" + POS_SH POSType = "SH" + POS_SN POSType = "SN" + POS_SB POSType = "SB" + + POS_UN POSType = "UN" - POS_MAX POSType = "MAX" + POS_W_URL POSType = "W_URL" + POS_W_EMAIL POSType = "W_EMAIL" + POS_W_HASHTAG POSType = "W_HASHTAG" + POS_W_MENTION POSType = "W_MENTION" + POS_W_SERIAL POSType = "W_SERIAL" + POS_W_EMOJI POSType = "W_EMOJI" + + POS_Z_CODA POSType = "Z_CODA" + POS_Z_SIOT POSType = "Z_SIOT" + + POS_USER_0 POSType = "USER0" + POS_USER_1 POSType = "USER1" + POS_USER_2 POSType = "USER2" + POS_USER_3 POSType = "USER3" + POS_USER_4 POSType = "USER4" ) func (p POSType) isValid() bool { switch p { - case POS_UNKNOWN, + case + POS_UNKNOWN, POS_NNG, POS_NNP, POS_NNB, - POS_VV, - POS_VA, - POS_MAG, POS_NR, POS_NP, + POS_VV, + POS_VA, POS_VX, + POS_VCP, + POS_VCN, POS_MM, + POS_MAG, POS_MAJ, POS_IC, + POS_JKS, + POS_JKC, + POS_JKG, + POS_JKO, + POS_JKB, + POS_JKV, + POS_JKQ, + POS_JX, + POS_JC, + POS_EP, + POS_EF, + POS_EC, + POS_ETN, + POS_ETM, POS_XPN, POS_XSN, POS_XSV, POS_XSA, + POS_XSM, POS_XR, - POS_VCP, - POS_VCN, POS_SF, POS_SP, POS_SS, + POS_SSO, + POS_SSC, POS_SE, POS_SO, POS_SW, POS_SL, POS_SH, POS_SN, + POS_SB, + POS_UN, POS_W_URL, POS_W_EMAIL, - POS_W_MENTION, POS_W_HASHTAG, - POS_JKS, - POS_JKC, - POS_JKG, - POS_JKO, - POS_JKB, - POS_JKV, - POS_JKQ, - POS_JX, - POS_JC, - POS_EP, - POS_EF, - POS_EC, - POS_ETN, - POS_ETM, - POS_V, - POS_MAX: + POS_W_MENTION, + POS_W_SERIAL, + POS_W_EMOJI, + POS_Z_CODA, + POS_Z_SIOT, + POS_USER_0, + POS_USER_1, + POS_USER_2, + POS_USER_3, + POS_USER_4: return true default: return false diff --git a/scripts/install_kiwi.bash b/scripts/install_kiwi.bash index dbd7e05..2381a13 100644 --- a/scripts/install_kiwi.bash +++ b/scripts/install_kiwi.bash @@ -11,15 +11,16 @@ elif [ "$(uname)" == "Windows" ]; then OS='win' fi +if [ "$(uname -m)" == "arm64" ]; then + ARCH="arm64" +else + ARCH="x86_64" +fi + echo "set OS env to ${OS:?}" echo "installing Kiwi version ${KIWI_VERSION:?}" -wget -O kiwi.tgz "https://github.com/bab2min/Kiwi/releases/download/${KIWI_VERSION}/kiwi_${OS}_x86_64_${KIWI_VERSION}.tgz" && - tar xzvf kiwi.tgz && - sudo mv build/libkiwi* /usr/local/lib/ && +wget -O kiwi.tgz "https://github.com/bab2min/Kiwi/releases/download/${KIWI_VERSION}/kiwi_${OS}_${ARCH}_${KIWI_VERSION}.tgz" && + sudo tar xzvf kiwi.tgz -C /usr/local && [[ "$(uname)" == "Linux" ]] && sudo ldconfig || echo 'skip' && - rm -rf kiwi.tgz build && - wget -O source.tgz https://github.com/bab2min/Kiwi/archive/refs/tags/${KIWI_VERSION}.tar.gz && - tar xzvf source.tgz && - sudo cp -r Kiwi-${KIWI_VERSION/v/}/include/kiwi /usr/local/include/ && - rm -rf source.tgz Kiwi-* + rm -f kiwi.tgz