forked from yanyiwu/gojieba
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjieba.cpp
121 lines (106 loc) · 3.59 KB
/
jieba.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
extern "C" {
#include "jieba.h"
}
#include "cppjieba/Jieba.hpp"
// Copies a vector of std::string into a malloc'ed array of malloc'ed C
// strings. The array has words.size() + 1 slots; the final slot is NULL and
// marks the end of the list. Both the array and each string come from malloc,
// so they are released with free.
static char** ConvertWords(const std::vector<std::string>& words) {
  const size_t count = words.size();
  // One extra slot for the terminating NULL pointer.
  char** out = static_cast<char**>(malloc((count + 1) * sizeof(char*)));
  char** slot = out;
  for (std::vector<std::string>::const_iterator it = words.begin();
       it != words.end(); ++it, ++slot) {
    // length() does not count the trailing '\0', hence the +1.
    *slot = static_cast<char*>(malloc((it->length() + 1) * sizeof(char)));
    strcpy(*slot, it->c_str());
  }
  // slot now points at index `count`, the terminator position.
  *slot = NULL;
  return out;
}
// Converts cppjieba::Word entries into a malloc'ed array of C `Word` structs.
// For each input entry only `offset` and the size() of its `word` string are
// copied; the text itself is not. The array has words.size() + 1 elements and
// the last one has offset == 0 and len == 0. The array is allocated with
// malloc.
static Word* ConvertWords(const std::vector<cppjieba::Word>& words) {
  const size_t count = words.size();
  Word* out = static_cast<Word*>(malloc((count + 1) * sizeof(Word)));
  size_t idx = 0;
  while (idx < count) {
    out[idx].offset = words[idx].offset;
    out[idx].len = words[idx].word.size();
    ++idx;
  }
  // idx == count here: fill in the trailing all-zero element.
  out[idx].offset = 0;
  out[idx].len = 0;
  return out;
}
// Converts (word, weight) pairs into a malloc'ed array of CWordWeight.
// Each element gets a malloc'ed copy of the word plus its weight. The array
// has words.size() + 1 elements; the last one has word == NULL and marks the
// end of the list (FreeWordWeights stops at the first NULL word).
static struct CWordWeight* ConvertWords(const std::vector<std::pair<std::string, double> >& words) {
  const size_t count = words.size();
  struct CWordWeight* res = (struct CWordWeight*)malloc(sizeof(struct CWordWeight) * (count + 1));
  for (size_t i = 0; i < count; i++) {
    const std::string& text = words[i].first;
    // length() does not count the trailing '\0', hence the +1.
    res[i].word = (char*)malloc(sizeof(char) * (text.length() + 1));
    strcpy(res[i].word, text.c_str());
    res[i].weight = words[i].second;
  }
  // Terminator element. malloc does not initialise memory, so give `weight`
  // a defined value too; otherwise code that copies or inspects the
  // terminator struct would read an indeterminate value.
  res[count].word = NULL;
  res[count].weight = 0;
  return res;
}
// Creates a cppjieba::Jieba on the heap, forwarding the five path arguments
// unchanged to its constructor, and returns the pointer cast to the opaque
// Jieba handle type. FreeJieba deletes the object.
Jieba NewJieba(const char* dict_path,
               const char* hmm_path,
               const char* user_dict,
               const char* idf_path,
               const char* stop_words_path) {
  cppjieba::Jieba* instance =
      new cppjieba::Jieba(dict_path, hmm_path, user_dict, idf_path, stop_words_path);
  return (Jieba)instance;
}
// Destroys the cppjieba::Jieba object behind the handle `x` using delete.
void FreeJieba(Jieba x) {
  cppjieba::Jieba* instance = (cppjieba::Jieba*)x;
  delete instance;
}
// Runs cppjieba::Jieba::Cut on `sentence` and returns the resulting words as
// a NULL-terminated, malloc'ed array of malloc'ed C strings (built by
// ConvertWords). `is_hmm_used` is forwarded to cppjieba as given.
char** Cut(Jieba x, const char* sentence, int is_hmm_used) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)x;
  std::vector<std::string> tokens;
  segmenter->Cut(sentence, tokens, is_hmm_used);
  return ConvertWords(tokens);
}
// Runs cppjieba::Jieba::CutAll on `sentence` and returns the resulting words
// as a NULL-terminated, malloc'ed array of malloc'ed C strings (built by
// ConvertWords).
char** CutAll(Jieba x, const char* sentence) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)x;
  std::vector<std::string> tokens;
  segmenter->CutAll(sentence, tokens);
  return ConvertWords(tokens);
}
// Runs cppjieba::Jieba::CutForSearch on `sentence` and returns the resulting
// words as a NULL-terminated, malloc'ed array of malloc'ed C strings (built
// by ConvertWords). `is_hmm_used` is forwarded to cppjieba as given.
char** CutForSearch(Jieba x, const char* sentence, int is_hmm_used) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)x;
  std::vector<std::string> tokens;
  segmenter->CutForSearch(sentence, tokens, is_hmm_used);
  return ConvertWords(tokens);
}
// Runs cppjieba::Jieba::Tag on `sentence` and returns one C string per
// (first, second) result pair, formatted as "first/second". The result is a
// NULL-terminated, malloc'ed array of malloc'ed C strings (built by
// ConvertWords).
char** Tag(Jieba x, const char* sentence) {
  typedef std::vector<std::pair<std::string, std::string> > TaggedWords;
  TaggedWords tagged;
  ((cppjieba::Jieba*)x)->Tag(sentence, tagged);

  std::vector<std::string> joined;
  joined.reserve(tagged.size());
  for (TaggedWords::const_iterator it = tagged.begin(); it != tagged.end(); ++it) {
    // Join the two halves of the pair with a single '/' separator.
    std::string entry(it->first);
    entry += "/";
    entry += it->second;
    joined.push_back(entry);
  }
  return ConvertWords(joined);
}
// Passes `word` to cppjieba::Jieba::InsertUserWord on the object behind `x`.
// Any value InsertUserWord returns is ignored.
void AddWord(Jieba x, const char* word) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)x;
  segmenter->InsertUserWord(word);
}
// Segments `sentence` into cppjieba::Word entries and returns them as a
// malloc'ed array of C `Word` structs (built by ConvertWords).
// When `mode` is SearchMode the segmentation uses CutForSearch; every other
// mode value uses Cut. `is_hmm_used` is forwarded to cppjieba as given.
Word* Tokenize(Jieba x, const char* sentence, TokenizeMode mode, int is_hmm_used) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)x;
  std::vector<cppjieba::Word> tokens;
  if (mode == SearchMode) {
    segmenter->CutForSearch(sentence, tokens, is_hmm_used);
  } else {
    segmenter->Cut(sentence, tokens, is_hmm_used);
  }
  return ConvertWords(tokens);
}
// Calls the Jieba object's `extractor.Extract` on `sentence` with `top_k`,
// collecting (word, weight) pairs, and returns them as a malloc'ed
// CWordWeight array terminated by an element whose word is NULL (built by
// ConvertWords). FreeWordWeights releases such an array.
struct CWordWeight* ExtractWithWeight(Jieba handle, const char* sentence, int top_k) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)handle;
  std::vector<std::pair<std::string, double> > weighted;
  segmenter->extractor.Extract(sentence, weighted, top_k);
  return ConvertWords(weighted);
}
// Releases a CWordWeight array: frees each element's `word` string up to
// (not including) the first element whose word is NULL, then frees the array
// itself. A NULL `wws` is accepted; in that case only free(NULL) is called.
void FreeWordWeights(struct CWordWeight* wws) {
  if (wws != NULL) {
    for (struct CWordWeight* cur = wws; cur->word != NULL; ++cur) {
      free(cur->word);
      // Clear the pointer after freeing so the element no longer refers to
      // released memory.
      cur->word = NULL;
    }
  }
  free(wws);
}
// Calls the Jieba object's `extractor.Extract` on `sentence` with `top_k`,
// collecting plain words, and returns them as a NULL-terminated, malloc'ed
// array of malloc'ed C strings (built by ConvertWords).
char** Extract(Jieba handle, const char* sentence, int top_k) {
  cppjieba::Jieba* segmenter = (cppjieba::Jieba*)handle;
  std::vector<std::string> keywords;
  segmenter->extractor.Extract(sentence, keywords, top_k);
  return ConvertWords(keywords);
}