Open Chinese Convert 1.1.2
A project for conversion between Traditional and Simplified Chinese
PhraseExtract.hpp
1/*
2 * Open Chinese Convert
3 *
4 * Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#pragma once
20
21#include <functional>
22#include <unordered_map>
23
24#include "Common.hpp"
25#include "UTF8StringSlice.hpp"
26
27namespace opencc {
28
// PhraseExtract runs a multi-stage pipeline over a UTF-8 text and produces
// word candidates and selected words. For each candidate it exposes the
// statistics grouped in `Signals`: frequency, cohesion, and suffix/prefix
// entropy.
//
// NOTE(review): the embedded line numbers in this listing skip 33, 35, 145
// and 149, so some declarations appear to be missing or truncated here (for
// example both Default*CalculationFilter declarations stop after their first
// parameter). Confirm against the original header before editing code.
29class OPENCC_EXPORT PhraseExtract {
30public:
 // Length type reused from UTF8StringSlice; used by the length setters below.
31 typedef UTF8StringSlice::LengthType LengthType;
32
34
36
37 virtual ~PhraseExtract();
38
 // Runs the whole pipeline on `text`, in this order: set the text, extract
 // suffixes, calculate frequencies and suffix entropy, release suffixes,
 // extract prefixes, calculate prefix entropy, release prefixes, extract
 // word candidates, calculate cohesions, and finally select words.
 // NOTE(review): suffixes are released before prefixes are extracted,
 // presumably to limit peak memory -- confirm.
39 void Extract(const std::string& text) {
40 SetFullText(text);
41 ExtractSuffixes();
42 CalculateFrequency();
43 CalculateSuffixEntropy();
44 ReleaseSuffixes();
45 ExtractPrefixes();
46 CalculatePrefixEntropy();
47 ReleasePrefixes();
48 ExtractWordCandidates();
49 CalculateCohesions();
50 SelectWords();
51 }
52
 // Sets the text to analyse by wrapping fullText.c_str() in a
 // UTF8StringSlice.
 // NOTE(review): looks like the slice refers to the caller's buffer rather
 // than copying it, in which case `fullText` must outlive its use by this
 // object -- confirm against UTF8StringSlice.
53 void SetFullText(const std::string& fullText) {
54 utf8FullText = UTF8StringSlice(fullText.c_str());
55 }
56
 // Same as above, starting from a C string pointer.
57 void SetFullText(const char* fullText) {
58 utf8FullText = UTF8StringSlice(fullText);
59 }
60
 // Stores the given slice as the text to analyse.
61 void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
62
 // The four setters below store their argument in the member of the same
 // name. How each length is applied is decided in the out-of-line stage
 // implementations, which are not visible in this header.
63 void SetWordMinLength(const LengthType _wordMinLength) {
64 wordMinLength = _wordMinLength;
65 }
66
67 void SetWordMaxLength(const LengthType _wordMaxLength) {
68 wordMaxLength = _wordMaxLength;
69 }
70
71 void SetPrefixSetLength(const LengthType _prefixSetLength) {
72 prefixSetLength = _prefixSetLength;
73 }
74
75 void SetSuffixSetLength(const LengthType _suffixSetLength) {
76 suffixSetLength = _suffixSetLength;
77 }
78
 // Replaces the stored pre-calculation filter callback. The callback takes
 // this extractor and a candidate slice and returns bool; what a true or
 // false result means is not visible in this header.
79 // PreCalculationFilter is called after frequencies statistics.
80 void SetPreCalculationFilter(
81 const std::function<bool(const PhraseExtract&,
82 const UTF8StringSlice8Bit&)>& filter) {
83 preCalculationFilter = filter;
84 }
85
 // Replaces the stored post-calculation filter callback (same signature as
 // the pre-calculation filter).
 // NOTE(review): presumably invoked after all signals are calculated --
 // confirm in the implementation.
86 void SetPostCalculationFilter(
87 const std::function<bool(const PhraseExtract&,
88 const UTF8StringSlice8Bit&)>& filter) {
89 postCalculationFilter = filter;
90 }
91
 // Swaps `suffixes` with an empty temporary vector; the previous contents
 // are destroyed together with that temporary.
92 void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
93
 // Same as ReleaseSuffixes(), for `prefixes`.
94 void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
95
 // Read-only access to the `words` member.
96 const std::vector<UTF8StringSlice8Bit>& Words() const { return words; }
97
 // Read-only access to the `wordCandidates` member.
98 const std::vector<UTF8StringSlice8Bit>& WordCandidates() const {
99 return wordCandidates;
100 }
101
 // Statistics kept for a word candidate.
102 struct Signals {
103 size_t frequency;
104 double cohesion;
105 double suffixEntropy;
106 double prefixEntropy;
107 };
108
 // Per-candidate accessors. They are only declared here; their definitions
 // (and their behaviour for unknown candidates) are not visible in this
 // header.
109 const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
110
111 double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
112
113 double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
114
115 double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
116
117 double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
118
119 size_t Frequency(const UTF8StringSlice8Bit& word) const;
120
121 double Probability(const UTF8StringSlice8Bit& word) const;
122
123 double LogProbability(const UTF8StringSlice8Bit& word) const;
124
125 void Reset();
126
 // Individual pipeline stages, declared public so they can be called
 // separately. Extract() above shows one order in which they are invoked.
127 void ExtractSuffixes();
128
129 void ExtractPrefixes();
130
131 void ExtractWordCandidates();
132
133 void CalculateFrequency();
134
135 void CalculateCohesions();
136
137 void CalculateSuffixEntropy();
138
139 void CalculatePrefixEntropy();
140
141 void SelectWords();
142
 // Static filter functions. Both declarations are truncated in this listing
 // (the line after each first parameter is missing).
 // NOTE(review): presumably they match the std::function signature used by
 // the filter setters -- confirm against the original header.
143 static bool
144 DefaultPreCalculationFilter(const PhraseExtract&,
146
147 static bool
148 DefaultPostCalculationFilter(const PhraseExtract&,
150
151private:
 // Forward declaration only; the type is defined elsewhere.
152 class DictType;
153
154 // Pointwise Mutual Information
155 double PMI(const UTF8StringSlice8Bit& wordCandidate,
156 const UTF8StringSlice8Bit& part1,
157 const UTF8StringSlice8Bit& part2) const;
158
159 double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
160
 // Computes an entropy value from a map of slice -> count. The formula is
 // in the out-of-line definition, not visible here.
161 double CalculateEntropy(
162 const std::unordered_map<UTF8StringSlice8Bit, size_t,
163 UTF8StringSlice8Bit::Hasher>& choices) const;
164
 // Configuration written by the Set* methods above.
165 LengthType wordMinLength;
166 LengthType wordMaxLength;
167 LengthType prefixSetLength;
168 LengthType suffixSetLength;
169 std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
170 preCalculationFilter;
171 std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
172 postCalculationFilter;
173
 // NOTE(review): presumably these flags record which pipeline stages have
 // already run; they are not read or written anywhere in this header --
 // confirm in the implementation.
174 bool prefixesExtracted;
175 bool suffixesExtracted;
176 bool frequenciesCalculated;
177 bool wordCandidatesExtracted;
178 bool cohesionsCalculated;
179 bool prefixEntropiesCalculated;
180 bool suffixEntropiesCalculated;
181 bool wordsSelected;
182
 // Working data. `utf8FullText` is assigned by SetFullText(); `prefixes` and
 // `suffixes` are emptied by the Release* methods; `words` and
 // `wordCandidates` are exposed through the accessors above.
183 UTF8StringSlice utf8FullText;
184 size_t totalOccurrence;
185 double logTotalOccurrence;
186 std::vector<UTF8StringSlice8Bit> prefixes;
187 std::vector<UTF8StringSlice8Bit> suffixes;
188 std::vector<UTF8StringSlice8Bit> wordCandidates;
189 std::vector<UTF8StringSlice8Bit> words;
 // Raw pointer to the forward-declared DictType. Allocation and ownership
 // are not visible in this header.
190 DictType* signals;
191
 // Gives the test class access to private members.
192 friend class PhraseExtractTest;
193};
194
195} // namespace opencc
Definition: PhraseExtract.hpp:29
Definition: UTF8StringSlice.hpp:202
Definition: UTF8StringSlice.hpp:54
Definition: PhraseExtract.hpp:102