SphinxBase 5prealpha
ngrams_raw.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2015 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38#include <string.h>
39
40#include <sphinxbase/err.h>
41#include <sphinxbase/pio.h>
42#include <sphinxbase/strfuncs.h>
44#include <sphinxbase/byteorder.h>
45
46#include "ngram_model_internal.h"
47#include "ngrams_raw.h"
48
49int
50ngram_ord_comparator(const void *a_raw, const void *b_raw)
51{
52 ngram_raw_t *a = (ngram_raw_t *) a_raw;
53 ngram_raw_t *b = (ngram_raw_t *) b_raw;
54 int a_w_ptr = 0;
55 int b_w_ptr = 0;
56 while (a_w_ptr < a->order && b_w_ptr < b->order) {
57 if (a->words[a_w_ptr] == b->words[b_w_ptr]) {
58 a_w_ptr++;
59 b_w_ptr++;
60 continue;
61 }
62 if (a->words[a_w_ptr] < b->words[b_w_ptr])
63 return -1;
64 else
65 return 1;
66 }
67 return a->order - b->order;
68}
69
70static int
71read_ngram_instance(lineiter_t ** li, hash_table_t * wid,
72 logmath_t * lmath, int order, int order_max,
73 ngram_raw_t * raw_ngram)
74{
75 int n;
76 int words_expected;
77 int i;
78 char *wptr[NGRAM_MAX_ORDER + 1];
79 uint32 *word_out;
80
81 if (*li)
82 *li = lineiter_next(*li);
83 if (*li == NULL) {
84 E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
85 order);
86 return -1;
87 }
88 words_expected = order + 1;
89 if ((n =
90 str2words((*li)->buf, wptr,
91 NGRAM_MAX_ORDER + 1)) < words_expected) {
92 E_ERROR("Format error; %d-gram ignored: %s\n", order, (*li)->buf);
93 return -1;
94 }
95
96 raw_ngram->order = order;
97
98 if (order == order_max) {
99 raw_ngram->prob = atof_c(wptr[0]);
100 if (raw_ngram->prob > 0) {
101 E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
102 raw_ngram->prob = 0.0f;
103 }
104 raw_ngram->prob =
105 logmath_log10_to_log_float(lmath, raw_ngram->prob);
106 }
107 else {
108 float weight, backoff;
109
110 weight = atof_c(wptr[0]);
111 if (weight > 0) {
112 E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
113 raw_ngram->prob = 0.0f;
114 }
115 else {
116 raw_ngram->prob =
117 logmath_log10_to_log_float(lmath, weight);
118 }
119
120 if (n == order + 1) {
121 raw_ngram->backoff = 0.0f;
122 }
123 else {
124 backoff = atof_c(wptr[order + 1]);
125 raw_ngram->backoff =
126 logmath_log10_to_log_float(lmath, backoff);
127 }
128 }
129 raw_ngram->words =
130 (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
131 for (word_out = raw_ngram->words + order - 1, i = 1;
132 word_out >= raw_ngram->words; --word_out, i++) {
133 hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
134 }
135 return 0;
136}
137
138static int
139ngrams_raw_read_order(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
140 hash_table_t * wid, logmath_t * lmath, uint32 count,
141 int order, int order_max)
142{
143 char expected_header[20];
144 uint32 i;
145
146 sprintf(expected_header, "\\%d-grams:", order);
147 while (*li && strcmp((*li)->buf, expected_header) != 0) {
148 *li = lineiter_next(*li);
149 }
150
151 if (*li == NULL) {
152 E_ERROR("Failed to find '%s', language model file truncated\n", expected_header);
153 return -1;
154 }
155
156 *raw_ngrams = (ngram_raw_t *) ckd_calloc(count, sizeof(ngram_raw_t));
157 for (i = 0; i < count; i++) {
158 if (read_ngram_instance(li, wid, lmath, order, order_max,
159 &((*raw_ngrams)[i])) < 0)
160 break;
161 }
162
163 qsort(*raw_ngrams, count, sizeof(ngram_raw_t), &ngram_ord_comparator);
164 return 0;
165}
166
168ngrams_raw_read_arpa(lineiter_t ** li, logmath_t * lmath, uint32 * counts,
169 int order, hash_table_t * wid)
170{
171 ngram_raw_t **raw_ngrams;
172 int order_it;
173
174 raw_ngrams =
175 (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
176
177 for (order_it = 2; order_it <= order; order_it++) {
178 if (ngrams_raw_read_order(&raw_ngrams[order_it - 2], li, wid, lmath,
179 counts[order_it - 1], order_it, order) < 0)
180 break;
181 }
182
183 /* Check if we found ARPA end-mark */
184 if (*li == NULL) {
185 E_ERROR("ARPA file ends without end-mark\n");
186 ngrams_raw_free(raw_ngrams, counts, order);
187 return NULL;
188 } else {
189 *li = lineiter_next(*li);
190 if (strcmp((*li)->buf, "\\end\\") != 0) {
191 E_WARN
192 ("Finished reading ARPA file. Expecting end mark but found '%s'\n",
193 (*li)->buf);
194 }
195 }
196
197 return raw_ngrams;
198}
199
200static void
201read_dmp_weight_array(FILE * fp, logmath_t * lmath, uint8 do_swap,
202 int32 counts, ngram_raw_t * raw_ngrams,
203 int weight_idx)
204{
205 int32 i, k;
206 dmp_weight_t *tmp_weight_arr;
207
208 fread(&k, sizeof(k), 1, fp);
209 if (do_swap)
210 SWAP_INT32(&k);
211 tmp_weight_arr =
212 (dmp_weight_t *) ckd_calloc(k, sizeof(*tmp_weight_arr));
213 fread(tmp_weight_arr, sizeof(*tmp_weight_arr), k, fp);
214 for (i = 0; i < k; i++) {
215 if (do_swap)
216 SWAP_INT32(&tmp_weight_arr[i].l);
217 /* Convert values to log. */
218 tmp_weight_arr[i].f =
219 logmath_log10_to_log_float(lmath, tmp_weight_arr[i].f);
220 }
221 /* replace indexes with real probs in raw bigrams */
222 for (i = 0; i < counts; i++) {
223 if (weight_idx == 0) {
224 raw_ngrams[i].prob =
225 tmp_weight_arr[(int) raw_ngrams[i].prob].f;
226 } else {
227 raw_ngrams[i].backoff =
228 tmp_weight_arr[(int) raw_ngrams[i].backoff].f;
229 }
230 }
231 ckd_free(tmp_weight_arr);
232}
233
234#define BIGRAM_SEGMENT_SIZE 9
235
237ngrams_raw_read_dmp(FILE * fp, logmath_t * lmath, uint32 * counts,
238 int order, uint32 * unigram_next, uint8 do_swap)
239{
240 uint32 j, ngram_idx;
241 uint16 *bigrams_next;
242 ngram_raw_t **raw_ngrams =
243 (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
244
245 /* read bigrams */
246 raw_ngrams[0] =
247 (ngram_raw_t *) ckd_calloc((size_t) (counts[1] + 1),
248 sizeof(*raw_ngrams[0]));
249 bigrams_next =
250 (uint16 *) ckd_calloc((size_t) (counts[1] + 1),
251 sizeof(*bigrams_next));
252 ngram_idx = 1;
253 for (j = 0; j <= (int32) counts[1]; j++) {
254 uint16 wid, prob_idx, bo_idx;
255 ngram_raw_t *raw_ngram = &raw_ngrams[0][j];
256
257 fread(&wid, sizeof(wid), 1, fp);
258 if (do_swap)
259 SWAP_INT16(&wid);
260 raw_ngram->order = 2;
261 while (ngram_idx < counts[0] && j == unigram_next[ngram_idx]) {
262 ngram_idx++;
263 }
264
265 if (j != counts[1]) {
266 raw_ngram->words =
267 (uint32 *) ckd_calloc(2, sizeof(*raw_ngram->words));
268 raw_ngram->words[0] = (uint32) wid;
269 raw_ngram->words[1] = (uint32) ngram_idx - 1;
270 }
271
272 fread(&prob_idx, sizeof(prob_idx), 1, fp);
273 fread(&bo_idx, sizeof(bo_idx), 1, fp);
274 fread(&bigrams_next[j], sizeof(bigrams_next[j]), 1, fp);
275 if (do_swap) {
276 SWAP_INT16(&prob_idx);
277 SWAP_INT16(&bo_idx);
278 SWAP_INT16(&bigrams_next[j]);
279 }
280
281 if (j != counts[1]) {
282 raw_ngram->prob = prob_idx + 0.5f; /* keep index in float. ugly but avoiding using extra memory */
283 raw_ngram->backoff = bo_idx + 0.5f;
284 }
285 }
286
287 if (ngram_idx < counts[0]) {
288 E_ERROR("Corrupted model, not enough unigrams %d %d\n", ngram_idx, counts[0]);
289 ckd_free(bigrams_next);
290 ngrams_raw_free(raw_ngrams, counts, order);
291 return NULL;
292 }
293
294 /* read trigrams */
295 if (order > 2) {
296 raw_ngrams[1] =
297 (ngram_raw_t *) ckd_calloc((size_t) counts[2],
298 sizeof(*raw_ngrams[1]));
299 for (j = 0; j < (int32) counts[2]; j++) {
300 uint16 wid, prob_idx;
301 ngram_raw_t *raw_ngram = &raw_ngrams[1][j];
302
303 fread(&wid, sizeof(wid), 1, fp);
304 fread(&prob_idx, sizeof(prob_idx), 1, fp);
305 if (do_swap) {
306 SWAP_INT16(&wid);
307 SWAP_INT16(&prob_idx);
308 }
309
310 raw_ngram->order = 3;
311 raw_ngram->words =
312 (uint32 *) ckd_calloc(3, sizeof(*raw_ngram->words));
313 raw_ngram->words[0] = (uint32) wid;
314 raw_ngram->prob = prob_idx + 0.5f; /* keep index in float. ugly but avoiding using extra memory */
315 }
316 }
317
318 /* read prob2 */
319 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
320 raw_ngrams[0], 0);
321 /* read bo2 */
322 if (order > 2) {
323 int32 k;
324 int32 *tseg_base;
325 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
326 raw_ngrams[0], 1);
327 /* read prob3 */
328 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[2],
329 raw_ngrams[1], 0);
330 /* Read tseg_base size and tseg_base to fill trigram's first words */
331 fread(&k, sizeof(k), 1, fp);
332 if (do_swap)
333 SWAP_INT32(&k);
334 tseg_base = (int32 *) ckd_calloc(k, sizeof(int32));
335 fread(tseg_base, sizeof(int32), k, fp);
336 if (do_swap) {
337 for (j = 0; j < (uint32) k; j++) {
338 SWAP_INT32(&tseg_base[j]);
339 }
340 }
341 ngram_idx = 0;
342 for (j = 1; j <= counts[1]; j++) {
343 uint32 next_ngram_idx =
344 (uint32) (tseg_base[j >> BIGRAM_SEGMENT_SIZE] +
345 bigrams_next[j]);
346 while (ngram_idx < next_ngram_idx) {
347 raw_ngrams[1][ngram_idx].words[1] =
348 raw_ngrams[0][j - 1].words[0];
349 raw_ngrams[1][ngram_idx].words[2] =
350 raw_ngrams[0][j - 1].words[1];
351 ngram_idx++;
352 }
353 }
354 ckd_free(tseg_base);
355
356 if (ngram_idx < counts[2]) {
357 E_ERROR("Corrupted model, some trigrams have no corresponding bigram\n");
358 ckd_free(bigrams_next);
359 ngrams_raw_free(raw_ngrams, counts, order);
360 return NULL;
361 }
362 }
363 ckd_free(bigrams_next);
364
365 /* sort raw ngrams for reverse trie */
366 qsort(raw_ngrams[0], (size_t) counts[1], sizeof(*raw_ngrams[0]),
367 &ngram_ord_comparator);
368 if (order > 2) {
369 qsort(raw_ngrams[1], (size_t) counts[2], sizeof(*raw_ngrams[1]),
370 &ngram_ord_comparator);
371 }
372 return raw_ngrams;
373}
374
375void
376ngrams_raw_free(ngram_raw_t ** raw_ngrams, uint32 * counts, int order)
377{
378 uint32 num;
379 int order_it;
380
381 for (order_it = 0; order_it < order - 1; order_it++) {
382 for (num = 0; num < counts[order_it + 1]; num++) {
383 ckd_free(raw_ngrams[order_it][num].words);
384 }
385 ckd_free(raw_ngrams[order_it]);
386 }
387 ckd_free(raw_ngrams);
388}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
Definition: hash_table.c:322
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
Definition: logmath.c:480
file IO related operations.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
Line iterator for files.
Definition: pio.h:177