SphinxBase 5prealpha
ngram_model.c
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <string.h>
#include <assert.h>

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"
#include "ngram_model_trie.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        while (--ext >= file_name) {
            if (*ext == '.')
                break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        while (--ext >= file_name) {
            if (*ext == '.')
                break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4)
        || 0 == strncmp_nocase(ext, ".BIN", 4))
        return NGRAM_BIN;
    return NGRAM_INVALID;
}

ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp")
        || 0 == strcmp_nocase(str_name, "bin"))
        return NGRAM_BIN;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_BIN:
        return "dmp/bin";
    default:
        return NULL;
    }
}


ngram_model_t *
ngram_model_read(cmd_ln_t * config,
                 const char *file_name,
                 ngram_file_type_t file_type, logmath_t * lmath)
{
    ngram_model_t *model = NULL;
    switch (file_type) {
    case NGRAM_AUTO:{
            if ((model =
                 ngram_model_trie_read_bin(config, file_name,
                                           lmath)) != NULL)
                break;
            if ((model =
                 ngram_model_trie_read_arpa(config, file_name,
                                            lmath)) != NULL)
                break;
            if ((model =
                 ngram_model_trie_read_dmp(config, file_name,
                                           lmath)) != NULL)
                break;
            return NULL;
        }
    case NGRAM_ARPA:
        model = ngram_model_trie_read_arpa(config, file_name, lmath);
        break;
    case NGRAM_BIN:
        if ((model =
             ngram_model_trie_read_bin(config, file_name, lmath)) != NULL)
            break;
        if ((model =
             ngram_model_trie_read_dmp(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }

    /* Now set weights based on config if present.  Guard against a
     * failed ARPA read, which leaves model NULL. */
    if (model && config) {
        float32 lw = 1.0;
        float32 wip = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");

        ngram_model_apply_weights(model, lw, wip);
    }

    return model;
}

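/*
 * Editor's usage sketch (not part of the original source): load a
 * model with automatic type detection and score a bigram.  The file
 * name and words are placeholders; a NULL config is allowed.
 */
static void
example_read_and_score(logmath_t *lmath)
{
    ngram_model_t *lm;
    int32 score;

    lm = ngram_model_read(NULL, "model.lm.bin", NGRAM_AUTO, lmath);
    if (lm == NULL)
        return;
    /* Score for "world" following "hello". */
    score = ngram_score(lm, "world", "hello", NULL);
    E_INFO("score = %d\n", score);
    ngram_model_free(lm);
}
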
int
ngram_model_write(ngram_model_t * model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO:{
            file_type = ngram_file_name_to_type(file_name);
            /* Default to ARPA (catches .lm and other things) */
            if (file_type == NGRAM_INVALID)
                file_type = NGRAM_ARPA;
            return ngram_model_write(model, file_name, file_type);
        }
    case NGRAM_ARPA:
        return ngram_model_trie_write_arpa(model, file_name);
    case NGRAM_BIN:
        return ngram_model_trie_write_bin(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
}

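/*
 * Editor's sketch (not part of the original source): converting a
 * model between formats is a read followed by a write; both file
 * names are placeholders.
 */
static int
example_convert(logmath_t *lmath)
{
    ngram_model_t *lm;
    int rv;

    lm = ngram_model_read(NULL, "model.arpa", NGRAM_ARPA, lmath);
    if (lm == NULL)
        return -1;
    rv = ngram_model_write(lm, "model.bin", NGRAM_BIN);
    ngram_model_free(lm);
    return rv;
}
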
int32
ngram_model_init(ngram_model_t * base,
                 ngram_funcs_t * funcs,
                 logmath_t * lmath, int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* If this was previously initialized... */
    if (base->n_counts == NULL)
        base->n_counts = (uint32 *) ckd_calloc(n, sizeof(*base->n_counts));
    /* Don't reset weights if logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0;      /* i.e. 1.0 */
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str =
            (char **) ckd_realloc(base->word_str,
                                  n_unigram * sizeof(char *));
    }
    else {
        base->word_str = (char **) ckd_calloc(n_unigram, sizeof(char *));
    }
    /* NOTE: Word lookups are no longer case-insensitive, since we
     * allow other encodings for word strings.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}

ngram_model_t *
ngram_model_retain(ngram_model_t * model)
{
    ++model->refcount;
    return model;
}

void
ngram_model_flush(ngram_model_t * model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush) (model);
}

int
ngram_model_free(ngram_model_t * model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free) (model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}

int
ngram_model_casefold(ngram_model_t * model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}

int
ngram_model_apply_weights(ngram_model_t * model, float32 lw, float32 wip)
{
    return (*model->funcs->apply_weights) (model, lw, wip);
}

float32
ngram_model_get_weights(ngram_model_t * model, int32 * out_log_wip)
{
    if (out_log_wip)
        *out_log_wip = model->log_wip;
    return model->lw;
}

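/*
 * Editor's sketch (not part of the original source): a score
 * returned by ngram_score() is lw * log(P) + log(wip), so applying
 * weights and reading them back looks like this; 9.5 and 0.7 are
 * typical but arbitrary values.
 */
static void
example_weights(ngram_model_t *model)
{
    int32 log_wip;
    float32 lw;

    ngram_model_apply_weights(model, 9.5, 0.7);
    lw = ngram_model_get_weights(model, &log_wip);
    E_INFO("lw = %f, log(wip) = %d\n", lw, log_wip);
}
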
int32
ngram_ng_score(ngram_model_t * model, int32 wid, int32 * history,
               int32 n_hist, int32 * n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1)  /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score) (model, wid, history, n_hist, n_used);

    /* Multiply by unigram in-class weight. */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t * model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

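/*
 * Editor's sketch (not part of the original source): the varargs
 * history is NULL-terminated and given in reverse temporal order, so
 * the trigram "a whole joy" is scored as follows.
 */
static int32
example_score_trigram(ngram_model_t *model)
{
    return ngram_score(model, "joy", "whole", "a", NULL);
}
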
int32
ngram_tg_score(ngram_model_t * model, int32 w3, int32 w2, int32 w1,
               int32 * n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t * model, int32 w2, int32 w1, int32 * n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}

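/*
 * Editor's sketch (not part of the original source): the same
 * trigram through the word-ID interface; n_used reports how much of
 * the history the model actually used (e.g. 1 if it backed off to a
 * bigram).
 */
static int32
example_score_trigram_ids(ngram_model_t *model)
{
    int32 n_used;

    return ngram_tg_score(model,
                          ngram_wid(model, "joy"),
                          ngram_wid(model, "whole"),
                          ngram_wid(model, "a"), &n_used);
}
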
int32
ngram_ng_prob(ngram_model_t * model, int32 wid, int32 * history,
              int32 n_hist, int32 * n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1)  /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score) (model, wid, history,
                                       n_hist, n_used);
    /* Multiply by unigram in-class weight. */
    return prob + class_weight;
}

int32
ngram_probv(ngram_model_t * model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_prob(ngram_model_t * model, const char *const *words, int32 n)
{
    int32 *ctx_id;
    int32 nused;
    int32 prob;
    int32 wid;
    uint32 i;

    ctx_id = (int32 *) ckd_calloc(n - 1, sizeof(*ctx_id));
    for (i = 1; i < (uint32) n; ++i)
        ctx_id[i - 1] = ngram_wid(model, words[i]);

    wid = ngram_wid(model, *words);
    prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused);
    ckd_free(ctx_id);

    return prob;
}

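/*
 * Editor's sketch (not part of the original source): the array form
 * also puts the predicted word first, followed by its history, most
 * recent first.
 */
static int32
example_prob_trigram(ngram_model_t *model)
{
    const char *const words[] = { "joy", "whole", "a" };

    return ngram_prob(model, words, 3);
}
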
int32
ngram_score_to_prob(ngram_model_t * base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32) (prob / base->lw);

    return prob;
}

int32
ngram_unknown_wid(ngram_model_t * model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>; if not found, return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t * model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t * model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

uint32 const *
ngram_model_get_counts(ngram_model_t * model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

int32
ngram_wid(ngram_model_t * model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t * model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}

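/*
 * Editor's sketch (not part of the original source): round-tripping
 * between word strings and word IDs.  In a closed-vocabulary model
 * an unknown word maps to NGRAM_INVALID_WID.
 */
static void
example_wid_roundtrip(ngram_model_t *model)
{
    int32 wid = ngram_wid(model, "joy");

    if (wid != NGRAM_INVALID_WID)
        E_INFO("%d => %s\n", wid, ngram_word(model, wid));
}
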
/**
 * Add a word to the word string and ID mapping.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{

    /* Check for hash collisions. */
    int32 wid;
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omit duplicate word '%s'\n", word);
        return wid;
    }

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) *
                                      model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32
        (model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             model->word_str[model->n_words], (void *) (long) (wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t * model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    /* To add a word to an unwritable model, we would need to make it writable. */
    if (!model->writable) {
        E_WARN("Can't add word '%s' to read-only language model. "
               "Disable mmap with '-mmap no' to make it writable\n", word);
        return -1;
    }

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob =
            (*model->funcs->add_ug) (model, wid,
                                     logmath_log(model->lmath, weight));
    if (prob == 0)
        return -1;

    return wid;
}

ngram_class_t *
ngram_class_new(ngram_model_t * model, int32 tag_wid, int32 start_wid,
                glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* wid_base is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_INFO("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}

int32
ngram_class_add_word(ngram_class_t * lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash =
            ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff,
               NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Stupidest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Do we have any buckets left? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so allocate some more. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 *
                                              sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash, 0xff,
                   lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t * lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t * model,
                           const char *classname,
                           const char *word, float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR
            ("Word %s is not a class tag (call ngram_model_add_class() first)\n",
             classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid,
                                logmath_log(model->lmath, fprob));
}

int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words, const float32 * weights, int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model.  If not, add it. */
    if ((tag_wid =
         ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes *
                                     sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}

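/*
 * Editor's sketch (not part of the original source): defining a
 * small class; the tag and member words are placeholders.  Weights
 * are linear probabilities and are renormalized if their sum drifts
 * far from one.
 */
static int32
example_add_class(ngram_model_t *model)
{
    char *words[] = { "boston", "pittsburgh" };
    float32 weights[] = { 0.5f, 0.5f };

    return ngram_model_add_class(model, "[city]", 1.0f,
                                 words, weights, 2);
}
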
int32
ngram_class_prob(ngram_class_t * lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    /* Valid table entries run from start_wid to start_wid + n_words - 1,
     * so anything at or past the upper bound goes to the hash table. */
    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}

int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) !=
                    classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords =
                    glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0;                     /* Success. */

  error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}

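/*
 * Editor's note (illustrative, not from the original source): a
 * classdef file as parsed above looks like the following, where the
 * per-word weights are optional and default to 1.0:
 *
 *     LMCLASS [city]
 *     boston 0.5
 *     pittsburgh 0.5
 *     END [city]
 */
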
void
classdef_free(classdef_t * classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}


int32
ngram_model_read_classdef(ngram_model_t * model, const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

  error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *) he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}