SphinxBase 5prealpha
ngram_model_trie.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2015 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38#include <string.h>
39#include <assert.h>
40
41#include <sphinxbase/err.h>
42#include <sphinxbase/pio.h>
43#include <sphinxbase/strfuncs.h>
45#include <sphinxbase/byteorder.h>
46
47#include "ngram_model_trie.h"
48
/* Magic string written at the start of a trie binary LM and checked when reading one. */
static const char trie_hdr[] = "Trie Language Model";
/* Magic string expected at the start of a DMP file. */
static const char dmp_hdr[] = "Darpa Trigram LM";
/* Forward declaration of the function table used when initialising models. */
static ngram_funcs_t ngram_model_trie_funcs;
52
53/*
54 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
55 */
56static int
57read_counts_arpa(lineiter_t ** li, uint32 * counts, int *order)
58{
59 int32 ngram, prev_ngram;
60 uint32 ngram_cnt;
61
62 /* skip file until past the '\data\' marker */
63 while (*li) {
64 if (strcmp((*li)->buf, "\\data\\") == 0)
65 break;
66 *li = lineiter_next(*li);
67 }
68
69 if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
70 E_INFO("No \\data\\ mark in LM file\n");
71 return -1;
72 }
73
74 prev_ngram = 0;
75 *order = 0;
76 while ((*li = lineiter_next(*li))) {
77 if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
78 break;
79 if (ngram != prev_ngram + 1) {
81 ("Ngram counts in LM file is not in order. %d goes after %d\n",
82 ngram, prev_ngram);
83 return -1;
84 }
85 prev_ngram = ngram;
86 counts[*order] = ngram_cnt;
87 (*order)++;
88 }
89
90 if (*li == NULL) {
91 E_ERROR("EOF while reading ngram counts\n");
92 return -1;
93 }
94
95 return 0;
96}
97
98static int
99read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
100 unigram_t * unigrams)
101{
102 uint32 i;
103 int n;
104 int n_parts;
105 char *wptr[3];
106
107 while (*li && strcmp((*li)->buf, "\\1-grams:") != 0) {
108 *li = lineiter_next(*li);
109 }
110 if (*li == NULL) {
111 E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
112 return -1;
113 }
114
115 n_parts = 2;
116 for (i = 0; i < count; i++) {
117 unigram_t *unigram;
118
119 *li = lineiter_next(*li);
120 if (*li == NULL) {
121 E_ERROR
122 ("Unexpected end of ARPA file. Failed to read %dth unigram\n",
123 i + 1);
124 return -1;
125 }
126 if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) {
127 E_ERROR("Format error at line %s, Failed to read unigrams\n", (*li)->buf);
128 return -1;
129 }
130
131 unigram = &unigrams[i];
132 unigram->prob =
133 logmath_log10_to_log_float(base->lmath, atof_c(wptr[0]));
134 if (unigram->prob > 0) {
135 E_WARN("Unigram '%s' has positive probability\n", wptr[1]);
136 unigram->prob = 0;
137 }
138 if (n == n_parts + 1) {
139 unigram->bo =
141 atof_c(wptr[2]));
142 }
143 else {
144 unigram->bo = 0.0f;
145 }
146
147 /* TODO: classify float with fpclassify and warn if bad value occurred */
148 base->word_str[i] = ckd_salloc(wptr[1]);
149 }
150
151 /* fill hash-table that maps unigram names to their word ids */
152 for (i = 0; i < count; i++) {
154 (base->wid, base->word_str[i],
155 (void *) (long) i)) != (void *) (long) i) {
156 E_WARN("Duplicate word in dictionary: %s\n",
157 base->word_str[i]);
158 }
159 }
160 return 0;
161}
162
164ngram_model_trie_read_arpa(cmd_ln_t * config,
165 const char *path, logmath_t * lmath)
166{
167 FILE *fp;
168 lineiter_t *li;
169 ngram_model_trie_t *model;
170 ngram_model_t *base;
171 ngram_raw_t **raw_ngrams;
172 int32 is_pipe;
173 uint32 counts[NGRAM_MAX_ORDER];
174 int order;
175 int i;
176
177 E_INFO("Trying to read LM in arpa format\n");
178 if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) {
179 E_ERROR("File %s not found\n", path);
180 return NULL;
181 }
182
183 model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
184 li = lineiter_start_clean(fp);
185 /* Read n-gram counts from file */
186 if (read_counts_arpa(&li, counts, &order) == -1) {
187 ckd_free(model);
188 lineiter_free(li);
189 fclose_comp(fp, is_pipe);
190 return NULL;
191 }
192
193 E_INFO("LM of order %d\n", order);
194 for (i = 0; i < order; i++) {
195 E_INFO("#%d-grams: %d\n", i + 1, counts[i]);
196 }
197
198 base = &model->base;
199 ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
200 (int32) counts[0]);
201 base->writable = TRUE;
202
203 model->trie = lm_trie_create(counts[0], order);
204 if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) {
205 ngram_model_free(base);
206 lineiter_free(li);
207 fclose_comp(fp, is_pipe);
208 return NULL;
209 }
210
211 if (order > 1) {
212 raw_ngrams =
213 ngrams_raw_read_arpa(&li, base->lmath, counts, order,
214 base->wid);
215 if (raw_ngrams == NULL) {
216 ngram_model_free(base);
217 lineiter_free(li);
218 fclose_comp(fp, is_pipe);
219 return NULL;
220 }
221 lm_trie_build(model->trie, raw_ngrams, counts, base->n_counts, order);
222 ngrams_raw_free(raw_ngrams, counts, order);
223 }
224
225 lineiter_free(li);
226 fclose_comp(fp, is_pipe);
227
228 return base;
229}
230
231int
232ngram_model_trie_write_arpa(ngram_model_t * base, const char *path)
233{
234 int i;
235 uint32 j;
236 ngram_model_trie_t *model = (ngram_model_trie_t *) base;
237 FILE *fp = fopen(path, "w");
238 if (!fp) {
239 E_ERROR("Unable to open %s to write arpa LM from trie\n", path);
240 return -1;
241 }
242 fprintf(fp,
243 "This is an ARPA-format language model file, generated by CMU Sphinx\n");
244 /* Write N-gram counts. */
245 fprintf(fp, "\\data\\\n");
246 for (i = 0; i < base->n; ++i) {
247 fprintf(fp, "ngram %d=%d\n", i + 1, base->n_counts[i]);
248 }
249 /* Write 1-grams */
250 fprintf(fp, "\n\\1-grams:\n");
251 for (j = 0; j < base->n_counts[0]; j++) {
252 unigram_t *unigram = &model->trie->unigrams[j];
253 fprintf(fp, "%.4f\t%s",
254 logmath_log_float_to_log10(base->lmath, unigram->prob),
255 base->word_str[j]);
256 if (base->n > 1) {
257 fprintf(fp, "\t%.4f",
258 logmath_log_float_to_log10(base->lmath, unigram->bo));
259 }
260 fprintf(fp, "\n");
261 }
262 /* Write ngrams */
263 if (base->n > 1) {
264 for (i = 2; i <= base->n; ++i) {
265 ngram_raw_t *raw_ngrams =
266 (ngram_raw_t *) ckd_calloc((size_t) base->n_counts[i - 1],
267 sizeof(*raw_ngrams));
268 uint32 raw_ngram_idx;
269 uint32 j;
270 uint32 hist[NGRAM_MAX_ORDER];
271 node_range_t range;
272 raw_ngram_idx = 0;
273 range.begin = range.end = 0;
274
275 /* we need to iterate over a trie here. recursion should do the job */
276 lm_trie_fill_raw_ngram(model->trie, raw_ngrams,
277 &raw_ngram_idx, base->n_counts, range, hist, 0,
278 i, base->n);
279 assert(raw_ngram_idx == base->n_counts[i - 1]);
280 qsort(raw_ngrams, (size_t) base->n_counts[i - 1],
281 sizeof(ngram_raw_t), &ngram_ord_comparator);
282
283 fprintf(fp, "\n\\%d-grams:\n", i);
284 for (j = 0; j < base->n_counts[i - 1]; j++) {
285 int k;
286 fprintf(fp, "%.4f", logmath_log_float_to_log10(base->lmath, raw_ngrams[j].prob));
287 for (k = 0; k < i; k++) {
288 fprintf(fp, "\t%s",
289 base->word_str[raw_ngrams[j].words[k]]);
290 }
291 ckd_free(raw_ngrams[j].words);
292 if (i < base->n) {
293 fprintf(fp, "\t%.4f", logmath_log_float_to_log10(base->lmath, raw_ngrams[j].backoff));
294 }
295 fprintf(fp, "\n");
296 }
297 ckd_free(raw_ngrams);
298 }
299 }
300 fprintf(fp, "\n\\end\\\n");
301 return fclose(fp);
302}
303
304static void
305read_word_str(ngram_model_t * base, FILE * fp)
306{
307 int32 k;
308 uint32 i, j;
309 char *tmp_word_str;
310 /* read ascii word strings */
311 base->writable = TRUE;
312 fread(&k, sizeof(k), 1, fp);
313 tmp_word_str = (char *) ckd_calloc((size_t) k, 1);
314 fread(tmp_word_str, 1, (size_t) k, fp);
315
316 /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
317 for (i = 0, j = 0; i < (uint32) k; i++)
318 if (tmp_word_str[i] == '\0')
319 j++;
320 if (j != base->n_counts[0]) {
321 E_ERROR
322 ("Error reading word strings (%d doesn't match n_unigrams %d)\n",
323 j, base->n_counts[0]);
324 }
325
326 /* Break up string just read into words */
327 j = 0;
328 for (i = 0; i < base->n_counts[0]; i++) {
329 base->word_str[i] = ckd_salloc(tmp_word_str + j);
330 if (hash_table_enter(base->wid, base->word_str[i],
331 (void *) (long) i) != (void *) (long) i) {
332 E_WARN("Duplicate word in dictionary: %s\n",
333 base->word_str[i]);
334 }
335 j += strlen(base->word_str[i]) + 1;
336 }
337 free(tmp_word_str);
338}
339
341ngram_model_trie_read_bin(cmd_ln_t * config,
342 const char *path, logmath_t * lmath)
343{
344 int32 is_pipe;
345 FILE *fp;
346 size_t hdr_size;
347 char *hdr;
348 int cmp_res;
349 uint8 i, order;
350 uint32 counts[NGRAM_MAX_ORDER];
351 ngram_model_trie_t *model;
352 ngram_model_t *base;
353
354 E_INFO("Trying to read LM in trie binary format\n");
355 if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
356 E_ERROR("File %s not found\n", path);
357 return NULL;
358 }
359 hdr_size = strlen(trie_hdr);
360 hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
361 fread(hdr, sizeof(*hdr), hdr_size, fp);
362 cmp_res = strcmp(hdr, trie_hdr);
363 ckd_free(hdr);
364 if (cmp_res) {
365 E_INFO("Header doesn't match\n");
366 fclose_comp(fp, is_pipe);
367 return NULL;
368 }
369 model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
370 base = &model->base;
371 fread(&order, sizeof(order), 1, fp);
372 for (i = 0; i < order; i++) {
373 fread(&counts[i], sizeof(counts[i]), 1, fp);
374 }
375 ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
376 (int32) counts[0]);
377 for (i = 0; i < order; i++) {
378 base->n_counts[i] = counts[i];
379 }
380
381 model->trie = lm_trie_read_bin(counts, order, fp);
382 read_word_str(base, fp);
383 fclose_comp(fp, is_pipe);
384
385 return base;
386}
387
388static void
389write_word_str(FILE * fp, ngram_model_t * model)
390{
391 int32 k;
392 uint32 i;
393
394 k = 0;
395 for (i = 0; i < model->n_counts[0]; i++)
396 k += strlen(model->word_str[i]) + 1;
397 fwrite(&k, sizeof(k), 1, fp);
398 for (i = 0; i < model->n_counts[0]; i++)
399 fwrite(model->word_str[i], 1, strlen(model->word_str[i]) + 1, fp);
400}
401
402int
403ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
404{
405 int i;
406 int32 is_pipe;
407 ngram_model_trie_t *model = (ngram_model_trie_t *) base;
408 FILE *fp = fopen_comp(path, "wb", &is_pipe);
409 if (!fp) {
410 E_ERROR("Unable to open %s to write binary trie LM\n", path);
411 return -1;
412 }
413
414 fwrite(trie_hdr, sizeof(*trie_hdr), strlen(trie_hdr), fp);
415 fwrite(&model->base.n, sizeof(model->base.n), 1, fp);
416 for (i = 0; i < model->base.n; i++) {
417 fwrite(&model->base.n_counts[i], sizeof(model->base.n_counts[i]),
418 1, fp);
419 }
420 lm_trie_write_bin(model->trie, base->n_counts[0], fp);
421 write_word_str(fp, base);
422 fclose_comp(fp, is_pipe);
423 return 0;
424}
425
427ngram_model_trie_read_dmp(cmd_ln_t * config,
428 const char *file_name, logmath_t * lmath)
429{
430 uint8 do_swap;
431 int32 is_pipe;
432 int32 k;
433 uint32 j;
434 int32 vn, ts;
435 int32 count;
436 uint32 counts[3];
437 uint32 *unigram_next;
438 int order;
439 char str[1024];
440 FILE *fp;
441 ngram_model_trie_t *model;
442 ngram_model_t *base;
443 ngram_raw_t **raw_ngrams;
444
445 E_INFO("Trying to read LM in dmp format\n");
446 if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
447 E_ERROR("Dump file %s not found\n", file_name);
448 return NULL;
449 }
450
451 do_swap = FALSE;
452 fread(&k, sizeof(k), 1, fp);
453 if (k != strlen(dmp_hdr) + 1) {
454 SWAP_INT32(&k);
455 if (k != strlen(dmp_hdr) + 1) {
456 E_ERROR
457 ("Wrong magic header size number %x: %s is not a dump file\n",
458 k, file_name);
459 return NULL;
460 }
461 do_swap = 1;
462 }
463 if (fread(str, 1, k, fp) != (size_t) k) {
464 E_ERROR("Cannot read header\n");
465 return NULL;
466 }
467 if (strncmp(str, dmp_hdr, k) != 0) {
468 E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr);
469 return NULL;
470 }
471
472 if (fread(&k, sizeof(k), 1, fp) != 1)
473 return NULL;
474 if (do_swap)
475 SWAP_INT32(&k);
476 if (fread(str, 1, k, fp) != (size_t) k) {
477 E_ERROR("Cannot read LM filename in header\n");
478 return NULL;
479 }
480
481 /* read version#, if present (must be <= 0) */
482 if (fread(&vn, sizeof(vn), 1, fp) != 1)
483 return NULL;
484 if (do_swap)
485 SWAP_INT32(&vn);
486 if (vn <= 0) {
487 /* read and don't compare timestamps (we don't care) */
488 if (fread(&ts, sizeof(ts), 1, fp) != 1)
489 return NULL;
490 if (do_swap)
491 SWAP_INT32(&ts);
492
493 /* read and skip format description */
494 for (;;) {
495 if (fread(&k, sizeof(k), 1, fp) != 1)
496 return NULL;
497 if (do_swap)
498 SWAP_INT32(&k);
499 if (k == 0)
500 break;
501 if (fread(str, 1, k, fp) != (size_t) k) {
502 E_ERROR("Failed to read word\n");
503 return NULL;
504 }
505 }
506 /* read model->ucount */
507 if (fread(&count, sizeof(count), 1, fp) != 1)
508 return NULL;
509 if (do_swap)
510 SWAP_INT32(&count);
511 counts[0] = count;
512 }
513 else {
514 counts[0] = vn;
515 }
516 /* read model->bcount, tcount */
517 if (fread(&count, sizeof(count), 1, fp) != 1)
518 return NULL;
519 if (do_swap)
520 SWAP_INT32(&count);
521 counts[1] = count;
522 if (fread(&count, sizeof(count), 1, fp) != 1)
523 return NULL;
524 if (do_swap)
525 SWAP_INT32(&count);
526 counts[2] = count;
527 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);
528
529 model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
530 base = &model->base;
531 if (counts[2] > 0)
532 order = 3;
533 else if (counts[1] > 0)
534 order = 2;
535 else
536 order = 1;
537 ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
538 (int32) counts[0]);
539
540 model->trie = lm_trie_create(counts[0], order);
541
542 unigram_next =
543 (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next));
544 for (j = 0; j <= (int32) counts[0]; j++) {
545 int32 bigrams;
546 int32 mapid;
547 dmp_weight_t weightp;
548 dmp_weight_t weightb;
549
550 /* Skip over the mapping ID, we don't care about it. */
551 /* Read the weights from actual unigram structure. */
552 fread(&mapid, sizeof(int32), 1, fp);
553 fread(&weightp, sizeof(weightp), 1, fp);
554 fread(&weightb, sizeof(weightb), 1, fp);
555 fread(&bigrams, sizeof(int32), 1, fp);
556 if (do_swap) {
557 SWAP_INT32(&weightp.l);
558 SWAP_INT32(&weightb.l);
559 SWAP_INT32(&bigrams);
560 }
561 model->trie->unigrams[j].prob = logmath_log10_to_log_float(lmath, weightp.f);
562 model->trie->unigrams[j].bo = logmath_log10_to_log_float(lmath, weightb.f);
563 model->trie->unigrams[j].next = bigrams;
564 unigram_next[j] = bigrams;
565 }
566
567 if (order > 1) {
568 raw_ngrams =
569 ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
570 do_swap);
571 if (raw_ngrams == NULL) {
572 ngram_model_free(base);
573 ckd_free(unigram_next);
574 fclose_comp(fp, is_pipe);
575 return NULL;
576 }
577 lm_trie_build(model->trie, raw_ngrams, counts, base->n_counts, order);
578 ngrams_raw_free(raw_ngrams, counts, order);
579 }
580
581 /* Sentinel unigram and bigrams read before */
582 ckd_free(unigram_next);
583
584 /* read ascii word strings */
585 read_word_str(base, fp);
586
587 fclose_comp(fp, is_pipe);
588 return base;
589}
590
591static void
592ngram_model_trie_free(ngram_model_t * base)
593{
594 ngram_model_trie_t *model = (ngram_model_trie_t *) base;
595 lm_trie_free(model->trie);
596}
597
598static int
599trie_apply_weights(ngram_model_t * base, float32 lw, float32 wip)
600{
601 /* just update weights that are going to be used on score calculation */
602 base->lw = lw;
603 base->log_wip = logmath_log(base->lmath, wip);
604 return 0;
605}
606
607static int32
608weight_score(ngram_model_t * base, int32 score)
609{
610 return (int32) (score * base->lw + base->log_wip);
611}
612
613static int32
614ngram_model_trie_raw_score(ngram_model_t * base, int32 wid, int32 * hist,
615 int32 n_hist, int32 * n_used)
616{
617 int32 i;
618 ngram_model_trie_t *model = (ngram_model_trie_t *) base;
619
620 if (n_hist > model->base.n - 1)
621 n_hist = model->base.n - 1;
622 for (i = 0; i < n_hist; i++) {
623 if (hist[i] < 0) {
624 n_hist = i;
625 break;
626 }
627 }
628
629 return (int32) lm_trie_score(model->trie, model->base.n, wid, hist,
630 n_hist, n_used);
631}
632
633static int32
634ngram_model_trie_score(ngram_model_t * base, int32 wid, int32 * hist,
635 int32 n_hist, int32 * n_used)
636{
637 return weight_score(base,
638 ngram_model_trie_raw_score(base, wid, hist, n_hist,
639 n_used));
640}
641
642static int32
643lm_trie_add_ug(ngram_model_t * base, int32 wid, int32 lweight)
644{
645 ngram_model_trie_t *model = (ngram_model_trie_t *) base;
646
647 /* This would be very bad if this happened! */
648 assert(!NGRAM_IS_CLASSWID(wid));
649
650 /* Reallocate unigram array. */
651 model->trie->unigrams =
652 (unigram_t *) ckd_realloc(model->trie->unigrams,
653 sizeof(*model->trie->unigrams) *
654 (base->n_1g_alloc + 1));
655 memset(model->trie->unigrams + (base->n_counts[0] + 1), 0,
656 (size_t) (base->n_1g_alloc -
657 base->n_counts[0]) * sizeof(*model->trie->unigrams));
658 ++base->n_counts[0];
659 lweight += logmath_log(base->lmath, 1.0 / base->n_counts[0]);
660 model->trie->unigrams[wid + 1].next = model->trie->unigrams[wid].next;
661 model->trie->unigrams[wid].prob = (float) lweight;
662 /* This unigram by definition doesn't participate in any bigrams,
663 * so its backoff weight is undefined and next pointer same as in finish unigram*/
664 model->trie->unigrams[wid].bo = 0;
665 /* Finally, increase the unigram count */
666 /* FIXME: Note that this can actually be quite bogus due to the
667 * presence of class words. If wid falls outside the unigram
668 * count, increase it to compensate, at the cost of no longer
669 * really knowing how many unigrams we have :( */
670 if ((uint32) wid >= base->n_counts[0])
671 base->n_counts[0] = wid + 1;
672
673 return (int32) weight_score(base, lweight);
674}
675
676static void
677lm_trie_flush(ngram_model_t * base)
678{
679 ngram_model_trie_t *model = (ngram_model_trie_t *) base;
680 lm_trie_t *trie = model->trie;
681 memset(trie->hist_cache, -1, sizeof(trie->hist_cache));
682 memset(trie->backoff_cache, 0, sizeof(trie->backoff_cache));
683 return;
684}
685
/* Function table passed to ngram_model_init() by the readers in this file;
 * entries are positional, so their order must not change. */
static ngram_funcs_t ngram_model_trie_funcs = {
    ngram_model_trie_free,      /* free */
    trie_apply_weights,         /* apply_weights */
    ngram_model_trie_score,     /* score */
    ngram_model_trie_raw_score, /* raw_score */
    lm_trie_add_ug,             /* add_ug */
    lm_trie_flush               /* flush */
};
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:244
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
Definition ckd_alloc.h:258
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition err.h:104
#define E_INFO(...)
Print logging information to standard error stream.
Definition err.h:114
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition err.h:99
#define E_WARN(...)
Print warning message to error log.
Definition err.h:109
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition hash_table.c:501
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
Definition logmath.c:480
SPHINXBASE_EXPORT float64 logmath_log_float_to_log10(logmath_t *lmath, float log_p)
Convert float log in base B to base 10 log.
Definition logmath.c:496
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
Definition logmath.c:447
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
file IO related operations.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition pio.c:368
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition pio.c:184
SPHINXBASE_EXPORT lineiter_t * lineiter_start_clean(FILE *fh)
Start reading lines from a file, skip comments and trim lines.
Definition pio.c:288
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i....
Definition pio.c:107
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition strfuncs.c:123
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition strfuncs.c:55
Opaque structure used to hold the results of command-line parsing.
Line iterator for files.
Definition pio.h:177
Implementation-specific functions for operating on ngram_model_t objects.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 log_wip
Log of word insertion penalty.
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
uint32 * n_counts
Counts for 1, 2, 3, ... grams.
hash_table_t * wid
Mapping of unigram names to word IDs.
float32 lw
Language model scaling factor.
uint8 writable
Are word strings writable?
char ** word_str
Unigram names.
lm_trie_t * trie
Trie structure that stores ngram relations and weights.
ngram_model_t base
Base ngram_model_t structure.