SphinxBase 5prealpha
|
N-Gram language models. More...
#include <stdarg.h>
#include <sphinxbase/sphinxbase_export.h>
#include <sphinxbase/prim_type.h>
#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/logmath.h>
#include <sphinxbase/mmio.h>
Go to the source code of this file.
Macros | |
#define | NGRAM_INVALID_WID -1 |
Impossible word ID. | |
Typedefs | |
typedef struct ngram_model_s | ngram_model_t |
Abstract type representing an N-Gram based language model. | |
typedef struct ngram_class_s | ngram_class_t |
Abstract type representing a word class in an N-Gram model. | |
typedef enum ngram_file_type_e | ngram_file_type_t |
File types for N-Gram files. | |
typedef enum ngram_case_e | ngram_case_t |
Constants for case folding. | |
typedef struct ngram_iter_s | ngram_iter_t |
M-gram iterator object. | |
typedef struct ngram_model_set_iter_s | ngram_model_set_iter_t |
Iterator over language models in a set. | |
Enumerations | |
enum | ngram_file_type_e { NGRAM_INVALID = -1 , NGRAM_AUTO , NGRAM_ARPA , NGRAM_BIN } |
File types for N-Gram files. More... | |
enum | ngram_case_e { NGRAM_UPPER , NGRAM_LOWER } |
Constants for case folding. More... | |
Functions | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_read (cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath) |
Read an N-Gram model from a file on disk. | |
SPHINXBASE_EXPORT int | ngram_model_write (ngram_model_t *model, const char *file_name, ngram_file_type_t format) |
Write an N-Gram model to disk. | |
SPHINXBASE_EXPORT ngram_file_type_t | ngram_file_name_to_type (const char *file_name) |
Guess the file type for an N-Gram model from the filename. | |
SPHINXBASE_EXPORT ngram_file_type_t | ngram_str_to_type (const char *str_name) |
Get the N-Gram file type from a string. | |
SPHINXBASE_EXPORT char const * | ngram_type_to_str (int type) |
Get the canonical name for an N-Gram file type. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_retain (ngram_model_t *model) |
Retain ownership of an N-Gram model. | |
SPHINXBASE_EXPORT int | ngram_model_free (ngram_model_t *model) |
Release memory associated with an N-Gram model. | |
SPHINXBASE_EXPORT int | ngram_model_casefold (ngram_model_t *model, int kase) |
Case-fold word strings in an N-Gram model. | |
SPHINXBASE_EXPORT int | ngram_model_apply_weights (ngram_model_t *model, float32 lw, float32 wip) |
Apply a language weight, insertion penalty, and unigram weight to a language model. | |
SPHINXBASE_EXPORT float32 | ngram_model_get_weights (ngram_model_t *model, int32 *out_log_wip) |
Get the current weights from a language model. | |
SPHINXBASE_EXPORT int32 | ngram_score (ngram_model_t *model, const char *word,...) |
Get the score (scaled, interpolated log-probability) for a general N-Gram. | |
SPHINXBASE_EXPORT int32 | ngram_tg_score (ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used) |
Quick trigram score lookup. | |
SPHINXBASE_EXPORT int32 | ngram_bg_score (ngram_model_t *model, int32 w2, int32 w1, int32 *n_used) |
Quick bigram score lookup. | |
SPHINXBASE_EXPORT int32 | ngram_ng_score (ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used) |
Quick general N-Gram score lookup. | |
SPHINXBASE_EXPORT int32 | ngram_probv (ngram_model_t *model, const char *word,...) |
Get the "raw" log-probability for a general N-Gram. | |
SPHINXBASE_EXPORT int32 | ngram_prob (ngram_model_t *model, const char *const *words, int32 n) |
Get the "raw" log-probability for a general N-Gram. | |
SPHINXBASE_EXPORT int32 | ngram_ng_prob (ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used) |
Quick "raw" probability lookup for a general N-Gram. | |
SPHINXBASE_EXPORT int32 | ngram_score_to_prob (ngram_model_t *model, int32 score) |
Convert score to "raw" log-probability. | |
SPHINXBASE_EXPORT int32 | ngram_wid (ngram_model_t *model, const char *word) |
Look up numerical word ID. | |
SPHINXBASE_EXPORT const char * | ngram_word (ngram_model_t *model, int32 wid) |
Look up word string for numerical word ID. | |
SPHINXBASE_EXPORT int32 | ngram_unknown_wid (ngram_model_t *model) |
Get the unknown word ID for a language model. | |
SPHINXBASE_EXPORT int32 | ngram_zero (ngram_model_t *model) |
Get the "zero" log-probability value for a language model. | |
SPHINXBASE_EXPORT int32 | ngram_model_get_size (ngram_model_t *model) |
Get the order of the N-gram model (i.e. the "N" in "N-gram"). | |
SPHINXBASE_EXPORT uint32 const * | ngram_model_get_counts (ngram_model_t *model) |
Get the counts of the various N-grams in the model. | |
SPHINXBASE_EXPORT ngram_iter_t * | ngram_model_mgrams (ngram_model_t *model, int m) |
Iterate over all M-grams. | |
SPHINXBASE_EXPORT ngram_iter_t * | ngram_iter (ngram_model_t *model, const char *word,...) |
Get an iterator over M-grams pointing to the specified M-gram. | |
SPHINXBASE_EXPORT ngram_iter_t * | ngram_ng_iter (ngram_model_t *model, int32 wid, int32 *history, int32 n_hist) |
Get an iterator over M-grams pointing to the specified M-gram. | |
SPHINXBASE_EXPORT int32 const * | ngram_iter_get (ngram_iter_t *itor, int32 *out_score, int32 *out_bowt) |
Get information from the current M-gram in an iterator. | |
SPHINXBASE_EXPORT ngram_iter_t * | ngram_iter_successors (ngram_iter_t *itor) |
Iterate over all M-gram successors of an M-1-gram. | |
SPHINXBASE_EXPORT ngram_iter_t * | ngram_iter_next (ngram_iter_t *itor) |
Advance an M-gram iterator. | |
SPHINXBASE_EXPORT void | ngram_iter_free (ngram_iter_t *itor) |
Terminate an M-gram iterator. | |
SPHINXBASE_EXPORT int32 | ngram_model_add_word (ngram_model_t *model, const char *word, float32 weight) |
Add a word (unigram) to the language model. | |
SPHINXBASE_EXPORT int32 | ngram_model_read_classdef (ngram_model_t *model, const char *file_name) |
Read a class definition file and add classes to a language model. | |
SPHINXBASE_EXPORT int32 | ngram_model_add_class (ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words) |
Add a new class to a language model. | |
SPHINXBASE_EXPORT int32 | ngram_model_add_class_word (ngram_model_t *model, const char *classname, const char *word, float32 weight) |
Add a word to a class in a language model. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_init (cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models) |
Create a set of language models sharing a common space of word IDs. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_read (cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath) |
Read a set of language models from a control file. | |
SPHINXBASE_EXPORT int32 | ngram_model_set_count (ngram_model_t *set) |
Returns the number of language models in a set. | |
SPHINXBASE_EXPORT ngram_model_set_iter_t * | ngram_model_set_iter (ngram_model_t *set) |
Begin iterating over language models in a set. | |
SPHINXBASE_EXPORT ngram_model_set_iter_t * | ngram_model_set_iter_next (ngram_model_set_iter_t *itor) |
Move to the next language model in a set. | |
SPHINXBASE_EXPORT void | ngram_model_set_iter_free (ngram_model_set_iter_t *itor) |
Finish iteration over a language model set. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_iter_model (ngram_model_set_iter_t *itor, char const **lmname) |
Get language model and associated name from an iterator. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_select (ngram_model_t *set, const char *name) |
Select a single language model from a set for scoring. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_lookup (ngram_model_t *set, const char *name) |
Look up a language model by name from a set. | |
SPHINXBASE_EXPORT const char * | ngram_model_set_current (ngram_model_t *set) |
Get the current language model name, if any. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_interp (ngram_model_t *set, const char **names, const float32 *weights) |
Set interpolation weights for a set and enables interpolation. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_add (ngram_model_t *set, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap) |
Add a language model to a set. | |
SPHINXBASE_EXPORT ngram_model_t * | ngram_model_set_remove (ngram_model_t *set, const char *name, int reuse_widmap) |
Remove a language model from a set. | |
SPHINXBASE_EXPORT void | ngram_model_set_map_words (ngram_model_t *set, const char **words, int32 n_words) |
Set the word-to-ID mapping for this model set. | |
SPHINXBASE_EXPORT int32 | ngram_model_set_current_wid (ngram_model_t *set, int32 set_wid) |
Query the word-ID mapping for the current language model. | |
SPHINXBASE_EXPORT int32 | ngram_model_set_known_wid (ngram_model_t *set, int32 set_wid) |
Test whether a word ID corresponds to a known word in the current state of the language model set. | |
SPHINXBASE_EXPORT void | ngram_model_flush (ngram_model_t *lm) |
Flush any cached N-Gram information. | |
N-Gram language models.
Definition in file ngram_model.h.
#define NGRAM_INVALID_WID -1 |
Impossible word ID.
Definition at line 83 of file ngram_model.h.
Referenced by ngram_model_add_class(), ngram_model_add_class_word(), ngram_model_add_word(), ngram_model_set_current_wid(), ngram_ng_prob(), ngram_ng_score(), and ngram_unknown_wid().
typedef struct ngram_class_s ngram_class_t |
Abstract type representing a word class in an N-Gram model.
Definition at line 71 of file ngram_model.h.
typedef struct ngram_iter_s ngram_iter_t |
M-gram iterator object.
Definition at line 359 of file ngram_model.h.
typedef struct ngram_model_set_iter_s ngram_model_set_iter_t |
Iterator over language models in a set.
Definition at line 556 of file ngram_model.h.
typedef struct ngram_model_s ngram_model_t |
Abstract type representing an N-Gram based language model.
Definition at line 66 of file ngram_model.h.
enum ngram_case_e |
Constants for case folding.
Definition at line 163 of file ngram_model.h.
enum ngram_file_type_e |
File types for N-Gram files.
Enumerator | |
---|---|
NGRAM_INVALID | Not a valid file type. |
NGRAM_AUTO | Determine file type automatically. |
NGRAM_ARPA | ARPABO text format (the standard). |
NGRAM_BIN | Sphinx .DMP format. |
Definition at line 76 of file ngram_model.h.
SPHINXBASE_EXPORT int32 ngram_bg_score | ( | ngram_model_t * | model, |
int32 | w2, | ||
int32 | w1, | ||
int32 * | n_used ) |
Quick bigram score lookup.
Definition at line 448 of file ngram_model.c.
References ngram_bg_score(), and ngram_ng_score().
Referenced by ngram_bg_score().
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type | ( | const char * | file_name | ) |
Guess the file type for an N-Gram model from the filename.
Definition at line 63 of file ngram_model.c.
References NGRAM_ARPA, NGRAM_BIN, ngram_file_name_to_type(), NGRAM_INVALID, strcmp_nocase(), and strncmp_nocase().
Referenced by ngram_file_name_to_type(), and ngram_model_write().
SPHINXBASE_EXPORT int32 const * ngram_iter_get | ( | ngram_iter_t * | itor, |
int32 * | out_score, | ||
int32 * | out_bowt ) |
Get information from the current M-gram in an iterator.
out_score | Output: Score for this M-gram (including any word penalty and language weight). |
out_bowt | Output: Backoff weight for this M-gram. |
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors | ( | ngram_iter_t * | itor | ) |
Iterate over all M-gram successors of an M-1-gram.
itor | Iterator pointing to the M-1-gram to get successors of. |
SPHINXBASE_EXPORT int32 ngram_model_add_class | ( | ngram_model_t * | model, |
const char * | classname, | ||
float32 | classweight, | ||
char ** | words, | ||
const float32 * | weights, | ||
int32 | n_words ) |
Add a new class to a language model.
If classname already exists in the unigram set for model, then it will be converted to a class tag, and classweight will be ignored. Otherwise, a new unigram will be created as in ngram_model_add_word().
Definition at line 831 of file ngram_model.c.
References ckd_calloc, ckd_realloc, ngram_model_s::classes, E_ERROR, glist_add_float32(), glist_free(), glist_reverse(), ngram_model_s::n_classes, NGRAM_INVALID_WID, ngram_model_add_class(), ngram_model_add_word(), ngram_unknown_wid(), and ngram_wid().
Referenced by ngram_model_add_class(), ngram_model_read_classdef(), and ngram_model_set_read().
SPHINXBASE_EXPORT int32 ngram_model_add_class_word | ( | ngram_model_t * | model, |
const char * | classname, | ||
const char * | word, | ||
float32 | weight ) |
Add a word to a class in a language model.
model | The model to add a word to. |
classname | Name of the class to add this word to. |
word | Text of the word to add. |
weight | Weight of this word relative to the within-class uniform distribution. |
Definition at line 779 of file ngram_model.c.
References ngram_model_s::classes, E_ERROR, ngram_model_s::lmath, logmath_log(), ngram_model_s::n_classes, ngram_class_s::n_hash, ngram_class_s::n_hash_inuse, ngram_class_s::n_words, NGRAM_INVALID_WID, ngram_model_add_class_word(), ngram_wid(), ngram_class_s::prob1, ngram_class_s::ngram_hash_s::prob1, ngram_class_s::tag_wid, and ngram_class_s::ngram_hash_s::wid.
Referenced by ngram_model_add_class_word().
SPHINXBASE_EXPORT int32 ngram_model_add_word | ( | ngram_model_t * | model, |
const char * | word, | ||
float32 | weight ) |
Add a word (unigram) to the language model.
model | The model to add a word to. |
word | Text of the word to add. |
weight | Weight of this word relative to the uniform distribution. |
Definition at line 649 of file ngram_model.c.
References ngram_funcs_s::add_ug, E_WARN, ngram_model_s::funcs, ngram_model_s::lmath, ngram_model_s::log_zero, logmath_log(), NGRAM_INVALID_WID, ngram_model_add_word(), and ngram_model_s::writable.
Referenced by ngram_model_add_class(), and ngram_model_add_word().
SPHINXBASE_EXPORT int ngram_model_apply_weights | ( | ngram_model_t * | model, |
float32 | lw, | ||
float32 | wip ) |
Apply a language weight, insertion penalty, and unigram weight to a language model.
This will change the values output by ngram_score() and friends. This is done for efficiency since in decoding, these are the only values we actually need. Call ngram_prob() if you want the "raw" N-Gram probability estimate.
To remove all weighting, call ngram_model_apply_weights(model, 1.0, 1.0).
Definition at line 360 of file ngram_model.c.
References ngram_funcs_s::apply_weights, ngram_model_s::funcs, and ngram_model_apply_weights().
Referenced by ngram_model_apply_weights(), and ngram_model_read().
SPHINXBASE_EXPORT int ngram_model_casefold | ( | ngram_model_t * | model, |
int | kase ) |
Case-fold word strings in an N-Gram model.
WARNING: This is not Unicode aware, so any non-ASCII characters will not be converted.
Definition at line 308 of file ngram_model.c.
References ckd_salloc, E_WARN, hash_table_enter_int32, hash_table_free(), hash_table_new(), lcase(), ngram_model_s::n_words, ngram_model_casefold(), ucase(), ngram_model_s::wid, ngram_model_s::word_str, and ngram_model_s::writable.
Referenced by ngram_model_casefold().
SPHINXBASE_EXPORT void ngram_model_flush | ( | ngram_model_t * | lm | ) |
Flush any cached N-Gram information.
Definition at line 256 of file ngram_model.c.
References ngram_funcs_s::flush, ngram_model_s::funcs, and ngram_model_flush().
Referenced by ngram_model_flush().
SPHINXBASE_EXPORT int ngram_model_free | ( | ngram_model_t * | model | ) |
Release memory associated with an N-Gram model.
Definition at line 263 of file ngram_model.c.
References ckd_free(), ngram_model_s::classes, ngram_funcs_s::free, ngram_model_s::funcs, hash_table_free(), ngram_model_s::n_classes, ngram_model_s::n_counts, ngram_class_s::n_hash, ngram_model_s::n_words, ngram_class_s::n_words, ngram_model_free(), ngram_model_s::refcount, ngram_class_s::start_wid, ngram_model_s::wid, ngram_class_s::ngram_hash_s::wid, ngram_model_s::word_str, and ngram_model_s::writable.
Referenced by ngram_model_free(), and ngram_model_set_read().
SPHINXBASE_EXPORT uint32 const * ngram_model_get_counts | ( | ngram_model_t * | model | ) |
Get the counts of the various N-grams in the model.
Definition at line 577 of file ngram_model.c.
References ngram_model_s::n_counts, and ngram_model_get_counts().
Referenced by ngram_model_get_counts().
SPHINXBASE_EXPORT int32 ngram_model_get_size | ( | ngram_model_t * | model | ) |
Get the order of the N-gram model (i.e. the "N" in "N-gram").
Definition at line 569 of file ngram_model.c.
References ngram_model_s::n, and ngram_model_get_size().
Referenced by ngram_model_get_size().
SPHINXBASE_EXPORT float32 ngram_model_get_weights | ( | ngram_model_t * | model, |
int32 * | out_log_wip ) |
Get the current weights from a language model.
model | The model in question. |
out_log_wip | Output: (optional) logarithm of word insertion penalty. |
Definition at line 366 of file ngram_model.c.
References ngram_model_s::log_wip, ngram_model_s::lw, and ngram_model_get_weights().
Referenced by ngram_model_get_weights().
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams | ( | ngram_model_t * | model, |
int | m ) |
Iterate over all M-grams.
model | Language model to query. |
m | Order of the M-Grams requested minus one (i.e. order of the history) |
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read | ( | cmd_ln_t * | config, |
const char * | file_name, | ||
ngram_file_type_t | file_type, | ||
logmath_t * | lmath ) |
Read an N-Gram model from a file on disk.
config | Optional pointer to a set of command-line arguments. Recognized arguments are: |
file_name | path to the file to read. |
file_type | type of the file, or NGRAM_AUTO to determine automatically. |
lmath | Log-math parameters to use for probability calculations. Ownership of this object is assumed by the newly created ngram_model_t, and you should not attempt to free it manually. If you wish to reuse it elsewhere, you must retain it with logmath_retain(). |
Definition at line 124 of file ngram_model.c.
References cmd_ln_exists_r(), E_ERROR, NGRAM_ARPA, NGRAM_AUTO, NGRAM_BIN, ngram_model_apply_weights(), and ngram_model_read().
Referenced by ngram_model_read(), and ngram_model_set_read().
SPHINXBASE_EXPORT int32 ngram_model_read_classdef | ( | ngram_model_t * | model, |
const char * | file_name ) |
Read a class definition file and add classes to a language model.
This function assumes that the class tags have already been defined as unigrams in the language model. All words in the class definition will be added to the vocabulary as special in-class words. For this reason it is necessary that they not have the same names as any words in the general unigram distribution. The convention is to suffix them with ":class_tag", where class_tag is the class tag minus the enclosing square brackets.
Definition at line 1027 of file ngram_model.c.
References ckd_free(), glist_free(), gnode_ptr, hash_table_free(), hash_table_new(), hash_table_tolist(), ngram_model_add_class(), ngram_model_read_classdef(), and hash_entry_s::val.
Referenced by ngram_model_read_classdef().
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain | ( | ngram_model_t * | model | ) |
Retain ownership of an N-Gram model.
Definition at line 249 of file ngram_model.c.
References ngram_model_retain(), and ngram_model_s::refcount.
Referenced by ngram_model_retain(), and ngram_model_set_init().
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_add | ( | ngram_model_t * | set, |
ngram_model_t * | model, | ||
const char * | name, | ||
float32 | weight, | ||
int | reuse_widmap ) |
Add a language model to a set.
set | The language model set to add to. |
model | The language model to add. |
name | The name to associate with this model. |
weight | Interpolation weight for this model, relative to the uniform distribution. 1.0 is a safe value. |
reuse_widmap | Reuse the existing word-ID mapping in set. Any new words present in model will not be added to the word-ID mapping in this case. |
Definition at line 524 of file ngram_model_set.c.
References ckd_calloc_2d, ckd_free_2d(), ckd_realloc, ckd_salloc, ngram_model_s::lmath, ngram_model_set_s::lms, logmath_log(), ngram_model_set_s::lweights, ngram_model_set_s::maphist, ngram_model_s::n, ngram_model_set_s::n_models, ngram_model_s::n_words, ngram_model_set_s::names, ngram_wid(), ngram_model_set_s::widmap, and ngram_model_s::word_str.
SPHINXBASE_EXPORT int32 ngram_model_set_count | ( | ngram_model_t * | set | ) |
Returns the number of language models in a set.
Definition at line 368 of file ngram_model_set.c.
References ngram_model_set_s::n_models.
SPHINXBASE_EXPORT const char * ngram_model_set_current | ( | ngram_model_t * | set | ) |
Get the current language model name, if any.
Definition at line 451 of file ngram_model_set.c.
References ngram_model_set_s::cur, and ngram_model_set_s::names.
SPHINXBASE_EXPORT int32 ngram_model_set_current_wid | ( | ngram_model_t * | set, |
int32 | set_wid ) |
Query the word-ID mapping for the current language model.
Returns the local word ID in the current language model, or NGRAM_INVALID_WID if set_wid is invalid or interpolation is enabled.
Definition at line 462 of file ngram_model_set.c.
References ngram_model_set_s::cur, ngram_model_s::n_words, NGRAM_INVALID_WID, and ngram_model_set_s::widmap.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_init | ( | cmd_ln_t * | config, |
ngram_model_t ** | models, | ||
char ** | names, | ||
const float32 * | weights, | ||
int32 | n_models ) |
Create a set of language models sharing a common space of word IDs.
This function creates a meta-language model which groups together a set of language models, synchronizing word IDs between them. To use this language model, you can either select a submodel to use exclusively using ngram_model_set_select(), or interpolate between scores from all models. To do the latter, you can either pass a non-NULL value of the weights parameter, or re-activate interpolation later on by calling ngram_model_set_interp().
In order to make this efficient, there are some restrictions on the models that can be grouped together. The most important (and currently the only) one is that they must all share the same log-math parameters.
config | Any configuration parameters to be shared between models. |
models | Array of pointers to previously created language models. |
names | Array of strings to use as unique identifiers for LMs. |
weights | Array of weights to use in interpolating LMs, or NULL for no interpolation. |
n_models | Number of elements in the arrays passed to this function. |
Definition at line 124 of file ngram_model_set.c.
References ngram_model_set_s::base, ckd_calloc, ckd_salloc, ngram_model_set_s::cur, E_ERROR, ngram_model_s::lmath, ngram_model_set_s::lms, logmath_get_base(), logmath_get_shift(), logmath_log(), ngram_model_set_s::lweights, ngram_model_set_s::maphist, ngram_model_s::n, ngram_model_set_s::n_models, ngram_model_set_s::names, and ngram_model_retain().
Referenced by ngram_model_set_read().
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_interp | ( | ngram_model_t * | set, |
const char ** | names, | ||
const float32 * | weights ) |
Set interpolation weights for a set and enables interpolation.
If weights is NULL, any previously initialized set of weights will be used. If no weights were specified to ngram_model_set_init(), then a uniform distribution will be used.
Definition at line 493 of file ngram_model_set.c.
References ngram_model_set_s::cur, E_ERROR, ngram_model_s::lmath, logmath_log(), ngram_model_set_s::lweights, ngram_model_set_s::n_models, and ngram_model_set_s::names.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter | ( | ngram_model_t * | set | ) |
Begin iterating over language models in a set.
Definition at line 375 of file ngram_model_set.c.
References ckd_calloc, and ngram_model_set_s::n_models.
SPHINXBASE_EXPORT void ngram_model_set_iter_free | ( | ngram_model_set_iter_t * | itor | ) |
Finish iteration over a language model set.
Definition at line 398 of file ngram_model_set.c.
References ckd_free().
Referenced by ngram_model_set_iter_next().
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_iter_model | ( | ngram_model_set_iter_t * | itor, |
char const ** | lmname ) |
Get language model and associated name from an iterator.
itor | the iterator |
lmname | Output: string name associated with this language model. |
Definition at line 404 of file ngram_model_set.c.
References ngram_model_set_s::lms, and ngram_model_set_s::names.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter_next | ( | ngram_model_set_iter_t * | itor | ) |
Move to the next language model in a set.
Definition at line 388 of file ngram_model_set.c.
References ngram_model_set_s::n_models, and ngram_model_set_iter_free().
SPHINXBASE_EXPORT int32 ngram_model_set_known_wid | ( | ngram_model_t * | set, |
int32 | set_wid ) |
Test whether a word ID corresponds to a known word in the current state of the language model set.
If there is a current language model, returns non-zero if set_wid corresponds to a known word in that language model. Otherwise, returns non-zero if set_wid corresponds to a known word in any language model.
Definition at line 473 of file ngram_model_set.c.
References ngram_model_set_s::cur, ngram_model_set_s::lms, ngram_model_set_s::n_models, ngram_model_s::n_words, ngram_unknown_wid(), and ngram_model_set_s::widmap.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_lookup | ( | ngram_model_t * | set, |
const char * | name ) |
Look up a language model by name from a set.
Returns the language model corresponding to name, or NULL if no language model by that name exists.
Definition at line 413 of file ngram_model_set.c.
References ngram_model_set_s::cur, ngram_model_set_s::lms, ngram_model_set_s::n_models, and ngram_model_set_s::names.
SPHINXBASE_EXPORT void ngram_model_set_map_words | ( | ngram_model_t * | set, |
const char ** | words, | ||
int32 | n_words ) |
Set the word-to-ID mapping for this model set.
Definition at line 639 of file ngram_model_set.c.
References ckd_calloc, ckd_calloc_2d, ckd_free(), ckd_free_2d(), ckd_salloc, hash_table_empty(), hash_table_enter_int32, ngram_model_set_s::lms, ngram_model_s::n_1g_alloc, ngram_model_set_s::n_models, ngram_model_s::n_words, ngram_wid(), ngram_model_s::wid, ngram_model_set_s::widmap, ngram_model_s::word_str, and ngram_model_s::writable.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_read | ( | cmd_ln_t * | config, |
const char * | lmctlfile, | ||
logmath_t * | lmath ) |
Read a set of language models from a control file.
This function creates a language model set from a "control file" of the type used in Sphinx-II and Sphinx-III. File format (optional stuff is indicated by enclosing in []):
    [{ LMClassFileName LMClassFilename ... }]
    TrigramLMFileName LMName [{ LMClassName LMClassName ... }]
    TrigramLMFileName LMName [{ LMClassName LMClassName ... }]
    ...
(There should be whitespace around the { and } delimiters.)
This is an extension of the older format that had only TrigramLMFileName and LMName pairs. The new format allows a set of LMClass files to be read in and referred to by the trigram LMs.
No "comments" allowed in this file.
config | Configuration parameters. |
lmctlfile | Path to the language model control file. |
lmath | Log-math parameters to use for probability calculations. Ownership of this object is assumed by the newly created ngram_model_t, and you should not attempt to free it manually. If you wish to reuse it elsewhere, you must retain it with logmath_retain(). |
Definition at line 185 of file ngram_model_set.c.
References ckd_calloc, ckd_free(), ckd_salloc, E_ERROR, E_ERROR_SYSTEM, E_INFO, glist_add_ptr(), glist_count(), glist_free(), glist_reverse(), gnode_ptr, hash_table_free(), hash_table_lookup(), hash_table_new(), hash_table_tolist(), NGRAM_AUTO, ngram_model_add_class(), ngram_model_free(), ngram_model_read(), ngram_model_set_init(), path_is_absolute(), string_join(), and hash_entry_s::val.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_remove | ( | ngram_model_t * | set, |
const char * | name, | ||
int | reuse_widmap ) |
Remove a language model from a set.
set | The language model set to remove from. |
name | The name associated with the model to remove. |
reuse_widmap | Reuse the existing word-ID mapping in set. |
Definition at line 583 of file ngram_model_set.c.
References ckd_free(), ngram_model_s::lmath, ngram_model_set_s::lms, ngram_model_s::log_zero, logmath_exp(), logmath_log(), ngram_model_set_s::lweights, ngram_model_s::n, ngram_model_set_s::n_models, ngram_model_s::n_words, ngram_model_set_s::names, and ngram_model_set_s::widmap.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_select | ( | ngram_model_t * | set, |
const char * | name ) |
Select a single language model from a set for scoring.
Definition at line 435 of file ngram_model_set.c.
References ngram_model_set_s::cur, ngram_model_set_s::lms, ngram_model_set_s::n_models, and ngram_model_set_s::names.
SPHINXBASE_EXPORT int ngram_model_write | ( | ngram_model_t * | model, |
const char * | file_name, | ||
ngram_file_type_t | format ) |
Write an N-Gram model to disk.
Definition at line 178 of file ngram_model.c.
References E_ERROR, NGRAM_ARPA, NGRAM_AUTO, NGRAM_BIN, ngram_file_name_to_type(), NGRAM_INVALID, and ngram_model_write().
Referenced by ngram_model_write().
SPHINXBASE_EXPORT int32 ngram_ng_prob | ( | ngram_model_t * | model, |
int32 | wid, | ||
int32 * | history, | ||
int32 | n_hist, | ||
int32 * | n_used ) |
Quick "raw" probability lookup for a general N-Gram.
See documentation for ngram_ng_score() and ngram_model_apply_weights() for an explanation of this.
Definition at line 454 of file ngram_model.c.
References ngram_model_s::classes, ngram_model_s::funcs, ngram_model_s::log_zero, NGRAM_INVALID_WID, ngram_ng_prob(), ngram_funcs_s::raw_score, and ngram_class_s::tag_wid.
Referenced by ngram_ng_prob(), ngram_prob(), and ngram_probv().
SPHINXBASE_EXPORT int32 ngram_ng_score | ( | ngram_model_t * | model, |
int32 | wid, | ||
int32 * | history, | ||
int32 | n_hist, | ||
int32 * | n_used ) |
Quick general N-Gram score lookup.
Definition at line 375 of file ngram_model.c.
References ngram_model_s::classes, ngram_model_s::funcs, ngram_model_s::log_zero, NGRAM_INVALID_WID, ngram_ng_score(), ngram_funcs_s::score, and ngram_class_s::tag_wid.
Referenced by ngram_bg_score(), ngram_ng_score(), ngram_score(), and ngram_tg_score().
SPHINXBASE_EXPORT int32 ngram_prob | ( | ngram_model_t * | model, |
const char *const * | words, | ||
int32 | n ) |
Get the "raw" log-probability for a general N-Gram.
This returns the log-probability of an N-Gram, as defined in the language model file, before any language weighting, interpolation, or insertion penalty has been applied.
Definition at line 517 of file ngram_model.c.
References ckd_calloc, ckd_free(), ngram_ng_prob(), ngram_prob(), and ngram_wid().
Referenced by ngram_prob().
SPHINXBASE_EXPORT int32 ngram_probv | ( | ngram_model_t * | model, |
const char * | word, | ||
... ) |
Get the "raw" log-probability for a general N-Gram.
This returns the log-probability of an N-Gram, as defined in the language model file, before any language weighting, interpolation, or insertion penalty has been applied.
Definition at line 486 of file ngram_model.c.
References ckd_calloc, ckd_free(), ngram_ng_prob(), ngram_probv(), and ngram_wid().
Referenced by ngram_probv().
SPHINXBASE_EXPORT int32 ngram_score | ( | ngram_model_t * | model, |
const char * | word, | ||
... ) |
Get the score (scaled, interpolated log-probability) for a general N-Gram.
The argument list consists of the history words (as null-terminated strings) of the N-Gram, in reverse order, followed by NULL. Therefore, if you wanted to get the N-Gram score for "a whole joy", you would call:
score = ngram_score(model, "joy", "whole", "a", NULL);
This is not the function to use in decoding, because it has some overhead for looking up words. Use ngram_ng_score(), ngram_tg_score(), or ngram_bg_score() instead. In the future there will probably be a version that takes a general language model state object, to support suffix-array LM and things like that.
If one of the words is not in the LM's vocabulary, the result will depend on whether this is an open or closed vocabulary language model. For an open-vocabulary model, unknown words are all mapped to the unigram <UNK> which has a non-zero probability and also participates in higher-order N-Grams. Therefore, you will get a score of some sort in this case.
For a closed-vocabulary model, unknown words are impossible and thus have zero probability. Therefore, if word is unknown, this function will return a "zero" log-probability, i.e. a large negative number. To obtain this number for comparison, call ngram_zero().
Definition at line 407 of file ngram_model.c.
References ckd_calloc, ckd_free(), ngram_ng_score(), ngram_score(), and ngram_wid().
Referenced by ngram_score().
SPHINXBASE_EXPORT int32 ngram_score_to_prob | ( | ngram_model_t * | model, |
int32 | score ) |
Convert score to "raw" log-probability.
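Note: the unigram weight (interpolation with uniform) is not removed, since there is no way to know which order of N-Gram generated score.
model | The N-Gram model from which score was obtained. |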
score | The N-Gram score to convert |
Definition at line 537 of file ngram_model.c.
References ngram_model_s::log_wip, ngram_model_s::lw, and ngram_score_to_prob().
Referenced by ngram_score_to_prob().
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type | ( | const char * | str_name | ) |
Get the N-Gram file type from a string.
Definition at line 99 of file ngram_model.c.
References NGRAM_ARPA, NGRAM_BIN, NGRAM_INVALID, ngram_str_to_type(), and strcmp_nocase().
Referenced by ngram_str_to_type().
SPHINXBASE_EXPORT int32 ngram_tg_score | ( | ngram_model_t * | model, |
int32 | w3, | ||
int32 | w2, | ||
int32 | w1, | ||
int32 * | n_used ) |
Quick trigram score lookup.
Definition at line 438 of file ngram_model.c.
References ngram_ng_score(), and ngram_tg_score().
Referenced by ngram_tg_score().
SPHINXBASE_EXPORT char const * ngram_type_to_str | ( | int | type | ) |
Get the canonical name for an N-Gram file type.
Definition at line 110 of file ngram_model.c.
References NGRAM_ARPA, NGRAM_BIN, and ngram_type_to_str().
Referenced by ngram_type_to_str().
SPHINXBASE_EXPORT int32 ngram_unknown_wid | ( | ngram_model_t * | model | ) |
Get the unknown word ID for a language model.
Language models can be either "open vocabulary" or "closed vocabulary". The difference is that the former assigns a fixed non-zero unigram probability to unknown words, while the latter does not allow unknown words (or, equivalently, it assigns them zero probability). If this is a closed vocabulary model, this function will return NGRAM_INVALID_WID.
Definition at line 550 of file ngram_model.c.
References hash_table_lookup_int32(), NGRAM_INVALID_WID, ngram_unknown_wid(), and ngram_model_s::wid.
Referenced by ngram_model_add_class(), ngram_model_set_known_wid(), ngram_unknown_wid(), and ngram_wid().
SPHINXBASE_EXPORT int32 ngram_wid | ( | ngram_model_t * | model, |
const char * | word ) |
Look up numerical word ID.
Definition at line 585 of file ngram_model.c.
References hash_table_lookup_int32(), ngram_unknown_wid(), ngram_wid(), and ngram_model_s::wid.
Referenced by ngram_model_add_class(), ngram_model_add_class_word(), ngram_model_set_add(), ngram_model_set_map_words(), ngram_prob(), ngram_probv(), ngram_score(), and ngram_wid().
SPHINXBASE_EXPORT const char * ngram_word | ( | ngram_model_t * | model, |
int32 | wid ) |
Look up word string for numerical word ID.
Definition at line 596 of file ngram_model.c.
References ngram_model_s::n_words, ngram_word(), and ngram_model_s::word_str.
Referenced by ngram_word().
SPHINXBASE_EXPORT int32 ngram_zero | ( | ngram_model_t * | model | ) |
Get the "zero" log-probability value for a language model.
Definition at line 563 of file ngram_model.c.
References ngram_model_s::log_zero, and ngram_zero().
Referenced by ngram_zero().