#include <stdio.h>
#include <logmath.h>
#include <hash_table.h>
#include <cmd_ln.h>
#include "s3types.h"
#include "lmclass.h"
#include "dict.h"
Go to the source code of this file.
Classes | |
| union | lmlog_t |
| Log quantities represented in either floating or integer format. More... | |
| struct | sorted_entry_s |
| struct | sorted_list_t |
| The sorted list used in LM reading. list is a (64K long) array. The first entry is the root of the tree and is created during initialization. More... | |
| struct | ug_t |
| A unigram structure Please see. More... | |
| struct | bg_t |
| A bigram structure. More... | |
| struct | bg32_t |
| A bigram structure which has 32 bits. More... | |
| struct | tg_t |
| A trigram structure. More... | |
| struct | tg32_t |
| A 32 bits version of tg_t. More... | |
| struct | membg_t |
| Management of in-memory bigrams. Not used if all bigrams in memory. More... | |
| struct | membg32_t |
| A 32 bits version of membg_t. More... | |
| struct | tginfo_s |
| struct | tginfo32_s |
| struct | lm_tgcache_entry_t |
| struct | lm_tgcache_entry32_t |
| struct | lm_s |
| struct | lmset_s |
| struct | wordprob_t |
| Generic structure that could be used at any n-gram level. More... | |
Defines | |
| #define | LM_DICTWID_BADMAP -16000 |
| #define | LM_CLASSID_BASE 0x01000000 |
| #define | LM_LEGACY_CONSTANT BAD_S3LMWID |
| #define | LM_SPHINX_CONSTANT BAD_S3LMWID32 |
| #define | LM_CLASSID_TO_CLASS(m, i) ((m)->lmclass[(i)-LM_CLASSID_BASE]) |
| #define | MIN_PROB_F -99.0 |
| #define | LM_ALLOC_BLOCK 16 |
| #define | LM_SUCCESS 1 |
| #define | LM_FAIL 0 |
| #define | LM_NOT_FOUND -1 |
| #define | LM_OFFSET_TOO_LARGE -2 |
| #define | LM_NO_DATA_MARK -3 |
| #define | LM_UNKNOWN_NG -4 |
| #define | LM_BAD_LM_COUNT -5 |
| #define | LM_UNKNOWN_WORDS -6 |
| #define | LM_BAD_BIGRAM -7 |
| #define | LM_BAD_TRIGRAM -8 |
| #define | LM_BAD_QUADGRAM -9 |
| #define | LM_BAD_QUINGRAM -10 |
| #define | LM_BAD_NGRAM -11 |
| #define | LM_TOO_MANY_NGRAM -12 |
| #define | LM_NO_MINUS_1GRAM -13 |
| #define | LM_FILE_NOT_FOUND -14 |
| #define | LM_CANNOT_ALLOCATE -15 |
| #define | LMDMP_VERSIONNULL 0 |
| #define | LMDMP_VERSION_TG_16BIT -1 |
| #define | LMDMP_VERSION_TG_16BIT_V2 -2 |
| #define | LMDMP_VERSION_TG_32BIT -3 |
| #define | LMTXT_VERSION 1000 |
| #define | LMFST_VERSION 1001 |
| #define | LMFORCED_TXT32VERSION 1002 |
| #define | NO_WORD -1 |
| #define | LOG2_BG_SEG_SZ 9 |
| #define | BG_SEG_SZ (1 << (LOG2_BG_SEG_SZ)) |
| #define | LM_TGCACHE_SIZE 100003 |
| #define | lm_lmwid2dictwid(lm, u) ((lm)->ug[u].dictwid) |
| #define | lm_n_ug(lm) ((lm)->n_ug) |
| #define | lm_n_bg(lm) ((lm)->n_bg) |
| #define | lm_n_tg(lm) ((lm)->n_tg) |
| #define | lm_wordstr(lm, u) ((lm)->wordstr[u]) |
| #define | lm_startwid(lm) ((lm)->startlwid) |
| #define | lm_finishwid(lm) ((lm)->finishlwid) |
| #define | lm_access_type(lm) ((lm)->access_type) |
| #define | LM_TGPROB(lm, tgptr) ((lm)->tgprob[(tgptr)->probid].l) |
| #define | LM_BGPROB(lm, bgptr) ((lm)->bgprob[(bgptr)->probid].l) |
| #define | LM_UGPROB(lm, ugptr) ((ugptr)->prob.l) |
| #define | LM_RAWSCORE(lm, score) ((score - (lm)->wip) / ((lm)->lw)) |
| #define | LM_DICTWID(lm, lmwid) ((lm)->ug[(lmwid)].dictwid) |
Typedefs | |
| typedef sorted_entry_s | sorted_entry_t |
| typedef tginfo_s | tginfo_t |
| typedef tginfo32_s | tginfo32_t |
| typedef lm_s | lm_t |
| typedef lmset_s | lmset_t |
Functions | |
| S3DECODER_EXPORT lmset_t * | lmset_init (const char *lmfile, const char *lmctlfile, const char *ctl_lm, const char *lmname, const char *lmdumpdir, float32 lw, float32 wip, float32 uw, dict_t *dict, logmath_t *logmath) |
| lmset_t * | lmset_read_lm (const char *lmfile, dict_t *dict, const char *lmname, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath) |
| lmset_t * | lmset_read_ctl (const char *ctlfile, dict_t *dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath) |
| lm_t * | lmset_get_lm_widx (lmset_t *lms, int32 lmidx) |
| lm_t * | lmset_get_lm_wname (lmset_t *lms, const char *lmname) |
| void | lmset_set_curlm_widx (lmset_t *lms, int32 lmidx) |
| S3DECODER_EXPORT void | lmset_set_curlm_wname (lmset_t *lms, const char *lmname) |
| int32 | lmset_name_to_idx (lmset_t *lms, const char *lmname) |
| char * | lmset_idx_to_name (lmset_t *lms, int32 lmidx) |
| void | lmset_add_lm (lmset_t *lms, lm_t *lm, const char *lmname) |
| void | lmset_delete_lm (lmset_t *lms, const char *lmname) |
| S3DECODER_EXPORT void | lmset_free (lmset_t *lms) |
| int32 | lm_tglist (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg_t **tg, int32 *bowt) |
| int32 | lm_tg32list (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg32_t **tg, int32 *bowt) |
| int32 | lm_bglist (lm_t *lmp, s3lmwid32_t w, bg_t **bg, int32 *bowt) |
| int32 | lm_bg32list (lm_t *lmp, s3lmwid32_t w, bg32_t **bg, int32 *bowt) |
| s3lmwid32_t | lm_wid (lm_t *lm, const char *wd) |
| void | lm_null_struct (lm_t *lm) |
| int32 | lm_ug_wordprob (lm_t *lm, dict_t *dict, int32 th, wordprob_t *wp) |
| int32 | lm_uglist (lm_t *lmp, ug_t **ug) |
| int32 | lm_ug_score (lm_t *lmp, s3lmwid32_t lwid, s3wid_t wid) |
| int32 | lm_ug_exists (lm_t *lm, s3lmwid32_t lwid) |
| int32 | lm_bg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2) |
| int32 | lm_bg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2) |
| int32 | lm_tg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3, s3wid_t w3) |
| int32 | lm_tg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3) |
| void | lm_set_param (lm_t *lm, float64 lw, float64 wip) |
| S3DECODER_EXPORT int32 | lm_rawscore (lm_t *lm, int32 score) |
| S3DECODER_EXPORT void | lm_cache_reset (lm_t *lmp) |
| S3DECODER_EXPORT void | lm_cache_stats_dump (lm_t *lmp) |
| lm_t * | lm_read (const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath) |
| lm_t * | lm_read_advance (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, logmath_t *logmath) |
| S3DECODER_EXPORT lm_t * | lm_read_advance2 (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, int lminmemory, logmath_t *logmath) |
| S3DECODER_EXPORT int32 | lm_write (lm_t *model, const char *outputfile, const char *filename, const char *fmt) |
| int32 | lm_write_advance (lm_t *model, const char *outputfile, const char *filename, const char *fmt, const char *inputenc, char *outputenc) |
| S3DECODER_EXPORT void | lm_free (lm_t *lm) |
| int32 | lm_add_wordlist (lm_t *lm, dict_t *dict, const char *filename) |
| int32 | lm_add_word_to_ug (lm_t *lm, dict_t *dict, const char *newword) |
| int32 | lm_get_classid (lm_t *model, const char *name) |
| void | lm_convert_structure (lm_t *model, int32 is32bits) |
| int32 | lm_is32bits (lm_t *model) |
| void | ug_write (FILE *fp, ug_t *ug) |
| void | bg_write (FILE *fp, bg_t *bg) |
| void | bg32_write (FILE *fp, bg32_t *bg) |
| void | tg_write (FILE *fp, tg_t *tg) |
| void | tg32_write (FILE *fp, tg32_t *tg) |
| void | copy_bg_to_bg32 (lm_t *lm) |
| void | copy_bg32_to_bg (lm_t *lm) |
| void | copy_tg_to_tg32 (lm_t *lm) |
| void | copy_tg32_to_tg (lm_t *lm) |
| void | swap_bg (bg_t *bg) |
| void | swap_bg32 (bg32_t *bg) |
| void | swap_tg (tg_t *tg) |
| void | swap_tg32 (tg32_t *tg) |
| int32 | find_bg (bg_t *bg, int32 n, s3lmwid32_t w) |
| int32 | find_bg32 (bg32_t *bg, int32 n, s3lmwid32_t w) |
| int32 | find_tg (tg_t *tg, int32 n, s3lmwid32_t w) |
| int32 | find_tg32 (tg32_t *tg, int32 n, s3lmwid32_t w) |
| ug_t * | NewUnigramTable (int32 n_ug) |
This is the header file for language model support in Sphinx 3. Sphinx 3 supports language models in 4 formats. The four formats are
ARPA format: First appeared in Sphinx 2. We ported it to Sphinx 3 in 3.X (X=6)
DMP : Sphinx 3 slow and fast used it, and so do the later Sphinx 3.X (X>4) releases
DMP32 : We started to break the limit of 65535 on the number of words. This is the first LM file format in Sphinx 3.X that can capture 4 billion words in the language model
FST: The AT&T format, which we started to support in 3.X (X=6).
As of 20060302 we can only read and use the ARPA and DMP-based formats in the decoder. We can write the ARPA, DMP, DMP32 and FST file formats.
|
|
|
|
|
|
|
|
|
|
|
A bad bigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. |
|
|
When reading LM, if count is bad, return this msg |
|
|
(RESERVED BUT NOT USED) A bad n-gram. generalization of message -7 to -10. In our case, we don't make the message as specific as possible. |
|
|
(RESERVED BUT NOT USED) A bad quadgram (4-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. |
|
|
(RESERVED BUT NOT USED) A bad quingram (5-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. BTW, there is no need to remind me the mixed use of quadgram and quingram is stupid English. I read Manning and Schultze. |
|
|
A bad trigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. |
|
|
|
|
|
When cannot allocate tables in LM return this message |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Constant that define an operation failed. |
|
|
When couldn't find the LM file, return this message |
|
|
|
|
|
=65535 (~65k). This was introduced in 1996 when Ravi first wrote Sphinx 3.0. It has been with us ever since. |
|
|
Access macros; not meant for arbitrary use |
|
|
|
|
|
|
|
|
|
|
|
When reading a text-based LM, return this if we see no data mark |
|
|
When reading an n-gram, if the corresponding (n-1)-gram doesn't exist, return this message. |
|
|
Constant which indicate an LM couldn't be found |
|
|
Constant where the 16 bit LM was used, but the tgcount is larger than LM_LEGACY_CONSTANT (65535). This breaks the addressing scheme in the current LM. |
|
|
|
|
|
(4 billion), ARCHAN: this was introduced in Sphinx 3.6 during the time of Release Candidate I (2006 March). The caveat of using this constant is that it is much harder to detect byte-swapping problems in general. Also, if the world has more than 10000 cities, each with 1 million road names, we are stuck in this case. I assume this will happen in year 3001. |
|
|
|
|
|
Constant that indicates an operation succeed |
|
|
|
|
|
|
|
|
When reading LM, if the number of n-grams is more than the number specified in the header, return this message |
|
|
|
|
|
When reading the header of LM, if there is unknown K for K-gram |
|
|
When an unknown word is found during LM reading, return this message |
|
|
|
|
|
VERSION 1 is the simplest DMP file which is trigram or lower which used 16 bits in bigram and trigram. |
|
|
VERSION 2 means legacy VERSION 1 DMP file which has log_bg_seg_sz != 9 |
|
|
VERSION 3 is the 32 bit extension of VERSION 1 but the bigram and trigram are represented by 32 bits data structure |
|
|
VERSION 0 is oldest, in the past, we used to use the version number to store the number of unigram, you will see logic that said vn > LMDMP_VERSIONNULL |
|
|
VERSION 1002 is the internal version of text-based LM. The difference between 1002 and 1000 is that 1002 will assume the LM is 32bits. This fact is used in lm_is32bits(lm) |
|
|
VERSION 1001 is the FST-based LM |
|
|
VERSION 1000 is the text-based LM |
|
|
|
|
|
The minimum value of probabilities and backoff weights. When changing, notice that both s2 and s3 may transform this number to very small integer (say -2e-31) This will easily cause integer wrap around. -99 is chosen for that reason. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
||||||||||||
|
Write of BG (32bits) structure
|
|
||||||||||||
|
Write of BG structure
|
|
|
Convert the 32 bit bigram structure to 16 bit
|
|
|
Convert the 16 bit bigram structure to 32 bit
|
|
|
Convert the 32 bit trigram structure to 16 bit
|
|
|
Convert the 16 bit trigram structure to 32 bit
|
|
||||||||||||||||
|
|
|
||||||||||||||||
|
|
|
||||||||||||||||
|
|
|
||||||||||||||||
|
|
|
||||||||||||||||
|
Add a word to the LM. Look up the dictionary and see whether the word exists in the dictionary. The logic looks similar to wid.c's at this point. (Incomplete!) Not fully tested in the situation of on-line recognition. We also avoid the addition of classes at this point because that could complicate things quite a lot.
|
|
||||||||||||||||
|
Add a word list to the LM. For each word in the file, call lm_add_wordlist. The file is assumed to have a format like this: <word1> <word2> <word3> <word4> If the lmwid2dictid mapping is not updated, or the dictionary itself is not used in the context, just specify dict=NULL.
|
|
||||||||||||||||||||
|
|
|
||||||||||||||||
|
Whether a certain bigram exists.
|
|
||||||||||||||||||||
|
|
|
||||||||||||||||||||
|
Return the bigram followers for the given word w. Return value: #bigrams in returned list.
|
|
|
LM cache related
|
|
|
LM cache statistic dumping
|
|
||||||||||||
|
Explicitly convert structure from 16bit -> 32bit or 32bit to 16bit.
|
|
|
Deallocate the language model.
|
|
||||||||||||
|
Get class ID given a LM.
|
|
|
Check whether the model is operating at 32 bits |
|
|
Set all pointers to NULL in the lm |
|
||||||||||||
|
|
|
||||||||||||||||||||
|
A simple version of reading in an LM. lm_read is a simple version of lm_read_advance. It will assume language weight, word insertion penalty and unigram weight to be automatically applied. There is also no class-based LM (so ndict=0). Format is set to NULL, so the program will determine it automatically.
|
|
||||||||||||||||||||||||||||||||||||||||
|
Read an LM file. It will automatically decide whether the file is a DUMP file or a txt file, then call lm_read_txt or lm_read_dump (non-public functions) correspondingly. Currently the code is not aware of OOV. lw, wip, uw and ndict are mainly used for recognition purposes. When lm_read is used for other purposes, one could just use dummy settings. The recommended ones are lw=1.0, wip=0.1, uw=1.0 and ndict=0. These are very useful when lm_read is just used for reading the LM. If applyweight is 0, lw, wip and uw will not be applied to the LM at all. This will allow users to just call the LM routine without initializing other modules (such as logs3_init). If applyweight is 1, then logs3_init must be called before lm_read. This is usually the case when kb_init is called before the code. fmt can now be "TXT", "DMP", "TXT32" or just NULL. If it is NULL, the LM format will be automatically determined. If it is specified as "TXT" or "DMP", the corresponding lm reader will be called. In such a case, it is important for the users to know what they are doing. (Unfortunately, this is mostly not true. ) In the case of "TXT32", a text LM will be forced to 32bit mode. ndict is the dictionary size of the application. This is needed because class-based LMs are addressed in the dictionary wid-space instead of the lm wid-space. If a class-based LM is not used, just set this to zero. Note: there are two defense mechanisms in lm_read_advance. First of all, if no fmt is specified, it will try to read the lm in the order of DMP->TXT. Second, if txt format is specified but the LM is found to hit the 16bit legacy segments limit, it will automatically switch to reading a TXT32 LM
|
|
||||||||||||||||||||||||||||||||||||||||||||
|
|
|
||||||||||||||||
|
Set the language-weight and insertion penalty parameters for the LM, after revoking any earlier set of such parameters. WARNING!! This function doesn't prevent underflow of values. Make sure you call safe lm2logs3 before it.
|
|
||||||||||||||||||||||||
|
|
|
||||||||||||||||||||
|
Whether a certain trigram exists.
|
|
||||||||||||||||||||||||
|
Return trigram score for the given three word sequence. If w1 is BAD_LMWID(lm), return lm_bg_score (w2, w3). If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3). 20040227: This also accounts for the in-class probability of w3.
|
|
||||||||||||||||||||||||
|
Return trigram followers for given two words. Both w1 and w2 must be valid. Return value: #trigrams in returned list.
|
|
||||||||||||
|
|
|
||||||||||||||||
|
Return unigram score for the given word
|
|
||||||||||||||||||||
|
Like lm_bg_wordprob, but for unigrams. Return value: #entries filled in the wordprob array.
|
|
||||||||||||
|
Return the unigrams in LM. Return value: #unigrams in returned list.
|
|
||||||||||||
|
|
|
||||||||||||||||||||
|
Simple writing of an LM file. The input and output encodings are assumed to be iso8859-1. Calls lm_write_advance. To convert encoding, please use lm_write_advance.
|
|
||||||||||||||||||||||||||||
|
Writing of an LM file with advanced options such as encoding support. Called by lm_write. fmt can now be TXT, DMP or FST. inputenc and outputenc can now be iso8859-1, gb2312-hex or gb2312. Not every pair of conversions works. Current input/output encoding support list:
0: iso8859-1
1: gb2312-hex
2: gb2312
-: do nothing
n: doesn't make sense or not compatible
x: not supported yet
y: supported
i 0 1 2
0 - n n
1 n - y
2 n x -
When we have 4 encoding types, this document should be implemented as a data structure. This conversion table is copied from encoding.c; please take a look at the latest support in encoding.c
|
|
||||||||||||||||
|
Add a new lm into the lmset. Notice that lms->n_lm will be incremented by 1
|
|
||||||||||||
|
Delete an LM with lmname. Notice that lms->n_lm will be decremented by 1
|
|
|
Free the lmset data structure
|
|
||||||||||||
|
Get an LM by index.
|
|
||||||||||||
|
Get an LM by name
|
|
||||||||||||
|
Convert index to name
|
|
||||||||||||||||||||||||||||||||||||||||||||
|
A wrapper function for controlling the behavior of LM initialization (ARCHAN 20050617). lmset_init controls how the lmset, which is an array of lm, is initialized by different command-line arguments. lmfile and lmctlfile are mutually exclusive. Each will invoke one reading function. In the case that -lmfile is specified, an lmset with one single lm (or lmset->n_lm=1) will be returned. The single lm's name will be lmname. In the case that -lmctlfile is specified, an lmset with multiple lms will be returned. The number of lms will depend on the number of lms specified by -lmctlfile. For the format, please read the current format of -lmctlfile in lm.c. ctl_lm is the equivalent of -ctl for lm. When -ctl_lm is not specified on the command line (ctl_lm is NULL), the lm with name lmname will be used as the default lm. If lmname is NULL, then the first lm will be named "default". lmdumpdir is currently not used. It is there for backward compatibility purposes. lw, wip and uw are language weight, word insertion penalty and unigram weight. Their values are crucial to the computation of the language model score. Therefore, the programmer is urged to carefully set these three values and also be careful of the order. dict is assumed to be a pre-initialized dict_t structure which is used in deriving the mapping between the dictionary words and the lm words. ARCHAN 20050711: -lminmemory is the only global variable that controls the code and we haven't explicitly specified it. Currently, if the LM is DMP, either -lminmemory=0 or -lminmemory=1 could be used. If the LM is txt-based, only -lminmemory=1 is accepted. (This will be changed in future.) ARCHAN 20050705: A survival guide for this part of the code. Our language model code is unnecessarily complicated, and this is mainly caused by the fact that the ways we specify class-based LMs and multiple LMs are inter-dependent. For example, one could specify a multiple LMs file (i.e. lmctlfile) and have no classes. 
However, if one would like to specify class information even with a single LM, one needs to use a multiple LM file format (i.e. lmctlfile). This difficulty was well-observed in the period of Sphinx 3.4-3.6. That might imply that a new LM format is needed if we want to sustain this part of the development.
|
|
||||||||||||
|
Convert name to index
|
|
||||||||||||||||||||||||||||||||
|
Read the LM control file. **Usually**, it is also a class-based LM.
|
|
||||||||||||||||||||||||||||||||||||
|
Read a single LM into the lmset.
|
|
||||||||||||
|
Set the current LM with index
|
|
||||||||||||
|
Set the current LM with name
|
|
|
Create a new unigram table
|
|
|
Swap 16 bits bigram |
|
|
Swap 32 bits bigram |
|
|
Swap 16 bits trigram |
|
|
Swap 32 bits trigram |
|
||||||||||||
|
Write of TG (32bits) structure
|
|
||||||||||||
|
Write of TG structure
|
|
||||||||||||
|
Write of UG structure
|
1.3.9.1