Main Page | Class List | Directories | File List | Class Members | File Members

lm.h File Reference

Language model. More...

#include <stdio.h>
#include <logmath.h>
#include <hash_table.h>
#include <cmd_ln.h>
#include "s3types.h"
#include "lmclass.h"
#include "dict.h"

Go to the source code of this file.

Classes

union  lmlog_t
 Log quantities represented in either floating or integer format. More...
struct  sorted_entry_s
struct  sorted_list_t
 The sorted list used in LM reading. list is a (64K long) array. The first entry is the root of the tree and is created during initialization. More...
struct  ug_t
 A unigram structure. More...
struct  bg_t
 A bigram structure. More...
struct  bg32_t
 A bigram structure which has 32 bits. More...
struct  tg_t
 A trigram structure. More...
struct  tg32_t
 A 32 bits version of tg_t. More...
struct  membg_t
 Management of in-memory bigrams. Not used if all bigrams in memory. More...
struct  membg32_t
 A 32 bits version of membg_t. More...
struct  tginfo_s
struct  tginfo32_s
struct  lm_tgcache_entry_t
struct  lm_tgcache_entry32_t
struct  lm_s
struct  lmset_s
struct  wordprob_t
 Generic structure that could be used at any n-gram level. More...

Defines

#define LM_DICTWID_BADMAP   -16000
#define LM_CLASSID_BASE   0x01000000
#define LM_LEGACY_CONSTANT   BAD_S3LMWID
#define LM_SPHINX_CONSTANT   BAD_S3LMWID32
#define LM_CLASSID_TO_CLASS(m, i)   ((m)->lmclass[(i)-LM_CLASSID_BASE])
#define MIN_PROB_F   -99.0
#define LM_ALLOC_BLOCK   16
#define LM_SUCCESS   1
#define LM_FAIL   0
#define LM_NOT_FOUND   -1
#define LM_OFFSET_TOO_LARGE   -2
#define LM_NO_DATA_MARK   -3
#define LM_UNKNOWN_NG   -4
#define LM_BAD_LM_COUNT   -5
#define LM_UNKNOWN_WORDS   -6
#define LM_BAD_BIGRAM   -7
#define LM_BAD_TRIGRAM   -8
#define LM_BAD_QUADGRAM   -9
#define LM_BAD_QUINGRAM   -10
#define LM_BAD_NGRAM   -11
#define LM_TOO_MANY_NGRAM   -12
#define LM_NO_MINUS_1GRAM   -13
#define LM_FILE_NOT_FOUND   -14
#define LM_CANNOT_ALLOCATE   -15
#define LMDMP_VERSIONNULL   0
#define LMDMP_VERSION_TG_16BIT   -1
#define LMDMP_VERSION_TG_16BIT_V2   -2
#define LMDMP_VERSION_TG_32BIT   -3
#define LMTXT_VERSION   1000
#define LMFST_VERSION   1001
#define LMFORCED_TXT32VERSION   1002
#define NO_WORD   -1
#define LOG2_BG_SEG_SZ   9
#define BG_SEG_SZ   (1 << (LOG2_BG_SEG_SZ))
#define LM_TGCACHE_SIZE   100003
#define lm_lmwid2dictwid(lm, u)   ((lm)->ug[u].dictwid)
#define lm_n_ug(lm)   ((lm)->n_ug)
#define lm_n_bg(lm)   ((lm)->n_bg)
#define lm_n_tg(lm)   ((lm)->n_tg)
#define lm_wordstr(lm, u)   ((lm)->wordstr[u])
#define lm_startwid(lm)   ((lm)->startlwid)
#define lm_finishwid(lm)   ((lm)->finishlwid)
#define lm_access_type(lm)   ((lm)->access_type)
#define LM_TGPROB(lm, tgptr)   ((lm)->tgprob[(tgptr)->probid].l)
#define LM_BGPROB(lm, bgptr)   ((lm)->bgprob[(bgptr)->probid].l)
#define LM_UGPROB(lm, ugptr)   ((ugptr)->prob.l)
#define LM_RAWSCORE(lm, score)   ((score - (lm)->wip) / ((lm)->lw))
#define LM_DICTWID(lm, lmwid)   ((lm)->ug[(lmwid)].dictwid)

Typedefs

typedef struct sorted_entry_s sorted_entry_t
typedef struct tginfo_s tginfo_t
typedef struct tginfo32_s tginfo32_t
typedef struct lm_s lm_t
typedef struct lmset_s lmset_t

Functions

S3DECODER_EXPORT lmset_t * lmset_init (const char *lmfile, const char *lmctlfile, const char *ctl_lm, const char *lmname, const char *lmdumpdir, float32 lw, float32 wip, float32 uw, dict_t *dict, logmath_t *logmath)
lmset_t * lmset_read_lm (const char *lmfile, dict_t *dict, const char *lmname, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
lmset_t * lmset_read_ctl (const char *ctlfile, dict_t *dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
lm_t * lmset_get_lm_widx (lmset_t *lms, int32 lmidx)
lm_t * lmset_get_lm_wname (lmset_t *lms, const char *lmname)
void lmset_set_curlm_widx (lmset_t *lms, int32 lmidx)
S3DECODER_EXPORT void lmset_set_curlm_wname (lmset_t *lms, const char *lmname)
int32 lmset_name_to_idx (lmset_t *lms, const char *lmname)
char * lmset_idx_to_name (lmset_t *lms, int32 lmidx)
void lmset_add_lm (lmset_t *lms, lm_t *lm, const char *lmname)
void lmset_delete_lm (lmset_t *lms, const char *lmname)
S3DECODER_EXPORT void lmset_free (lmset_t *lms)
int32 lm_tglist (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg_t **tg, int32 *bowt)
int32 lm_tg32list (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg32_t **tg, int32 *bowt)
int32 lm_bglist (lm_t *lmp, s3lmwid32_t w, bg_t **bg, int32 *bowt)
int32 lm_bg32list (lm_t *lmp, s3lmwid32_t w, bg32_t **bg, int32 *bowt)
s3lmwid32_t lm_wid (lm_t *lm, const char *wd)
void lm_null_struct (lm_t *lm)
int32 lm_ug_wordprob (lm_t *lm, dict_t *dict, int32 th, wordprob_t *wp)
int32 lm_uglist (lm_t *lmp, ug_t **ug)
int32 lm_ug_score (lm_t *lmp, s3lmwid32_t lwid, s3wid_t wid)
int32 lm_ug_exists (lm_t *lm, s3lmwid32_t lwid)
int32 lm_bg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2)
int32 lm_bg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
int32 lm_tg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3, s3wid_t w3)
int32 lm_tg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3)
void lm_set_param (lm_t *lm, float64 lw, float64 wip)
S3DECODER_EXPORT int32 lm_rawscore (lm_t *lm, int32 score)
S3DECODER_EXPORT void lm_cache_reset (lm_t *lmp)
S3DECODER_EXPORT void lm_cache_stats_dump (lm_t *lmp)
lm_t * lm_read (const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath)
lm_t * lm_read_advance (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, logmath_t *logmath)
S3DECODER_EXPORT lm_t * lm_read_advance2 (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, int lminmemory, logmath_t *logmath)
S3DECODER_EXPORT int32 lm_write (lm_t *model, const char *outputfile, const char *filename, const char *fmt)
int32 lm_write_advance (lm_t *model, const char *outputfile, const char *filename, const char *fmt, const char *inputenc, char *outputenc)
S3DECODER_EXPORT void lm_free (lm_t *lm)
int32 lm_add_wordlist (lm_t *lm, dict_t *dict, const char *filename)
int32 lm_add_word_to_ug (lm_t *lm, dict_t *dict, const char *newword)
int32 lm_get_classid (lm_t *model, const char *name)
void lm_convert_structure (lm_t *model, int32 is32bits)
int32 lm_is32bits (lm_t *model)
void ug_write (FILE *fp, ug_t *ug)
void bg_write (FILE *fp, bg_t *bg)
void bg32_write (FILE *fp, bg32_t *bg)
void tg_write (FILE *fp, tg_t *tg)
void tg32_write (FILE *fp, tg32_t *tg)
void copy_bg_to_bg32 (lm_t *lm)
void copy_bg32_to_bg (lm_t *lm)
void copy_tg_to_tg32 (lm_t *lm)
void copy_tg32_to_tg (lm_t *lm)
void swap_bg (bg_t *bg)
void swap_bg32 (bg32_t *bg)
void swap_tg (tg_t *tg)
void swap_tg32 (tg32_t *tg)
int32 find_bg (bg_t *bg, int32 n, s3lmwid32_t w)
int32 find_bg32 (bg32_t *bg, int32 n, s3lmwid32_t w)
int32 find_tg (tg_t *tg, int32 n, s3lmwid32_t w)
int32 find_tg32 (tg32_t *tg, int32 n, s3lmwid32_t w)
ug_t * NewUnigramTable (int32 n_ug)


Detailed Description

Language model.

This is the header file for language model support in Sphinx 3. Sphinx 3 supports language models in four formats:

ARPA format: First appeared in Sphinx 2. It was ported to Sphinx 3 in 3.X (X=6).

DMP: Used by the Sphinx 3 slow and fast decoders, and by later Sphinx 3.X (X>4) releases.

DMP32: This format breaks the 65535-word limit. It is the first LM file format in Sphinx 3.X that can hold 4 billion words in the language model.

FST: The AT&T format, supported starting in 3.X (X=6).

As of 20060302, the decoder can only read and use the ARPA and DMP-based formats. We can write the ARPA, DMP, DMP32 and FST file formats.


Define Documentation

#define BG_SEG_SZ   (1 << (LOG2_BG_SEG_SZ))
 

#define lm_access_type(lm)   ((lm)->access_type)
 

#define LM_ALLOC_BLOCK   16
 

#define LM_BAD_BIGRAM   -7
 

A bad bigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.

#define LM_BAD_LM_COUNT   -5
 

When reading LM, if count is bad, return this msg

#define LM_BAD_NGRAM   -11
 

(RESERVED BUT NOT USED) A bad n-gram. generalization of message -7 to -10. In our case, we don't make the message as specific as possible.

#define LM_BAD_QUADGRAM   -9
 

(RESERVED BUT NOT USED) A bad quadgram (4-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.

#define LM_BAD_QUINGRAM   -10
 

(RESERVED BUT NOT USED) A bad quingram (5-gram), it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound. BTW, there is no need to remind me the mixed use of quadgram and quingram is stupid English. I read Manning and Schultze.

#define LM_BAD_TRIGRAM   -8
 

A bad trigram, it could be word ids larger than # of unigram, it could be word id smaller than 0. It could also be bigram out of bound.

#define LM_BGPROB(lm, bgptr)   ((lm)->bgprob[(bgptr)->probid].l)
 

#define LM_CANNOT_ALLOCATE   -15
 

When cannot allocate tables in LM return this message

#define LM_CLASSID_BASE   0x01000000
 

#define LM_CLASSID_TO_CLASS(m, i)   ((m)->lmclass[(i)-LM_CLASSID_BASE])
 

#define LM_DICTWID(lm, lmwid)   ((lm)->ug[(lmwid)].dictwid)
 

#define LM_DICTWID_BADMAP   -16000
 

#define LM_FAIL   0
 

Constant that indicates an operation failed.

#define LM_FILE_NOT_FOUND   -14
 

When couldn't find the LM file, return this message

#define lm_finishwid(lm)   ((lm)->finishlwid)
 

#define LM_LEGACY_CONSTANT   BAD_S3LMWID
 

=65535 (~65k). This was introduced in 1996 when Ravi first wrote Sphinx 3.0, and it has been with us ever since.

#define lm_lmwid2dictwid(lm, u)   ((lm)->ug[u].dictwid)
 

Access macros; not meant for arbitrary use

#define lm_n_bg(lm)   ((lm)->n_bg)
 

#define lm_n_tg(lm)   ((lm)->n_tg)
 

#define lm_n_ug(lm)   ((lm)->n_ug)
 

#define LM_NO_DATA_MARK   -3
 

When reading a text-based LM, return this if we see no data mark

#define LM_NO_MINUS_1GRAM   -13
 

When reading an n-gram, if the corresponding (n-1)-gram doesn't exist, return this message.

#define LM_NOT_FOUND   -1
 

Constant which indicates that an LM couldn't be found

#define LM_OFFSET_TOO_LARGE   -2
 

Constant returned when a 16 bit LM is used but the tgcount is larger than LM_LEGACY_CONSTANT (65535). This breaks the addressing scheme of the current LM.

#define LM_RAWSCORE(lm, score)   ((score - (lm)->wip) / ((lm)->lw))
 

#define LM_SPHINX_CONSTANT   BAD_S3LMWID32
 

(4 billion). ARCHAN: this was introduced in Sphinx 3.6 around the time of Release Candidate I (March 2006). The caveat of using this constant is that it makes byte-swapping problems much harder to detect in general. Also, if the world has more than 10000 cities, each with 1 million road names, we are stuck. I assume this will happen in the year 3001.

#define lm_startwid(lm)   ((lm)->startlwid)
 

#define LM_SUCCESS   1
 

Constant that indicates an operation succeeded

#define LM_TGCACHE_SIZE   100003
 

#define LM_TGPROB(lm, tgptr)   ((lm)->tgprob[(tgptr)->probid].l)
 

#define LM_TOO_MANY_NGRAM   -12
 

When reading an LM, if the number of n-grams is more than the number specified in the header, return this message

#define LM_UGPROB(lm, ugptr)   ((ugptr)->prob.l)
 

#define LM_UNKNOWN_NG   -4
 

When reading the header of LM, if there is unknown K for K-gram

#define LM_UNKNOWN_WORDS   -6
 

When an unknown word is found during LM reading, return this message

#define lm_wordstr(lm, u)   ((lm)->wordstr[u])
 

#define LMDMP_VERSION_TG_16BIT   -1
 

VERSION 1 is the simplest DMP file which is trigram or lower which used 16 bits in bigram and trigram.

#define LMDMP_VERSION_TG_16BIT_V2   -2
 

VERSION 2 means legacy VERSION 1 DMP file which has log_bg_seg_sz != 9

#define LMDMP_VERSION_TG_32BIT   -3
 

VERSION 3 is the 32 bit extension of VERSION 1 but the bigram and trigram are represented by 32 bits data structure

#define LMDMP_VERSIONNULL   0
 

VERSION 0 is the oldest. In the past, the version number was used to store the number of unigrams, so you will see logic that says vn > LMDMP_VERSIONNULL

#define LMFORCED_TXT32VERSION   1002
 

VERSION 1002 is the internal version of the text-based LM. The difference between 1002 and 1000 is that 1002 assumes the LM is 32 bits. This fact is used in lm_is32bits(lm)

#define LMFST_VERSION   1001
 

VERSION 1001 is the FST-based LM

#define LMTXT_VERSION   1000
 

VERSION 1000 is the text-based LM

#define LOG2_BG_SEG_SZ   9
 

#define MIN_PROB_F   -99.0
 

The minimum value of probabilities and backoff weights. When changing, notice that both s2 and s3 may transform this number to very small integer (say -2e-31) This will easily cause integer wrap around. -99 is chosen for that reason.

#define NO_WORD   -1
 


Typedef Documentation

typedef struct lm_s lm_t
 

typedef struct lmset_s lmset_t
 

typedef struct sorted_entry_s sorted_entry_t
 

typedef struct tginfo32_s tginfo32_t
 

typedef struct tginfo_s tginfo_t
 


Function Documentation

void bg32_write (FILE *fp, bg32_t *bg)
 

Write of BG (32bits) structure

Parameters:
fp  A file pointer
bg  A pointer of the bg32_t structure

void bg_write (FILE *fp, bg_t *bg)
 

Write of BG structure

Parameters:
fp  A file pointer
bg  A pointer of the bg_t structure

void copy_bg32_to_bg (lm_t *lm)
 

Convert the 32 bit bigram structure to 16 bit

Parameters:
lm  LM

void copy_bg_to_bg32 (lm_t *lm)
 

Convert the 16 bit bigram structure to 32 bit

Parameters:
lm  LM

void copy_tg32_to_tg (lm_t *lm)
 

Convert the 32 bit trigram structure to 16 bit

Parameters:
lm  LM

void copy_tg_to_tg32 (lm_t *lm)
 

Convert the 16 bit trigram structure to 32 bit

Parameters:
lm  LM

int32 find_bg (bg_t *bg, int32 n, s3lmwid32_t w)
 

Parameters:
bg  In: The bigram

int32 find_bg32 (bg32_t *bg, int32 n, s3lmwid32_t w)
 

Parameters:
bg  In: The bigram

int32 find_tg (tg_t *tg, int32 n, s3lmwid32_t w)
 

Parameters:
tg  In: The trigram

int32 find_tg32 (tg32_t *tg, int32 n, s3lmwid32_t w)
 

Parameters:
tg  In: The trigram

int32 lm_add_word_to_ug (lm_t *lm, dict_t *dict, const char *newword)
 

Add a word to the LM

Looks up the dictionary to see whether the word exists in it. The logic is similar to that of wid.c at this point.

(Incomplete!) Not fully tested in the situation for on-line recognition.

We also avoid the addition of classes at this point because that could complicate things quite a lot.

Parameters:
lm  In/Out: a modified LM structure
dict  In: an initialized dictionary structure Used to update lmwid2dictid mapping.
newword  In: a pointer of a new word

int32 lm_add_wordlist (lm_t *lm, dict_t *dict, const char *filename)
 

Add a word list to the LM. For each word in the file, call lm_add_word_to_ug. The file is assumed to have a format like this: <word1> <word2> <word3> <word4>

If the lmwid2dictid mapping is not updated, or the dictionary itself is not used in the context. Just specify dict=NULL;

Parameters:
lm  In/Out: a modified LM structure
dict  In: an initialized dictionary structure. Used to update the lmwid2dictid mapping.
filename  In: a file that contains a list of word one wants to add

int32 lm_bg32list (lm_t *lmp, s3lmwid32_t w, bg32_t **bg, int32 *bowt)
 

Parameters:
lmp  In: LM being queried
w  In: LM word id of the 1-word history
bg  Out: *bg = array of bigrams for w
bowt  Out: *bowt = backoff-weight for w

int32 lm_bg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
 

Whether a certain bigram exists.

Parameters:
lm  In: LM

int32 lm_bg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2)
 

Parameters:
lmp  In: LM being queried

int32 lm_bglist (lm_t *lmp, s3lmwid32_t w, bg_t **bg, int32 *bowt)
 

Return the bigram followers for the given word w. Return value: #bigrams in returned list.

Parameters:
lmp  In: LM being queried
w  In: LM word id of the 1-word history
bg  Out: *bg = array of bigrams for w
bowt  Out: *bowt = backoff-weight for w

S3DECODER_EXPORT void lm_cache_reset (lm_t *lmp)
 

LM cache related

Parameters:
lmp  In: the LM

S3DECODER_EXPORT void lm_cache_stats_dump (lm_t *lmp)
 

LM cache statistic dumping

Parameters:
lmp  In: the LM

void lm_convert_structure (lm_t *model, int32 is32bits)
 

Explicitly convert the structure from 16 bit to 32 bit, or from 32 bit to 16 bit.

Parameters:
model  In: LM file being used

S3DECODER_EXPORT void lm_free (lm_t *lm)
 

Deallocate the language model.

Parameters:
lm  In: a LM structure

int32 lm_get_classid (lm_t *model, const char *name)
 

Get class ID given a LM.

Parameters:
model  In: LM file being queried
name  In: The name of the class

int32 lm_is32bits (lm_t *model)
 

Check whether the model is operating at 32 bits

void lm_null_struct (lm_t *lm)
 

Set all pointers to NULL in the lm

S3DECODER_EXPORT int32 lm_rawscore (lm_t *lm, int32 score)
 

Parameters:
lm  In: the LM

lm_t * lm_read (const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath)
 

A simple version of reading in a LM

lm_read is a simple version of lm_read_advance. It will assume language weight, word insertion penalty and unigram weight to be automatically applied. There is also no class-based LM (so ndict=0). Format is set to NULL, so the program will determine it automatically.

Parameters:
file  In: LM file being read
lmname  In: LM name

lm_t * lm_read_advance (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, logmath_t *logmath)
 

Read an LM file, it will automatically decide whether the file is a DUMP file or a txt file. Then call lm_read_txt and lm_read_dump (non-public functions) correspondingly. Currently the code is not aware about OOV.

lw, wip, uw and ndict are mainly used for recognition purposes. When lm_read is used for other purposes, one can just use dummy settings. The recommended ones are lw=1.0, wip=0.1, uw=1.0 and ndict=0. These are very useful when lm_read is used just for reading the LM.

If applyweight is 0, lw, wip and uw will not be applied to the LM at all. This allows users to just call the LM routine without initializing other modules (such as logs3_init).

If applyweight is 1, then logs3_init must be called before lm_read. This is usually the case when kb_init is called before the code.

fmt can now be "TXT", "DMP", "TXT32" or just NULL. If it is NULL, the LM format will be automatically determined. If it is specified as "TXT" or "DMP", the corresponding LM reader will be called. In such a case, it is important for the users to know what they are doing. (Unfortunately, this is mostly not true.) In the case of "TXT32", a text LM will be forced into 32 bit mode.

ndict is the dictionary size of the application. This is needed because class-based LM are addressed in the dictionary wid-space instead of lm wid-space. If class-based LM is not used, just set this to zero.

Note: there are two defense mechanisms of lm_read_advance. First of all, if no fmt is specified, it will start to read the lm in the order of DMP->TXT. Second, if txt format is specified but LM is found to hit the 16bit legacy segments limit, it will automatically switch to read TXT32 LM

Returns:
pointer to LM structure created.
Parameters:
file  In: LM file being read
lmname  In: LM name
lw  In: Language weight
wip  In: Word insertion penalty
uw  In: Unigram weight (interpolation with uniform distr.)
ndict  In: Number of dictionary entry. We need that because class-based LM is addressed in dictionary word ID space.
fmt  In: file format of the LM; it is now either "TXT", "DMP" or NULL. If NULL, the file format is automatically determined
applyweight  In: whether lw,wip, uw should be applied to the lm or not

S3DECODER_EXPORT lm_t * lm_read_advance2 (const char *file, const char *lmname, float64 lw, float64 wip, float64 uw, int32 ndict, const char *fmt, int32 applyweight, int lminmemory, logmath_t *logmath)
 

Parameters:
file  In: LM file being read
lmname  In: LM name
lw  In: Language weight
wip  In: Word insertion penalty
uw  In: Unigram weight (interpolation with uniform distr.)
ndict  In: Number of dictionary entry. We need that because class-based LM is addressed in dictionary word ID space.
fmt  In: file format of the LM; it is now either "TXT", "DMP" or NULL. If NULL, the file format is automatically determined
applyweight  In: whether lw,wip, uw should be applied to the lm or not
lminmemory  In: Whether LM is read into memory

void lm_set_param (lm_t *lm, float64 lw, float64 wip)
 

Set the language-weight and insertion penalty parameters for the LM, after revoking any earlier set of such parameters.

WARNING!! This function doesn't prevent underflow of values. Make sure you call safe lm2logs3 before it.

Parameters:
lm  In: the LM
lw  In: the language weight
wip  In: the word insertion penalty

int32 lm_tg32list (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg32_t **tg, int32 *bowt)
 

Parameters:
lmp  In: LM being queried
w1  In: LM word id of the first of a 2-word history
w2  In: LM word id of the second of the 2-word history
tg  Out: *tg = array of trigrams for <w1,w2>
bowt  Out: *bowt = backoff-weight for <w1, w2>

int32 lm_tg_exists (lm_t *lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3)
 

Whether a certain trigram exists.

Parameters:
lm  In: LM

int32 lm_tg_score (lm_t *lmp, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3, s3wid_t w3)
 

Return the trigram score for the given three-word sequence. If lw1 is BAD_LMWID(lm), return lm_bg_score (lw2, lw3). If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3).

20040227: This also accounts for the in-class probability of w3.

Parameters:
lmp  In: LM being queried

int32 lm_tglist (lm_t *lmp, s3lmwid32_t w1, s3lmwid32_t w2, tg_t **tg, int32 *bowt)
 

Return trigram followers for given two words. Both w1 and w2 must be valid. Return value: #trigrams in returned list.

Parameters:
lmp  In: LM being queried
w1  In: LM word id of the first of a 2-word history
w2  In: LM word id of the second of the 2-word history
tg  Out: *tg = array of trigrams for <w1,w2>
bowt  Out: *bowt = backoff-weight for <w1, w2>

int32 lm_ug_exists (lm_t *lm, s3lmwid32_t lwid)
 

Parameters:
lm  LM
lwid  LM ID for the word

int32 lm_ug_score (lm_t *lmp, s3lmwid32_t lwid, s3wid_t wid)
 

Return unigram score for the given word

Parameters:
lmp  In: LM being queried
lwid  LM ID for the word
wid  Dict ID for the word

int32 lm_ug_wordprob (lm_t *lm, dict_t *dict, int32 th, wordprob_t *wp)
 

Like lm_bg_wordprob, but for unigrams. Return value: #entries filled in the wordprob array.

Parameters:
lm  In: LM being queried
dict  In : The dictionary
wp  In/out: Array to be filled

int32 lm_uglist (lm_t *lmp, ug_t **ug)
 

Return the unigrams in LM. Return value: #unigrams in returned list.

Parameters:
lmp  In: LM being queried
ug  Out: *ug = unigram array

s3lmwid32_t lm_wid (lm_t *lm, const char *wd)
 

S3DECODER_EXPORT int32 lm_write (lm_t *model, const char *outputfile, const char *filename, const char *fmt)
 

Simple writing of an LM file. The input and output encodings are assumed to be iso8859-1. Calls lm_write_advance. To convert encodings, please use lm_write_advance directly.

Parameters:
model  In: the pointer LM we want to output
outputfile  In: the output file name
filename  In: the LM file name
fmt  In: LM file format, it is now either "TXT" or "DMP"

int32 lm_write_advance (lm_t *model, const char *outputfile, const char *filename, const char *fmt, const char *inputenc, char *outputenc)
 

Writing of an LM file with advanced options such as encoding support. Called by lm_write.

fmt now could be TXT, DMP, FST

inputenc and outputenc could now be iso8859-1, gb2312-hex, gb2312. Not every pair of conversion works.

Current input/output encodings support list. 0: iso8859-1 1: gb2312-hex 2: gb2312

-: do nothing n: doesn't make sense or not compatible x: not supported yet y: supported

(rows: input encoding i; columns: output encoding o)

i\o  0  1  2
0    -  n  n
1    n  -  y
2    n  x  -

When we have 4 encoding types: This document should be implemented as a data structure.

This conversion table is copied from encoding.c; please take a look at encoding.c for the latest support.

Parameters:
model  In: the pointer LM we want to output
outputfile  In: the output file name
filename  In: the LM file name
fmt  In: LM file format, it is now either "TXT", "DMP", "FST"
inputenc  In: Input encoding type
outputenc  Out: Output encoding type

void lmset_add_lm (lmset_t *lms, lm_t *lm, const char *lmname)
 

Add a new lm into the lmset. Notice that lms->n_lm will be incremented by 1

Parameters:
lms  In/Out : The set of LM
lm  In : The input LM
lmname  In: The lm name

void lmset_delete_lm (lmset_t *lms, const char *lmname)
 

Delete the LM with name lmname. Notice that lms->n_lm will be decremented by 1

Parameters:
lms  In/Out : The set of LM
lmname  The lm name

S3DECODER_EXPORT void lmset_free (lmset_t *lms)
 

Free the lmset data structure

Parameters:
lms  In: The set of LM

lm_t * lmset_get_lm_widx (lmset_t *lms, int32 lmidx)
 

Get an LM by index.

Parameters:
lms  In: The set of LM
lmidx  In: LM index

lm_t * lmset_get_lm_wname (lmset_t *lms, const char *lmname)
 

Get an LM by name

Returns:
a pointer of the LM with name lmname
Parameters:
lms  In: The set of LM
lmname  In: The LM name

char * lmset_idx_to_name (lmset_t *lms, int32 lmidx)
 

Convert index to name

Returns:
a pointer of the name string. No memory is allocated.
Parameters:
lms  In: The set of LM
lmidx  In: LM index

S3DECODER_EXPORT lmset_t * lmset_init (const char *lmfile, const char *lmctlfile, const char *ctl_lm, const char *lmname, const char *lmdumpdir, float32 lw, float32 wip, float32 uw, dict_t *dict, logmath_t *logmath)
 

A wrapper function of controlling the behavior of LM initialization

(ARCHAN 20050617) lmset_init controls the behavior how the lmset which is an array of lm was initialized by different command-line arguments. lmfile and lmctlfile are mutually exclusive. Each will invoke one reading functions.

In the case of -lmfile is specified. A lmset with one single lm (or lmset->n_lm=1) will be returned. The single lm's name will be called lmname.

In the case of -lmctlfile is specified. A lmset with multiple lms will be returned. The number of lm will depend on the number of lm specified by -lmctlfile. For the format, please read the current format of -lmctlfile in lm.c

ctl_lm is the equivalent of -ctl for lm. When -ctl_lm is not specified on the command line (ctl_lm is NULL), the lm with name lmname will be used as the default lm. If lmname is NULL, then the first lm will be named "default".

lmdumpdir is currently not used. It is there for backward compatibility purpose.

lw, wip and uw are the language weight, word insertion penalty and unigram weight. Their values are crucial to the computation of the language model score. Therefore, the programmer is urged to set these three values carefully and also to be careful of their order.

dict is assumed to be a pre-initialized dict_t structure which is used in deriving the mapping between the dictionary word and the lm words

ARCHAN 20050711: -lminmemory is the only global variable that controls the code, and we haven't explicitly specified it. Currently, if the LM is DMP, either -lminmemory=0 or -lminmemory=1 can be used. If the LM is text-based, only -lminmemory=1 is accepted. (This will be changed in the future.)

ARCHAN 20050705: A survival guide for this part of the code. Our language model code is unnecessarily complicated, mainly because the ways we specify class-based LMs and multiple LMs are inter-dependent. For example, one could specify a multiple-LM file (i.e. lmctlfile) and have no classes. However, if one would like to specify class information even with a single LM, one needs to use the multiple-LM file format (i.e. lmctlfile).

This difficulty is well-observed in the period of Sphinx 3.4-3.6. That might imply that a new LM format is needed if we want to sustain this part of the development.

Parameters:
lmfile  The lm file name, lmfile and lmctlfile are mutually exclusive
lmctlfile  The file that specified multiple LMs and class information, lmfile and lmctlfile are mutually exclusive
ctl_lm  The control file that describes which lm to use for a particular utterance
lmname  The LM name to use if ctl_lm is not specified
lmdumpdir  Currently not used
lw  Language model weight
wip  Word insertion penalty
uw  Unigram weight
dict  A pre-initialized dict_t structure

int32 lmset_name_to_idx (lmset_t *lms, const char *lmname)
 

Convert name to index

Parameters:
lms  In: The set of LM
lmname  In: The LM name

lmset_t * lmset_read_ctl (const char *ctlfile, dict_t *dict, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
 

Read the LM control file. **Usually**, it is also a class-based LM.

Parameters:
ctlfile  Control file name
dict  In: Dictionary
lw  In: Language weight
wip  In: Word insertion penalty
uw  In: Unigram weight
lmdumpdir  In: LMdumpdir

lmset_t * lmset_read_lm (const char *lmfile, dict_t *dict, const char *lmname, float64 lw, float64 wip, float64 uw, const char *lmdumpdir, logmath_t *logmath)
 

Read a single LM into the lmset.

Parameters:
lmfile  In: The LM file
dict  In: A pre-initialized dictionary file
lmname  In: The LM name
lw  The language weight
wip  The word insertion penalty
uw  The unigram weight
lmdumpdir  In: LM dump dir

void lmset_set_curlm_widx (lmset_t *lms, int32 lmidx)
 

Set the current LM with index

Parameters:
lms  In: The set of LM
lmidx  In: LM index

S3DECODER_EXPORT void lmset_set_curlm_wname (lmset_t *lms, const char *lmname)
 

Set the current LM with name

Parameters:
lms  In: The set of LM
lmname  In: The LM name

ug_t * NewUnigramTable (int32 n_ug)
 

Create a new unigram table

Parameters:
n_ug  Number of unigram

void swap_bg (bg_t *bg)
 

Swap 16 bits bigram

void swap_bg32 (bg32_t *bg)
 

Swap 32 bits bigram

void swap_tg (tg_t *tg)
 

Swap 16 bits trigram

void swap_tg32 (tg32_t *tg)
 

Swap 32 bits trigram

void tg32_write (FILE *fp, tg32_t *tg)
 

Write of TG (32bits) structure

Parameters:
fp  A file pointer
tg  A pointer of the tg32_t structure

void tg_write (FILE *fp, tg_t *tg)
 

Write of TG structure

Parameters:
fp  A file pointer
tg  A pointer of the tg_t structure

void ug_write (FILE *fp, ug_t *ug)
 

Write of UG structure

Parameters:
fp  A file pointer
ug  A pointer of the ug_t structure


Generated on Sat Apr 11 00:02:29 2009 by  doxygen 1.3.9.1