00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #include "config.h"
00044 #include "ngram_model.h"
00045 #include "ngram_model_internal.h"
00046 #include "ckd_alloc.h"
00047 #include "filename.h"
00048 #include "pio.h"
00049 #include "err.h"
00050 #include "logmath.h"
00051 #include "strfuncs.h"
00052 #include "case.h"
00053
00054 #include <string.h>
00055 #include <assert.h>
00056 #ifdef HAVE_ICONV
00057 #include <iconv.h>
00058 #endif
00059
00060 ngram_file_type_t
00061 ngram_file_name_to_type(const char *file_name)
00062 {
00063 const char *ext;
00064
00065 ext = strrchr(file_name, '.');
00066 if (ext == NULL) {
00067 return NGRAM_ARPA;
00068 }
00069 if (0 == strcmp_nocase(ext, ".gz")) {
00070 while (--ext >= file_name) {
00071 if (*ext == '.') break;
00072 }
00073 if (ext < file_name) {
00074 return NGRAM_ARPA;
00075 }
00076 }
00077
00078 if (0 == strncmp_nocase(ext, ".ARPA", 5))
00079 return NGRAM_ARPA;
00080 if (0 == strncmp_nocase(ext, ".DMP32", 6))
00081 return NGRAM_DMP32;
00082 if (0 == strncmp_nocase(ext, ".DMP", 4))
00083 return NGRAM_DMP;
00084 return NGRAM_ARPA;
00085 }
00086
00087 ngram_model_t *
00088 ngram_model_read(cmd_ln_t *config,
00089 const char *file_name,
00090 ngram_file_type_t file_type,
00091 logmath_t *lmath)
00092 {
00093 ngram_model_t *model = NULL;
00094
00095 switch (file_type) {
00096 case NGRAM_AUTO: {
00097 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
00098 break;
00099 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
00100 break;
00101 if ((model = ngram_model_dmp32_read(config, file_name, lmath)) != NULL)
00102 break;
00103 return NULL;
00104 }
00105 case NGRAM_ARPA:
00106 model = ngram_model_arpa_read(config, file_name, lmath);
00107 break;
00108 case NGRAM_DMP:
00109 model = ngram_model_dmp_read(config, file_name, lmath);
00110 break;
00111 case NGRAM_DMP32:
00112 model = ngram_model_dmp32_read(config, file_name, lmath);
00113 break;
00114 }
00115
00116
00117 if (config) {
00118 float32 lw = 1.0;
00119 float32 wip = 1.0;
00120 float32 uw = 1.0;
00121
00122 if (cmd_ln_exists_r(config, "-lw"))
00123 lw = cmd_ln_float32_r(config, "-lw");
00124 if (cmd_ln_exists_r(config, "-wip"))
00125 wip = cmd_ln_float32_r(config, "-wip");
00126 if (cmd_ln_exists_r(config, "-uw"))
00127 uw = cmd_ln_float32_r(config, "-uw");
00128
00129 ngram_model_apply_weights(model, lw, wip, uw);
00130 }
00131
00132 return model;
00133 }
00134
00135 int
00136 ngram_model_write(ngram_model_t *model, const char *file_name,
00137 ngram_file_type_t file_type)
00138 {
00139 switch (file_type) {
00140 case NGRAM_AUTO: {
00141 file_type = ngram_file_name_to_type(file_name);
00142 return ngram_model_write(model, file_name, file_type);
00143 }
00144 case NGRAM_ARPA:
00145 return ngram_model_arpa_write(model, file_name);
00146 case NGRAM_DMP:
00147 return ngram_model_dmp_write(model, file_name);
00148 case NGRAM_DMP32:
00149 return ngram_model_dmp32_write(model, file_name);
00150 }
00151
00152 return -1;
00153 }
00154
00155 int32
00156 ngram_model_init(ngram_model_t *base,
00157 ngram_funcs_t *funcs,
00158 logmath_t *lmath,
00159 int32 n, int32 n_unigram)
00160 {
00161 base->refcount = 1;
00162 base->funcs = funcs;
00163 base->n = n;
00164
00165 if (base->n_counts == NULL)
00166 base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
00167
00168 if (base->lmath != lmath) {
00169
00170 base->lw = 1.0;
00171 base->log_wip = 0;
00172 base->log_uw = 0;
00173 base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
00174 base->log_uniform_weight = logmath_get_zero(lmath);
00175 base->log_zero = logmath_get_zero(lmath);
00176 base->lmath = lmath;
00177 }
00178
00179 if (base->word_str) {
00180
00181 if (base->writable) {
00182 int32 i;
00183 for (i = 0; i < base->n_words; ++i) {
00184 ckd_free(base->word_str[i]);
00185 base->word_str[i] = NULL;
00186 }
00187 }
00188 base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
00189 }
00190 else
00191 base->word_str = ckd_calloc(n_unigram, sizeof(char *));
00192
00193
00194 if (base->wid)
00195 hash_table_empty(base->wid);
00196 else
00197 base->wid = hash_table_new(n_unigram, FALSE);
00198 base->n_1g_alloc = base->n_words = n_unigram;
00199
00200 return 0;
00201 }
00202
00203 ngram_model_t *
00204 ngram_model_retain(ngram_model_t *model)
00205 {
00206 ++model->refcount;
00207 return model;
00208 }
00209
00210 int
00211 ngram_model_free(ngram_model_t *model)
00212 {
00213 int i;
00214
00215 if (model == NULL)
00216 return 0;
00217 if (--model->refcount > 0)
00218 return model->refcount;
00219 if (model->funcs && model->funcs->free)
00220 (*model->funcs->free)(model);
00221 if (model->writable) {
00222
00223 for (i = 0; i < model->n_words; ++i) {
00224 ckd_free(model->word_str[i]);
00225 }
00226 }
00227 else {
00228
00229 for (i = 0; i < model->n_classes; ++i) {
00230 ngram_class_t *lmclass;
00231 int32 j;
00232
00233 lmclass = model->classes[i];
00234 for (j = 0; j < lmclass->n_words; ++j) {
00235 ckd_free(model->word_str[lmclass->start_wid + j]);
00236 }
00237 for (j = 0; j < lmclass->n_hash; ++j) {
00238 if (lmclass->nword_hash[j].wid != -1) {
00239 ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
00240 }
00241 }
00242 }
00243 }
00244 for (i = 0; i < model->n_classes; ++i) {
00245 ngram_class_free(model->classes[i]);
00246 }
00247 ckd_free(model->classes);
00248 hash_table_free(model->wid);
00249 ckd_free(model->word_str);
00250 ckd_free(model->n_counts);
00251 ckd_free(model);
00252 return 0;
00253 }
00254
00255
00256 #ifdef HAVE_ICONV
00257 int
00258 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00259 {
00260 iconv_t ic;
00261 char *outbuf;
00262 size_t maxlen;
00263 int i, writable;
00264 hash_table_t *new_wid;
00265
00266
00267
00268 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
00269 E_ERROR_SYSTEM("iconv_open() failed");
00270 return -1;
00271 }
00272
00273
00274
00275
00276
00277
00278
00279 maxlen = 0;
00280 for (i = 0; i < model->n_words; ++i) {
00281 if (strlen(model->word_str[i]) > maxlen)
00282 maxlen = strlen(model->word_str[i]);
00283 }
00284
00285 writable = model->writable;
00286
00287 model->writable = TRUE;
00288
00289 maxlen = maxlen * sizeof(int) + 15;
00290 outbuf = ckd_calloc(maxlen, 1);
00291
00292
00293 new_wid = hash_table_new(model->n_words, FALSE);
00294 for (i = 0; i < model->n_words; ++i) {
00295 ICONV_CONST char *in;
00296 char *out;
00297 size_t inleft, outleft, result;
00298
00299 start_conversion:
00300 in = (ICONV_CONST char *)model->word_str[i];
00301
00302 inleft = strlen(in);
00303 out = outbuf;
00304 outleft = maxlen;
00305
00306 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
00307 if (errno != E2BIG) {
00308
00309
00310 E_ERROR_SYSTEM("iconv() failed");
00311 ckd_free(outbuf);
00312 hash_table_free(new_wid);
00313 return -1;
00314 }
00315
00316 iconv(ic, NULL, NULL, NULL, NULL);
00317
00318 maxlen *= 2;
00319 out = outbuf = ckd_realloc(outbuf, maxlen);
00320
00321 in = (ICONV_CONST char *)model->word_str[i];
00322 inleft = strlen(in);
00323 }
00324
00325
00326 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
00327 if (errno != E2BIG) {
00328
00329
00330 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
00331 ckd_free(outbuf);
00332 hash_table_free(new_wid);
00333 return -1;
00334 }
00335
00336 iconv(ic, NULL, NULL, NULL, NULL);
00337
00338 maxlen *= 2;
00339 outbuf = ckd_realloc(outbuf, maxlen);
00340
00341 goto start_conversion;
00342 }
00343
00344 result = maxlen - outleft;
00345
00346 if (writable) {
00347
00348 model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
00349 model->word_str[i][result] = '\0';
00350 }
00351 else {
00352
00353 model->word_str[i] = ckd_calloc(result + 1, 1);
00354 }
00355
00356 memcpy(model->word_str[i], outbuf, result);
00357
00358
00359
00360
00361 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00362 E_WARN("Duplicate word in dictionary after conversion: %s\n",
00363 model->word_str[i]);
00364 }
00365 }
00366 ckd_free(outbuf);
00367 iconv_close(ic);
00368
00369 hash_table_free(model->wid);
00370 model->wid = new_wid;
00371
00372 return 0;
00373 }
00374 #else
00375 int
00376 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00377 {
00378 return -1;
00379 }
00380 #endif
00381
00382 int
00383 ngram_model_apply_weights(ngram_model_t *model,
00384 float32 lw, float32 wip, float32 uw)
00385 {
00386 return (*model->funcs->apply_weights)(model, lw, wip, uw);
00387 }
00388
00389 float32
00390 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
00391 int32 *out_log_uw)
00392 {
00393 if (out_log_wip) *out_log_wip = model->log_wip;
00394 if (out_log_uw) *out_log_uw = model->log_uw;
00395 return model->lw;
00396 }
00397
00398
00399 int32
00400 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
00401 int32 n_hist, int32 *n_used)
00402 {
00403 int32 score, class_weight = 0;
00404 int i;
00405
00406
00407 if (wid == NGRAM_INVALID_WID)
00408 return model->log_zero;
00409
00410
00411 if (NGRAM_IS_CLASSWID(wid)) {
00412 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00413
00414 class_weight = ngram_class_prob(lmclass, wid);
00415 if (class_weight == 0)
00416
00417
00418 return model->log_zero;
00419 wid = lmclass->tag_wid;
00420 }
00421 for (i = 0; i < n_hist; ++i) {
00422 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00423 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00424 }
00425 score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
00426
00427
00428 return score + class_weight;
00429 }
00430
00431 int32
00432 ngram_score(ngram_model_t *model, const char *word, ...)
00433 {
00434 va_list history;
00435 const char *hword;
00436 int32 *histid;
00437 int32 n_hist;
00438 int32 n_used;
00439 int32 prob;
00440
00441 va_start(history, word);
00442 n_hist = 0;
00443 while ((hword = va_arg(history, const char *)) != NULL)
00444 ++n_hist;
00445 va_end(history);
00446
00447 histid = ckd_calloc(n_hist, sizeof(*histid));
00448 va_start(history, word);
00449 n_hist = 0;
00450 while ((hword = va_arg(history, const char *)) != NULL) {
00451 histid[n_hist] = ngram_wid(model, hword);
00452 ++n_hist;
00453 }
00454 va_end(history);
00455
00456 prob = ngram_ng_score(model, ngram_wid(model, word),
00457 histid, n_hist, &n_used);
00458 ckd_free(histid);
00459 return prob;
00460 }
00461
00462 int32
00463 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
00464 {
00465 int32 hist[2] = { w2, w1 };
00466 return ngram_ng_score(model, w3, hist, 2, n_used);
00467 }
00468
00469 int32
00470 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
00471 {
00472 return ngram_ng_score(model, w2, &w1, 1, n_used);
00473 }
00474
00475 int32
00476 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
00477 int32 n_hist, int32 *n_used)
00478 {
00479 int32 prob, class_weight = 0;
00480 int i;
00481
00482
00483 if (wid == NGRAM_INVALID_WID)
00484 return model->log_zero;
00485
00486
00487 if (NGRAM_IS_CLASSWID(wid)) {
00488 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00489
00490 class_weight = ngram_class_prob(lmclass, wid);
00491 if (class_weight == model->log_zero)
00492 return class_weight;
00493 wid = lmclass->tag_wid;
00494 }
00495 for (i = 0; i < n_hist; ++i) {
00496 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00497 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00498 }
00499 prob = (*model->funcs->raw_score)(model, wid, history,
00500 n_hist, n_used);
00501
00502 return prob + class_weight;
00503 }
00504
00505 int32
00506 ngram_prob(ngram_model_t *model, const char *word, ...)
00507 {
00508 va_list history;
00509 const char *hword;
00510 int32 *histid;
00511 int32 n_hist;
00512 int32 n_used;
00513 int32 prob;
00514
00515 va_start(history, word);
00516 n_hist = 0;
00517 while ((hword = va_arg(history, const char *)) != NULL)
00518 ++n_hist;
00519 va_end(history);
00520
00521 histid = ckd_calloc(n_hist, sizeof(*histid));
00522 va_start(history, word);
00523 n_hist = 0;
00524 while ((hword = va_arg(history, const char *)) != NULL) {
00525 histid[n_hist] = ngram_wid(model, hword);
00526 ++n_hist;
00527 }
00528 va_end(history);
00529
00530 prob = ngram_ng_prob(model, ngram_wid(model, word),
00531 histid, n_hist, &n_used);
00532 ckd_free(histid);
00533 return prob;
00534 }
00535
00536 int32
00537 ngram_score_to_prob(ngram_model_t *base, int32 score)
00538 {
00539 int32 prob;
00540
00541
00542 prob = score - base->log_wip;
00543
00544 prob = (int32)(prob / base->lw);
00545
00546 return prob;
00547 }
00548
00549 int32
00550 ngram_unknown_wid(ngram_model_t *model)
00551 {
00552 int32 val;
00553
00554
00555
00556 if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
00557 return NGRAM_INVALID_WID;
00558 else
00559 return val;
00560 }
00561
00562 int32
00563 ngram_zero(ngram_model_t *model)
00564 {
00565 return model->log_zero;
00566 }
00567
00568 int32
00569 ngram_model_get_size(ngram_model_t *model)
00570 {
00571 if (model != NULL)
00572 return model->n;
00573 return 0;
00574 }
00575
00576 int32 const *
00577 ngram_model_get_counts(ngram_model_t *model)
00578 {
00579 if (model != NULL)
00580 return model->n_counts;
00581 return NULL;
00582 }
00583
00584 int32
00585 ngram_wid(ngram_model_t *model, const char *word)
00586 {
00587 int32 val;
00588
00589 if (hash_table_lookup_int32(model->wid, word, &val) == -1)
00590 return ngram_unknown_wid(model);
00591 else
00592 return val;
00593 }
00594
00595 const char *
00596 ngram_word(ngram_model_t *model, int32 wid)
00597 {
00598
00599 wid = NGRAM_BASEWID(wid);
00600 if (wid >= model->n_words)
00601 return NULL;
00602 return model->word_str[wid];
00603 }
00604
00608 int32
00609 ngram_add_word_internal(ngram_model_t *model,
00610 const char *word,
00611 int32 classid)
00612 {
00613 void *dummy;
00614 int32 wid;
00615
00616
00617 wid = model->n_words;
00618 if (classid >= 0) {
00619 wid = NGRAM_CLASSWID(wid, classid);
00620 }
00621
00622 if (hash_table_lookup(model->wid, word, &dummy) == 0) {
00623 E_ERROR("Duplicate definition of word %s\n", word);
00624 return NGRAM_INVALID_WID;
00625 }
00626
00627 if (model->n_words >= model->n_1g_alloc) {
00628 model->n_1g_alloc += UG_ALLOC_STEP;
00629 model->word_str = ckd_realloc(model->word_str,
00630 sizeof(*model->word_str) * model->n_1g_alloc);
00631 }
00632
00633
00634 model->word_str[model->n_words] = ckd_salloc(word);
00635
00636 if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
00637 E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
00638 model->word_str[model->n_words], (void *)(long)(wid));
00639 }
00640
00641 ++model->n_words;
00642 return wid;
00643 }
00644
00645 int32
00646 ngram_model_add_word(ngram_model_t *model,
00647 const char *word, float32 weight)
00648 {
00649 int32 wid, prob = model->log_zero;
00650
00651 wid = ngram_add_word_internal(model, word, -1);
00652 if (wid == NGRAM_INVALID_WID)
00653 return wid;
00654
00655
00656 if (model->funcs && model->funcs->add_ug)
00657 prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
00658 if (prob == 0) {
00659 if (model->writable)
00660 ckd_free(model->word_str[wid]);
00661 return -1;
00662 }
00663 return wid;
00664 }
00665
00666 ngram_class_t *
00667 ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
00668 {
00669 ngram_class_t *lmclass;
00670 gnode_t *gn;
00671 float32 tprob;
00672 int i;
00673
00674 lmclass = ckd_calloc(1, sizeof(*lmclass));
00675 lmclass->tag_wid = tag_wid;
00676
00677 lmclass->start_wid = start_wid;
00678 lmclass->n_words = glist_count(classwords);
00679 lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
00680 lmclass->nword_hash = NULL;
00681 lmclass->n_hash = 0;
00682 tprob = 0.0;
00683 for (gn = classwords; gn; gn = gnode_next(gn)) {
00684 tprob += gnode_float32(gn);
00685 }
00686 if (tprob > 1.1 || tprob < 0.9) {
00687 E_WARN("Total class probability is %f, will normalize\n", tprob);
00688 for (gn = classwords; gn; gn = gnode_next(gn)) {
00689 gn->data.fl /= tprob;
00690 }
00691 }
00692 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
00693 lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
00694 }
00695
00696 return lmclass;
00697 }
00698
00699 int32
00700 ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
00701 {
00702 int32 hash;
00703
00704 if (lmclass->nword_hash == NULL) {
00705
00706 lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00707 memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00708 lmclass->n_hash = NGRAM_HASH_SIZE;
00709 lmclass->n_hash_inuse = 0;
00710 }
00711
00712
00713
00714 hash = wid & (lmclass->n_hash - 1);
00715 if (lmclass->nword_hash[hash].wid == -1) {
00716
00717 lmclass->nword_hash[hash].wid = wid;
00718 lmclass->nword_hash[hash].prob1 = lweight;
00719 ++lmclass->n_hash_inuse;
00720 return hash;
00721 }
00722 else {
00723 int32 next;
00724
00725 while (lmclass->nword_hash[hash].next != -1)
00726 hash = lmclass->nword_hash[hash].next;
00727 assert(hash != -1);
00728
00729 if (lmclass->n_hash_inuse == lmclass->n_hash) {
00730
00731 lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
00732 lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
00733 memset(lmclass->nword_hash + lmclass->n_hash,
00734 0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
00735
00736 next = lmclass->n_hash;
00737 lmclass->n_hash *= 2;
00738 }
00739 else {
00740
00741 for (next = 0; next < lmclass->n_hash; ++next)
00742 if (lmclass->nword_hash[next].wid == -1)
00743 break;
00744
00745 assert(next != lmclass->n_hash);
00746 }
00747 lmclass->nword_hash[next].wid = wid;
00748 lmclass->nword_hash[next].prob1 = lweight;
00749 lmclass->nword_hash[hash].next = next;
00750 ++lmclass->n_hash_inuse;
00751 return next;
00752 }
00753 }
00754
00755 void
00756 ngram_class_free(ngram_class_t *lmclass)
00757 {
00758 ckd_free(lmclass->nword_hash);
00759 ckd_free(lmclass->prob1);
00760 ckd_free(lmclass);
00761 }
00762
00763 int32
00764 ngram_model_add_class_word(ngram_model_t *model,
00765 const char *classname,
00766 const char *word,
00767 float32 weight)
00768 {
00769 ngram_class_t *lmclass;
00770 int32 classid, tag_wid, wid, i, scale;
00771 float32 fprob;
00772
00773
00774
00775
00776 tag_wid = ngram_wid(model, classname);
00777 if (tag_wid == NGRAM_INVALID_WID) {
00778 E_ERROR("No such word or class tag: %s\n", classname);
00779 return tag_wid;
00780 }
00781 for (classid = 0; classid < model->n_classes; ++classid) {
00782 if (model->classes[classid]->tag_wid == tag_wid)
00783 break;
00784 }
00785
00786 if (classid == model->n_classes) {
00787 E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
00788 return NGRAM_INVALID_WID;
00789 }
00790 lmclass = model->classes[classid];
00791
00792
00793 wid = ngram_add_word_internal(model, word, classid);
00794 if (wid == NGRAM_INVALID_WID)
00795 return wid;
00796
00797
00798 fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
00799
00800
00801
00802 scale = logmath_log(model->lmath, 1.0 - fprob);
00803 for (i = 0; i < lmclass->n_words; ++i)
00804 lmclass->prob1[i] += scale;
00805 for (i = 0; i < lmclass->n_hash; ++i)
00806 if (lmclass->nword_hash[i].wid != -1)
00807 lmclass->nword_hash[i].prob1 += scale;
00808
00809
00810 return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
00811 }
00812
00813 int32
00814 ngram_model_add_class(ngram_model_t *model,
00815 const char *classname,
00816 float32 classweight,
00817 char **words,
00818 const float32 *weights,
00819 int32 n_words)
00820 {
00821 ngram_class_t *lmclass;
00822 glist_t classwords = NULL;
00823 int32 i, start_wid = -1;
00824 int32 classid, tag_wid;
00825
00826
00827 if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
00828 tag_wid = ngram_model_add_word(model, classname, classweight);
00829 if (tag_wid == NGRAM_INVALID_WID)
00830 return -1;
00831 }
00832
00833 if (model->n_classes == 128) {
00834 E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
00835 return -1;
00836 }
00837 classid = model->n_classes;
00838 for (i = 0; i < n_words; ++i) {
00839 int32 wid;
00840
00841 wid = ngram_add_word_internal(model, words[i], classid);
00842 if (wid == NGRAM_INVALID_WID)
00843 return -1;
00844 if (start_wid == -1)
00845 start_wid = NGRAM_BASEWID(wid);
00846 classwords = glist_add_float32(classwords, weights[i]);
00847 }
00848 classwords = glist_reverse(classwords);
00849 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
00850 glist_free(classwords);
00851 if (lmclass == NULL)
00852 return -1;
00853
00854 ++model->n_classes;
00855 if (model->classes == NULL)
00856 model->classes = ckd_calloc(1, sizeof(*model->classes));
00857 else
00858 model->classes = ckd_realloc(model->classes,
00859 model->n_classes * sizeof(*model->classes));
00860 model->classes[classid] = lmclass;
00861 return classid;
00862 }
00863
00864 int32
00865 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
00866 {
00867 int32 base_wid = NGRAM_BASEWID(wid);
00868
00869 if (base_wid < lmclass->start_wid
00870 || base_wid > lmclass->start_wid + lmclass->n_words) {
00871 int32 hash;
00872
00873
00874 hash = wid & (lmclass->n_hash - 1);
00875 while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
00876 hash = lmclass->nword_hash[hash].next;
00877 if (hash == -1)
00878 return 0;
00879 return lmclass->nword_hash[hash].prob1;
00880 }
00881 else {
00882 return lmclass->prob1[base_wid - lmclass->start_wid];
00883 }
00884 }
00885
00886 int32
00887 read_classdef_file(hash_table_t *classes, const char *file_name)
00888 {
00889 FILE *fp;
00890 int32 is_pipe;
00891 int inclass;
00892 int32 rv = -1;
00893 gnode_t *gn;
00894 glist_t classwords = NULL;
00895 glist_t classprobs = NULL;
00896 char *classname = NULL;
00897
00898 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
00899 E_ERROR("File %s not found\n", file_name);
00900 return -1;
00901 }
00902
00903 inclass = FALSE;
00904 while (!feof(fp)) {
00905 char line[512];
00906 char *wptr[2];
00907 int n_words;
00908
00909 if (fgets(line, sizeof(line), fp) == NULL)
00910 break;
00911
00912 n_words = str2words(line, wptr, 2);
00913 if (n_words <= 0)
00914 continue;
00915
00916 if (inclass) {
00917
00918 if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
00919 classdef_t *classdef;
00920 gnode_t *word, *weight;
00921 int32 i;
00922
00923 if (classname == NULL || 0 != strcmp(wptr[1], classname))
00924 goto error_out;
00925 inclass = FALSE;
00926
00927
00928 classdef = ckd_calloc(1, sizeof(*classdef));
00929 classwords = glist_reverse(classwords);
00930 classprobs = glist_reverse(classprobs);
00931 classdef->n_words = glist_count(classwords);
00932 classdef->words = ckd_calloc(classdef->n_words,
00933 sizeof(*classdef->words));
00934 classdef->weights = ckd_calloc(classdef->n_words,
00935 sizeof(*classdef->weights));
00936 word = classwords;
00937 weight = classprobs;
00938 for (i = 0; i < classdef->n_words; ++i) {
00939 classdef->words[i] = gnode_ptr(word);
00940 classdef->weights[i] = gnode_float32(weight);
00941 word = gnode_next(word);
00942 weight = gnode_next(weight);
00943 }
00944
00945
00946 if (hash_table_enter(classes, classname, classdef) != classdef) {
00947 classdef_free(classdef);
00948 goto error_out;
00949 }
00950
00951
00952 glist_free(classwords);
00953 glist_free(classprobs);
00954 classwords = NULL;
00955 classprobs = NULL;
00956 classname = NULL;
00957 }
00958 else {
00959 float32 fprob;
00960
00961 if (n_words == 2)
00962 fprob = (float32)atof_c(wptr[1]);
00963 else
00964 fprob = 1.0f;
00965
00966 classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
00967 classprobs = glist_add_float32(classprobs, fprob);
00968 }
00969 }
00970 else {
00971
00972 if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
00973 if (inclass)
00974 goto error_out;
00975 inclass = TRUE;
00976 classname = ckd_salloc(wptr[1]);
00977 }
00978
00979 }
00980 }
00981 rv = 0;
00982
00983 error_out:
00984
00985 fclose_comp(fp, is_pipe);
00986 for (gn = classwords; gn; gn = gnode_next(gn))
00987 ckd_free(gnode_ptr(gn));
00988 glist_free(classwords);
00989 glist_free(classprobs);
00990 ckd_free(classname);
00991
00992 return rv;
00993 }
00994
00995 void
00996 classdef_free(classdef_t *classdef)
00997 {
00998 int32 i;
00999 for (i = 0; i < classdef->n_words; ++i)
01000 ckd_free(classdef->words[i]);
01001 ckd_free(classdef->words);
01002 ckd_free(classdef->weights);
01003 ckd_free(classdef);
01004 }
01005
01006
01007 int32
01008 ngram_model_read_classdef(ngram_model_t *model,
01009 const char *file_name)
01010 {
01011 hash_table_t *classes;
01012 glist_t hl = NULL;
01013 gnode_t *gn;
01014 int32 rv = -1;
01015
01016 classes = hash_table_new(0, FALSE);
01017 if (read_classdef_file(classes, file_name) < 0) {
01018 hash_table_free(classes);
01019 return -1;
01020 }
01021
01022
01023 hl = hash_table_tolist(classes, NULL);
01024 for (gn = hl; gn; gn = gnode_next(gn)) {
01025 hash_entry_t *he = gnode_ptr(gn);
01026 classdef_t *classdef = he->val;
01027
01028 if (ngram_model_add_class(model, he->key, 1.0,
01029 classdef->words,
01030 classdef->weights,
01031 classdef->n_words) < 0)
01032 goto error_out;
01033 }
01034 rv = 0;
01035
01036 error_out:
01037 for (gn = hl; gn; gn = gnode_next(gn)) {
01038 hash_entry_t *he = gnode_ptr(gn);
01039 ckd_free((char *)he->key);
01040 classdef_free(he->val);
01041 }
01042 glist_free(hl);
01043 hash_table_free(classes);
01044 return rv;
01045 }