44 #pragma warning(disable:4244) // Conversion warnings
45 #pragma warning(disable:4800) // int/bool warnings
// NOTE(review): not referenced anywhere in the visible part of this file;
// presumably an upper bound on the number of units in a word - confirm
// against its users before changing the value.
50 #define MAX_WERD_SIZE 100
// Yields the Certainty field of the first node of the list Choices.
// The value returned by first_node() is cast to VIABLE_CHOICE and
// dereferenced without any check, so Choices presumably must be non-empty.
// NOTE(review): "best" assumes the list is kept ordered best-first - confirm
// against the code that inserts into the list.
63 #define BestCertainty(Choices) \
64 (((VIABLE_CHOICE) first_node (Choices))->Certainty)
// Yields the Rating field of the first node of the list Choices.
// The value returned by first_node() is cast to VIABLE_CHOICE and
// dereferenced without any check, so Choices presumably must be non-empty.
// NOTE(review): "best" assumes the list is kept ordered best-first - confirm
// against the code that inserts into the list.
66 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
// Yields the AdjustFactor field of the first node of the list Choices.
// The value returned by first_node() is cast to VIABLE_CHOICE and
// dereferenced without any check, so Choices presumably must be non-empty.
// NOTE(review): "best" assumes the list is kept ordered best-first - confirm
// against the code that inserts into the list.
68 #define BestFactor(Choices) \
69 (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
76 static int CmpChoiceRatings(
void *arg1,
83 return (R1 < R2) ? -1 : 1;
93 ExpandedChoice->
Choice = Choice;
94 for (i = 0, Chunk = 0; i < Choice->
Length; i++)
123 const float certainties[],
131 for (
int i = 0, bw_idx = 0; i < word_choice.
length(); i++, bw_idx++) {
132 int blob_width = pieces_state[bw_idx];
138 blob_width = pieces_state[++bw_idx];
139 assert(blob_width > 0);
155 for (
int i = 0; i < src_choices.
size(); ++i) {
156 BLOB_CHOICE_LIST *cc_list =
new BLOB_CHOICE_LIST();
158 list_it.add_after_then_move(cc_list);
176 for (i = 0, Chunk = 0; i < Choice->
Length; i++) {
195 bool *modified_blobs) {
198 if (modified_blobs !=
NULL) *modified_blobs =
false;
203 if (BestChoice->
length() == 0)
207 cprintf(
"AcceptableChoice(): a choice with fragments beats BestChoice");
214 Choices, modified_blobs));
219 tprintf(
"\nStopper: %s (word=%c, case=%c)\n",
221 (is_valid_word ?
'y' :
'n'),
222 (is_case_ok ?
'y' :
'n'));
225 if (reject_offset_ <= 0.0
f && !is_valid_word)
return false;
226 if (is_valid_word && is_case_ok) {
235 tprintf(
"Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
236 BestChoice->
certainty(), CertaintyThreshold);
238 if (no_dang_ambigs &&
239 BestChoice->
certainty() > CertaintyThreshold &&
244 tprintf(
"AcceptableChoice() returned false"
245 " (no_dang_ambig:%d cert:%g thresh:%g uniform:%d)\n",
259 tprintf(
"\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
270 cprintf(
"AcceptableResult(): a choice with fragments beats BestChoice\n");
283 cprintf (
"Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
284 BestChoice.
certainty(), CertaintyThreshold);
286 if (BestChoice.
certainty() > CertaintyThreshold &&
302 Alternatives =
list_rest (best_choices_);
312 return (best_choices_ !=
NIL_LIST &&
333 char LabelString[80];
342 if (best_raw_choice_)
346 Choices = best_choices_;
348 cprintf(
"\nBest Cooked Choices:\n");
350 sprintf(LabelString,
"Cooked Choice #%d: ", i);
359 int label_num_unichars) {
363 (label_num_unichars > 1 || Choice->
Length > 1)) {
364 for (
int i = 0; i < Choice->
Length; i++) {
369 fprintf(file,
"\t%s\t%.4f\t%.4f\n", label,
384 ExpandChoice(best_choice, &BestChoice);
390 &BestChoice, is_bad));
405 assert (best_raw_choice_ !=
NULL);
407 ExpandChoice(best_raw_choice_, &BestRaw);
410 for (i = 0, Chunk = 0; i < Choice->
Length; i++, Thresholds++) {
421 if (NumErrorChunks > 0) {
422 AvgRating /= NumErrorChunks;
426 *Thresholds = MaxRating;
428 if (*Thresholds > MaxRating)
429 *Thresholds = MaxRating;
430 if (*Thresholds < MinRating)
431 *Thresholds = MinRating;
438 if (best_raw_choice_)
439 delete best_raw_choice_;
440 best_raw_choice_ =
NULL;
452 for (BlobWidth = current_segmentation_,
454 BlobWidth <
End; *BlobWidth++ = 1);
465 for (Segmentation = current_segmentation_; *BlobWidth != 0;
466 BlobWidth++, Segmentation++)
467 *Segmentation = *BlobWidth;
473 if (best_raw_choice_)
AddNewChunk(best_raw_choice_, Blob);
474 Choices = best_choices_;
478 Choices = raw_choices_;
485 const float Certainties[],
493 if (!keep_word_choices_)
497 if (!best_raw_choice_) {
500 }
else if (WordChoice->
rating() < best_raw_choice_->
Rating) {
505 delete best_raw_choice_;
511 ChoicesList = raw_choices_;
513 ChoicesList = best_choices_;
527 tprintf(
"Discarding choice \"%s\" with an overly low certainty"
528 " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
540 Choices = ChoicesList;
562 ChoicesList =
s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
575 raw_choices_ = ChoicesList;
577 best_choices_ = ChoicesList;
583 bool fix_replaceable,
585 bool *modified_blobs) {
587 tprintf(
"\nRunning NoDangerousAmbig() for %s\n",
595 bool modified_best_choice =
false;
596 bool ambigs_found =
false;
612 for (
int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
613 bool replace = (fix_replaceable && pass == 0);
621 for (i = 0; i < best_choice->
length(); ++i) {
622 BLOB_CHOICE_LIST *lst =
new BLOB_CHOICE_LIST();
623 BLOB_CHOICE_IT lst_it(lst);
627 0.0, 0.0, -1, -1, -1, 0, 1,
false));
632 int wrong_ngram_index;
635 for (i = 0; i < best_choice->
length(); ++i) {
639 tprintf(
"Looking for %s ngrams starting with %s:\n",
640 replace ?
"replaceable" :
"ambiguous",
643 wrong_ngram_index = 0;
644 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
645 if (curr_unichar_id == INVALID_UNICHAR_ID ||
646 curr_unichar_id >= table.
size() ||
647 table[curr_unichar_id] ==
NULL) {
650 AmbigSpec_IT spec_it(table[curr_unichar_id]);
651 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
652 const AmbigSpec *ambig_spec = spec_it.data();
653 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
659 tprintf(
"current ngram from spec: ");
661 tprintf(
"comparison result: %d\n", compare);
667 blob_index, blob_index+wrong_ngram_index, replace,
670 tprintf(
"fixpt+=(%d %d %d %d)\n", blob_index,
671 blob_index+wrong_ngram_index,
false,
679 tprintf(
"replace ambiguity with: ");
685 best_choice, blob_choices, modified_blobs);
686 modified_best_choice =
true;
695 for (
int tmp_index = 0; tmp_index <= wrong_ngram_index;
704 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
707 -1, -1, -1, 0, 1,
false));
711 }
else if (compare == -1) {
713 ((next_index = wrong_ngram_index+1+i) < best_choice->
length())) {
716 wrong_ngram[++wrong_ngram_index] =
732 tprintf(
"\nResulting ambig_blob_choices:\n");
733 for (i = 0; i < ambig_blob_choices.
length(); ++i) {
739 ambigs_found = (alt_word->
rating() < 0.0);
742 tprintf (
"Stopper: Possible ambiguous word = %s\n",
751 for (i = 0; i < alt_word->
length(); ++i) {
752 bool replacement_is_ngram =
756 (orig_i == end_i && replacement_is_ngram)) {
758 replacement_is_ngram));
760 tprintf(
"fixpt->dangerous+=(%d %d %d %d)\n", orig_i, end_i,
761 true, replacement_is_ngram);
770 if (output_ambig_words_file_ !=
NULL) {
771 fprintf(output_ambig_words_file_,
"\n");
775 return !ambigs_found;
781 reject_offset_ = 0.0;
790 for (i = 0, LastChunk = 0; i < Choice->
Length; i++) {
792 if (Blob < LastChunk) {
797 cprintf (
"AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
798 Choice->
Length, LastChunk, Blob);
805 bool *modified_blobs) {
806 int num_blobs_to_replace = 0;
807 int begin_blob_index = 0;
809 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
810 if (i >= wrong_ngram_begin_index) {
817 int temp_blob_index = begin_blob_index;
818 const char *temp_uch =
NULL;
819 const char *correct_ngram_str =
821 for (
int replaced_count = 0; replaced_count < wrong_ngram_size;
823 if (blob_choices !=
NULL) {
827 for (i = 0; i < fraglen; ++i) {
834 bit.set_to_list(blob_choices->
get(temp_blob_index));
837 temp_blob_index - begin_blob_index,
838 num_blobs_to_replace,
false);
845 for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
846 if (bit.data()->unichar_id() == correct_frag_uch_id) {
849 if (bit.data()->unichar_id() == uch_id) {
850 bit.add_before_then_move(
new BLOB_CHOICE(*(bit.data())));
851 bit.data()->set_unichar_id(correct_frag_uch_id);
852 if (modified_blobs !=
NULL) *modified_blobs =
true;
861 if (replaced_count + 1 == wrong_ngram_size) {
863 num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
869 *modified_blobs && blob_choices !=
NULL) {
870 werd_choice->
print(
"ReplaceAmbig() ");
871 tprintf(
"Modified blob_choices: ");
872 for (
int i = 0; i < blob_choices->
size(); ++i) {
886 for (
int w = 0; w < WordChoice.
length(); ++w) {
889 }
else if (curr_len > 0) {
890 if (curr_len < shortest) shortest = curr_len;
894 if (curr_len > 0 && curr_len < shortest) {
904 const float Certainties[]) {
905 int Length = WordChoice.
length();
906 assert (Length <= MAX_NUM_CHUNKS && Length > 0);
914 fprintf (File,
"%s", Label);
915 fprintf(File,
"(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
919 for (i = 0; i < Choice->
Length; i++)
923 for (i = 0; i < Choice->
Length; i++) {
930 for (i = 0; i < Choice->
Length; i++) {
932 fprintf(File,
"%3d ", (
int) (Choice->
Blob[i].
Certainty * -10.0));
936 for (i = 0; i < Choice->
Length; i++) {
944 FLOAT32 AdjustFactor,
const float Certainties[],
946 ViableChoice->
Init(WordChoice, current_segmentation_, Certainties,
958 for (i = 0, CharChoice = &(ViableChoice->
Blob[0]);
959 i < ViableChoice->
Length; CharChoice++, i++) {
968 const char *String_lengths,
972 int current_unichar_length;
974 for (Char = &(ViableChoice->
Blob[0]), i = 0;
975 i < ViableChoice->Length;
976 String += *(String_lengths++), Char++, i++) {
978 if (current_unichar_length != *String_lengths ||
980 current_unichar_length) != 0)
983 return (*String == 0) ?
true :
false;
990 float CertaintyThreshold;
997 WordLength = Choices.
length();
1001 TotalCertainty = TotalCertaintySquared = 0.0;
1002 BLOB_CHOICE_IT BlobChoiceIt;
1003 for (
int i = 0; i < Choices.
length(); ++i) {
1004 BlobChoiceIt.set_to_list(Choices.
get(i));
1005 Certainty = BlobChoiceIt.data()->certainty();
1006 TotalCertainty += Certainty;
1007 TotalCertaintySquared += Certainty * Certainty;
1008 if (Certainty < WorstCertainty)
1009 WorstCertainty = Certainty;
1014 TotalCertainty -= WorstCertainty;
1015 TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1017 Mean = TotalCertainty / WordLength;
1018 Variance = ((WordLength * TotalCertaintySquared -
1019 TotalCertainty * TotalCertainty) /
1020 (WordLength * (WordLength - 1)));
1023 StdDev = sqrt (Variance);
1029 if (BestChoice.
certainty() < CertaintyThreshold) {
1031 cprintf(
"Stopper: Non-uniform certainty = %4.1f"
1032 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",