25 #pragma warning(disable:4244) // Conversion warnings
35 probability_in_context_(&
tesseract::
Dict::def_probability_in_context),
36 image_ptr_(image_ptr),
38 "A list of user-provided words.",
39 getImage()->getCCUtil()->params()),
41 "A list of user-provided patterns.",
42 getImage()->getCCUtil()->params()),
44 getImage()->getCCUtil()->params()),
46 getImage()->getCCUtil()->params()),
48 getImage()->getCCUtil()->params()),
50 " patterns.", getImage()->getCCUtil()->params()),
52 " patterns.", getImage()->getCCUtil()->params()),
54 " (e.g. for non-space delimited languages)",
55 getImage()->getCCUtil()->params()),
57 "bigrams.", getImage()->getCCUtil()->params()),
59 "Score multiplier for word matches which have good case and"
60 "are frequent in the given language (lower is better).",
61 getImage()->getCCUtil()->params()),
63 "Score multiplier for word matches that have good case "
64 "(lower is better).", getImage()->getCCUtil()->params()),
66 "Default score multiplier for word matches, which may have "
67 "case issues (lower is better).",
68 getImage()->getCCUtil()->params()),
70 "Multipler to for the best choice from the ngram model.",
71 getImage()->getCCUtil()->params()),
73 "Score multiplier for glyph fragment segmentations which "
74 "do not match a dictionary word (lower is better).",
75 getImage()->getCCUtil()->params()),
77 "Score multiplier for poorly cased strings that are not in"
78 " the dictionary and generally look like garbage (lower is"
79 " better).", getImage()->getCCUtil()->params()),
81 "Output file for ambiguities found in the dictionary",
82 getImage()->getCCUtil()->params()),
83 INT_MEMBER(dawg_debug_level, 0,
"Set to 1 for general debug info"
84 ", to 2 for more details, to 3 to see all the debug messages",
85 getImage()->getCCUtil()->params()),
86 INT_MEMBER(hyphen_debug_level, 0,
"Debug level for hyphenated words.",
87 getImage()->getCCUtil()->params()),
88 INT_MEMBER(max_viterbi_list_size, 10,
"Maximum size of viterbi list.",
89 getImage()->getCCUtil()->params()),
91 "Use only the first UTF8 step of the given string"
92 " when computing log probabilities.",
93 getImage()->getCCUtil()->params()),
94 double_MEMBER(certainty_scale, 20.0,
"Certainty scaling factor",
95 getImage()->getCCUtil()->params()),
97 "Certainty threshold for non-dict words",
98 getImage()->getCCUtil()->params()),
100 "Reject certainty offset",
101 getImage()->getCCUtil()->params()),
103 "Size of dict word to be treated as non-dict word",
104 getImage()->getCCUtil()->params()),
105 double_MEMBER(stopper_certainty_per_char, -0.50,
"Certainty to add"
106 " for each dict char above small word size.",
107 getImage()->getCCUtil()->params()),
109 "Max certaintly variation allowed in a word (in sigma)",
110 getImage()->getCCUtil()->params()),
111 INT_MEMBER(stopper_debug_level, 0,
"Stopper debug level",
112 getImage()->getCCUtil()->params()),
114 "Make AcceptableChoice() always return false. Useful"
115 " when there is a need to explore all segmentations",
116 getImage()->getCCUtil()->params()),
118 "Gain factor for ambiguity threshold.",
119 getImage()->getCCUtil()->params()),
121 "Certainty offset for ambiguity threshold.",
122 getImage()->getCCUtil()->params()),
123 BOOL_MEMBER(save_raw_choices, false,
"Save all explored raw choices",
124 getImage()->getCCUtil()->params()),
125 INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
126 "Max words to keep in list",
127 getImage()->getCCUtil()->params()),
128 STRING_MEMBER(word_to_debug,
"",
"Word for which stopper debug"
129 " information should be printed to stdout",
130 getImage()->getCCUtil()->params()),
132 "Lengths of unichars in word_to_debug",
133 getImage()->getCCUtil()->params()),
135 getImage()->getCCUtil()->params()),
137 getImage()->getCCUtil()->params()),
139 getImage()->getCCUtil()->params()),
140 double_MEMBER(bestrate_pruning_factor, 2.0,
"Multiplying factor of"
141 " current best rate to prune other hypotheses",
142 getImage()->getCCUtil()->params()),
144 "Turn on word script consistency permuter",
145 getImage()->getCCUtil()->params()),
147 "incorporate segmentation cost in word rating?",
148 getImage()->getCCUtil()->params()),
150 "Don't use any alphabetic-specific tricks."
151 "Set to true in the traineddata config file for"
152 " scripts that are cursive or inherently fixed-pitch",
153 getImage()->getCCUtil()->params()),
155 "Score multipler for script consistency within a word. "
156 "Being a 'reward' factor, it should be <= 1. "
157 "Smaller value implies bigger reward.",
158 getImage()->getCCUtil()->params()),
160 "Turn on fixed-length phrasebook search permuter",
161 getImage()->getCCUtil()->params()),
163 "Turn on character type (property) consistency permuter",
164 getImage()->getCCUtil()->params()),
166 "Score multipler for char type consistency within a word. ",
167 getImage()->getCCUtil()->params()),
169 "Score multipler for ngram permuter's best choice"
170 " (only used in the Han script path).",
171 getImage()->getCCUtil()->params()),
172 BOOL_MEMBER(save_doc_words, 0,
"Save Document Words",
173 getImage()->getCCUtil()->params()),
174 BOOL_MEMBER(doc_dict_enable, 1,
"Enable Document Dictionary ",
175 getImage()->getCCUtil()->params()),
177 "Worst certainty for using pending dictionary",
178 getImage()->getCCUtil()->params()),
180 "Worst certainty for words that can be inserted into the"
181 "document dictionary", getImage()->getCCUtil()->params()),
183 "Activate character-level n-gram-based permuter",
184 getImage()->getCCUtil()->params()),
186 " character choices to consider during permutation."
187 " This limit is especially useful when user patterns"
188 " are specified, since overly generic patterns can result in"
189 " dawg search exploring an overly large number of options.",
190 getImage()->getCCUtil()->params()),
192 getImage()->getCCUtil()->params()) {
193 dang_ambigs_table_ =
NULL;
194 replace_ambigs_table_ =
NULL;
195 keep_word_choices_ =
false;
196 reject_offset_ = 0.0;
197 best_raw_choice_ =
NULL;
202 last_word_on_line_ =
false;
203 hyphen_unichar_id_ = INVALID_UNICHAR_ID;
204 document_words_ =
NULL;
205 pending_words_ =
NULL;
209 max_fixed_length_dawgs_wdlen_ = -1;
210 wordseg_rating_adjust_factor_ = -1.0f;
211 output_ambig_words_file_ =
NULL;
215 if (hyphen_word_ !=
NULL)
delete hyphen_word_;
216 if (output_ambig_words_file_ !=
NULL) fclose(output_ambig_words_file_);
238 dawgs_ += punc_dawg_;
261 dawgs_ += freq_dawg_;
268 dawgs_ += unambig_dawg_;
302 dawgs_ += document_words_;
315 &dawgs_, &max_fixed_length_dawgs_wdlen_);
322 for (
int i = 0; i < dawgs_.
length(); ++i) {
323 const Dawg *dawg = dawgs_[i];
325 for (
int j = 0; j < dawgs_.
length(); ++j) {
326 const Dawg *other = dawgs_[j];
327 if (dawg !=
NULL && other !=
NULL &&
329 kDawgSuccessors[dawg->
type()][other->
type()]) *lst += j;
343 document_words_ =
NULL;
344 max_fixed_length_dawgs_wdlen_ = -1;
345 if (pending_words_ !=
NULL) {
346 delete pending_words_;
347 pending_words_ =
NULL;
358 for (
int i = 0; unichar_strings[i] != 0; i++) {
360 if (unichar_id != INVALID_UNICHAR_ID) {
369 for (
int i = 0; i < equivalent_symbols_.
size(); i++) {
370 if (equivalent_symbols_[i].contains(unichar_id)) {
371 return equivalent_symbols_[i][0];
382 bool word_end)
const {
386 tprintf(
"def_letter_is_okay: current unichar=%s word_end=%d"
387 " num active dawgs=%d num constraints=%d\n",
397 unichar_id == INVALID_UNICHAR_ID) {
403 PermuterType curr_perm = NO_PERM;
425 dawg_args, &curr_perm);
441 dawg->
edge_char_of(node, dawg_unichar_id, word_end) : NO_EDGE;
448 if (edge != NO_EDGE) {
450 word_end, dawg->
type())) {
457 "Append current dawg to updated active dawgs: ");
471 if (edge != NO_EDGE) {
474 "Recording constraint: ");
485 if (info.
ref == NO_EDGE) {
493 if (edge == NO_EDGE &&
504 for (
int s = 0; s < slist.
length(); ++s) {
505 int sdawg_index = slist[s];
506 const Dawg *sdawg = dawgs_[sdawg_index];
509 for (
int c = 0; c < constraints.
length(); ++c) {
512 const DawgInfo &cinfo = constraints[c];
517 if (snode == 0) snode = NO_EDGE;
527 if (sedge != NO_EDGE &&
529 dawgs_[sdawg_index]->type())) {
531 tprintf(
"Letter found in the successor dawg %d\n", sdawg_index);
537 "Append successor to updated active dawgs: ");
547 if (dawg_args->
permuter == NO_PERM || curr_perm == NO_PERM ||
548 (curr_perm != PUNC_PERM && dawg_args->
permuter != COMPOUND_PERM)) {
557 PermuterType *curr_perm)
const {
562 unichar_id_patterns.
push_back(unichar_id);
564 &unichar_id_patterns);
565 for (
int i = 0; i < unichar_id_patterns.
size(); ++i) {
568 for (
int k = 0; k < 2; ++k) {
570 dawg->
edge_char_of(node, unichar_id_patterns[i], word_end)
572 if (edge != NO_EDGE) {
578 word_end, dawg->
type())) {
585 "Append current dawg to updated active dawgs: ");
593 PermuterType perm,
int debug_level,
598 dawg_vec_copy.
move(dawg_vec);
600 fread(&num_dawgs,
sizeof(
inT32), 1, file);
602 if (swap) num_dawgs =
reverse32(num_dawgs);
604 int max_word_length = 0;
608 for (i = 0; i < num_dawgs; ++i) {
609 fread(&word_length,
sizeof(
inT32), 1, file);
610 if (swap) word_length =
reverse32(word_length);
613 (*dawg_vec)[word_length] =
615 if (word_length > max_word_length) max_word_length = word_length;
617 *max_wdlen = max_word_length;
621 for (i = 0; i < dawg_vec_copy.
size(); ++i) {
628 int num_dawgs,
int debug_level, FILE *output_file) {
629 fwrite(&num_dawgs,
sizeof(
inT32), 1, output_file);
630 if (debug_level)
tprintf(
"Writing %d split length dawgs\n", num_dawgs);
631 for (
int i = 1; i < dawg_vec.
size(); ++i) {
632 if ((dawg_vec)[i] !=
NULL) {
633 fwrite(&i,
sizeof(
inT32), 1, output_file);
634 dawg_vec[i]->write_squished_dawg(output_file);
635 if (debug_level)
tprintf(
"Wrote Dawg with word length %d\n", i);
645 bool ambigs_mode)
const {
647 if (sought_word_length != kAnyWordLength) {
649 if (sought_word_length <= max_fixed_length_dawgs_wdlen_ &&
650 dawgs_[sought_word_length] !=
NULL) {
651 *active_dawgs +=
DawgInfo(sought_word_length, NO_EDGE);
654 *active_dawgs = hyphen_active_dawgs_;
656 for (i = 0; i < hyphen_active_dawgs_.
size(); ++i) {
658 hyphen_active_dawgs_[i].dawg_index,
659 hyphen_active_dawgs_[i].ref);
663 for (i = 0; i < dawgs_.
length(); ++i) {
664 if (dawgs_[i] !=
NULL && kBeginningDawgsType[(dawgs_[i])->type()] &&
666 *active_dawgs +=
DawgInfo(i, NO_EDGE);
679 *constraints = hyphen_constraints_;
681 for (
int i = 0; i < hyphen_constraints_.
size(); ++i) {
683 hyphen_constraints_[i].dawg_index,
684 hyphen_constraints_[i].ref);
697 if (hyphen_word_)
return;
701 int stringlen = best_choice.
length();
708 if (best_choice.
length() >= kDocDictMaxRepChars) {
709 int num_rep_chars = 1;
711 for (
int i = 1; i < best_choice.
length(); ++i) {
717 if (num_rep_chars == kDocDictMaxRepChars)
return;
740 strcat(filename,
".doc");
741 doc_word_file =
open_file (filename,
"a");
742 fprintf(doc_word_file,
"%s\n",
744 fclose(doc_word_file);
750 float *certainty_array,
753 float additional_adjust,
755 bool is_han = (char_choices !=
NULL &&
762 float adjust_factor = additional_adjust;
763 float new_rating = word->
rating();
765 tprintf(
"%sWord: %s %4.2f ", nonword ?
"Non-" :
"",
768 new_rating += kRatingPad;
770 if (case_is_ok && punc_is_ok) {
772 new_rating *= adjust_factor;
776 new_rating *= adjust_factor;
778 if (!case_is_ok)
tprintf(
", C");
779 if (!punc_is_ok)
tprintf(
", P");
787 new_rating *= adjust_factor;
791 new_rating *= adjust_factor;
796 new_rating *= adjust_factor;
800 new_rating -= kRatingPad;
802 if (debug)
tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
803 LogNewChoice(adjust_factor, certainty_array,
false, word,
813 word_ptr = &temp_word;
815 if (word_ptr->
length() == 0)
return NO_PERM;
822 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
823 &(active_dawgs[1]), &(constraints[1]),
824 0.0, NO_PERM, kAnyWordLength, 0);
825 int last_index = word_ptr->
length() - 1;
829 i == last_index)))
break;
843 delete[] active_dawgs;
844 delete[] constraints;
851 if (bigram_dawg_ ==
NULL)
return false;
855 int w1start, w1end, w2start, w2end;
861 if (w1start >= w1end)
return word1.
length() < 3;
862 if (w2start >= w2end)
return word2.
length() < 3;
866 for (
int i = w1start; i < w1end; i++) {
870 bigram_string +=
" ";
871 for (
int i = w2start; i < w2end; i++) {
880 if (word.
length() == 0)
return NO_PERM;
883 int last_index = word.
length() - 1;
885 for (i = 0; i <= last_index; ++i) {
888 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
892 }
else if ((new_len = new_word.length()) == 0 ||
897 for (i = 0; i < dawgs_.
size(); ++i) {
898 if (dawgs_[i] !=
NULL &&
900 dawgs_[i]->word_in_dawg(new_word))
return true;
911 int *sid =
new int[max_script];
913 for (x = 0; x < max_script; x++) sid[x] = 0;
914 for (x = 0; x < char_choices.
length(); ++x) {
915 BLOB_CHOICE_IT blob_choice_it(char_choices.
get(x));
916 sid[blob_choice_it.data()->script_id()]++;
932 for (x = 1; x < max_script; x++)
933 if (sid[x] >= sid[max_sid]) max_sid = x;
934 if (sid[max_sid] < char_choices.
length() / 2)