Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
language_model.h
Go to the documentation of this file.
1 
2 // File: language_model.h
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help recognition.
5 // Author: Daria Antonova
6 // Created: Mon Nov 11 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
22 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
23 
24 #include "associate.h"
25 #include "dawg.h"
26 #include "dict.h"
27 #include "fontinfo.h"
28 #include "intproto.h"
29 #include "matrix.h"
30 #include "oldheap.h"
31 #include "params.h"
32 #include "pageres.h"
33 
34 namespace tesseract {
35 
36 // Used for expressing various language model flags.
37 typedef unsigned char LanguageModelFlagsType;
38 
39 // Struct for keeping track of the consistency of the path.
42  : punc_ref(NO_EDGE), num_punc(0), invalid_punc(false),
44  script_id(0), inconsistent_script(false),
45  num_alphas(0), num_digits(0), num_other(0),
47  inline int NumInconsistentPunc() const {
48  return invalid_punc ? num_punc : 0;
49  }
50  inline int NumInconsistentCase() const {
52  }
53  inline int NumInconsistentChartype() const {
54  return (NumInconsistentPunc() + num_other +
56  }
57  inline bool Consistent() const {
58  return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
60  }
61  inline int NumInconsistentSpaces() const {
63  }
64 
66  int num_punc;
69  int num_lower;
70  int script_id;
74  int num_other;
77 };
78 
79 
80 // The following structs are used for storing the state of the language model
81 // in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
82 // and the links are the relationships between the underlying blobs (see
83 // segsearch.h for a more detailed description).
84 // Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
85 // a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
86 // search leading up to and including this BLOB_CHOICE.
87 // Each ViterbiStateEntry contains information from various components of the
88 // language model: dawgs in which the path is found, character ngram model
89 // probability of the path, script/chartype/font consistency info, state for
90 // language-specific heuristics (e.g. hyphenated and compound words, lower/upper
91 // case preferences, etc).
92 // Each ViterbiStateEntry also contains the parent pointer, so that the path
93 // that it represents (WERD_CHOICE) can be constructed by following these
94 // parent pointers.
95 
96 // Struct for storing additional information used by Dawg language model
97 // component. It stores the set of active dawgs in which the sequence of
98 // letters on a path can be found and the constraints that have to be
99 // satisfied at the end of the word (e.g. beginning/ending punctuation).
102  PermuterType pt) : permuter(pt) {
103  active_dawgs = new DawgInfoVector(*a);
104  constraints = new DawgInfoVector(*c);
105  }
107  delete active_dawgs;
108  delete constraints;
109  }
112  PermuterType permuter;
113 };
114 
115 // Struct for storing additional information used by Ngram language model
116 // component.
118  LanguageModelNgramInfo(const char *c, int l, bool p, float np, float nc)
120  ngram_cost(nc) {}
121  STRING context; // context string
122  // Length of the context measured by advancing using UNICHAR::utf8_step()
123  // (should be at most the order of the character ngram model used).
125  // The paths with pruned set are pruned out from the perspective of the
126  // character ngram model. They are explored further because they represent
127  // a dictionary match or a top choice. Thus ngram_info is still computed
128  // for them in order to calculate the combined cost.
129  bool pruned;
130  // -ln(P_ngram_model(path))
131  float ngram_prob;
132  // -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
133  float ngram_cost;
134 };
135 
136 // Struct for storing the information about a path in the segmentation graph
137 // explored by Viterbi search.
138 struct ViterbiStateEntry : public ELIST_LINK {
140  BLOB_CHOICE *b, float c, float ol,
142  const AssociateStats &as,
145  : cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()),
146  min_certainty(b->certainty()), adapted(b->adapted()), length(1),
148  top_choice_flags(tcf), dawg_info(d), ngram_info(n), updated(true) {
149  if (pe != NULL) {
150  ratings_sum += pe->ratings_sum;
151  if (pe->min_certainty < min_certainty) {
153  }
154  adapted += pe->adapted;
155  length += pe->length;
157  }
158  }
160  delete dawg_info;
161  delete ngram_info;
162  }
163  // Comparator function for sorting ViterbiStateEntry_LISTs in
164  // non-increasing order of costs.
165  static int Compare(const void *e1, const void *e2) {
166  const ViterbiStateEntry *ve1 =
167  *reinterpret_cast<const ViterbiStateEntry * const *>(e1);
168  const ViterbiStateEntry *ve2 =
169  *reinterpret_cast<const ViterbiStateEntry * const *>(e2);
170  return (ve1->cost < ve2->cost) ? -1 : 1;
171  }
172  inline bool Consistent() const {
174  return true;
175  }
176  return consistency_info.Consistent();
177  }
178 
179  // The cost is an adjusted ratings sum, that is adjusted by all the language
180  // model components that use Viterbi search.
181  float cost;
182 
183  // Pointers to parent BLOB_CHOICE and ViterbiStateEntry (not owned by this).
186 
187  // Various information about the characters on the path represented
188  // by this ViterbiStateEntry.
189  float ratings_sum; // sum of ratings of character on the path
190  float min_certainty; // minimum certainty on the path
191  int adapted; // number of BLOB_CHOICES from adapted templates
192  int length; // number of characters on the path
193  float outline_length; // length of the outline so far
195  AssociateStats associate_stats; // character widths/gaps/seams
196 
197  // Flags for marking the entry as a top choice path with
198  // the smallest rating or lower/upper case letters).
200 
201  // Extra information maintained by Dawg laguage model component
202  // (owned by ViterbiStateEntry).
204 
205  // Extra information maintained by Ngram laguage model component
206  // (owned by ViterbiStateEntry).
208 
209  bool updated; // set to true if the entry has just been created/updated
210 };
211 
213 
214 // Struct to store information maintained by various language model components.
216  LanguageModelState(int col, int row) : contained_in_col(col),
221 
222  // Ratings matrix cell that holds this LanguageModelState
223  // (needed to construct best STATE for rebuild_current_state()
224  // and best BLOB_CHOICE_LIST_VECTOR for AcceptableChoice()).
227 
228  // Storage for the Viterbi state.
229  ViterbiStateEntry_LIST viterbi_state_entries;
230  // Number and max cost of prunable paths in viterbi_state_entries.
232  // Total number of entries in viterbi_state_entries.
235 
236  // TODO(daria): add font consistency checking.
237 };
238 
239 // Bundle together all the things pertaining to the best choice/state.
243  : best_state(s), best_choice(bc), raw_choice(rc),
244  best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {}
245 
250  bool updated;
252  ViterbiStateEntry *best_vse; // best ViterbiStateEntry and BLOB_CHOICE
253  BLOB_CHOICE *best_b; // at the end of the best choice path
254 };
255 
257  float avg_cost;
260 };
261 
262 // This class that contains the data structures and functions necessary
263 // to represent and use the knowledge about the language.
265  public:
266  // Adjustments to pain point priority.
271 
272  // Denominator for normalizing per-letter ngram cost when deriving
273  // penalty adjustments.
274  static const float kMaxAvgNgramCost;
275  // Minimum word length for fixed length dawgs.
276  // TODO(daria): check in the new chi/jpn.traineddata without the
277  // fixed length dawg of length 1 and delete this variable.
278  static const int kMinFixedLengthDawgLength;
279  // If there is a significant drop in character ngram probability or a
280  // dangerous ambiguity make the thresholds on what blob combinations
281  // can be classified looser.
282  static const float kLooseMaxCharWhRatio;
283 
284  // Masks for interpreting which language model components
285  // were changed by the call to UpdateState().
290  static const LanguageModelFlagsType kDawgFlag = 0x10;
291  static const LanguageModelFlagsType kNgramFlag = 0x20;
294 
295  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
296  ~LanguageModel();
297 
298  // Updates data structures that are used for the duration of the segmentation
299  // search on the current word;
300  void InitForWord(const WERD_CHOICE *prev_word,
301  bool fixed_pitch, float best_choice_cert,
302  float max_char_wh_ratio, float rating_cert_scale,
303  HEAP *pain_points, CHUNKS_RECORD *chunks_record,
304  BlamerBundle *blamer_bundle, bool debug_blamer);
305  // Resets all the "updated" flags used by the Viterbi search that were
306  // "registered" during the update of the ratings matrix.
307  void CleanUp();
308  // Deletes and sets to NULL language model states of each of the
309  // BLOB_CHOICEs in the given BLOB_CHOICE_LIST.
310  void DeleteState(BLOB_CHOICE_LIST *choices);
311 
312  // Updates language model state of the given BLOB_CHOICE_LIST (from
313  // the ratings matrix) a its parent. Updates pain_points if new
314  // problematic points are found in the segmentation graph.
315  //
316  // At most language_model_viterbi_list_size are kept in each
317  // LanguageModelState.viterbi_state_entries list.
318  // At most language_model_viterbi_list_max_num_prunable of those are prunable
319  // (non-dictionary) paths.
320  // The entries that represent dictionary word paths are kept at the front
321  // of the list.
322  // The list ordered by cost that is computed collectively by several
323  // language model components (currently dawg and ngram components).
324  //
325  // best_path_by_column records the lowest cost path found so far for each
326  // column of the chunks_record->ratings matrix over all the rows. This
327  // array is updated if a lower cost ViterbiStateEntry is created in curr_col.
329  LanguageModelFlagsType changed,
330  int curr_col, int curr_row,
331  BLOB_CHOICE_LIST *curr_list,
332  BLOB_CHOICE_LIST *parent_list,
333  HEAP *pain_points,
334  BestPathByColumn *best_path_by_column[],
335  CHUNKS_RECORD *chunks_record,
336  BestChoiceBundle *best_choice_bundle,
337  BlamerBundle *blamer_bundle);
338 
339  // Generates pain points from the problematic top choice paths when the
340  // segmentation search is guided by the character ngram model.
341  // It is necessary to consider problematic the top choice paths instead of
342  // the problematic lowest cost paths because the character ngram model
343  // might assign a very high cost to very improbably paths. For example,
344  // "liot" might have a much lower cost than "llot", and the character ngram
345  // model might detect a dip in probability for p(t|lio) at the end of the
346  // word, but not at the beginning (p(i|l) would be ok). However, looking at
347  // the dips in character ngram probability of the top choices we would be
348  // able to stop the problematic points (p(l| l) would be low).
349  void GenerateNgramModelPainPointsFromColumn(int col, int row,
350  HEAP *pain_points,
351  CHUNKS_RECORD *chunks_record);
352 
353  // Generates pain points from the problematic lowest cost paths that are
354  // "promising" (i.e. would have the cost lower than the one recorded in
355  // best_path_by_column if the problematic ending of the path is removed
356  // and after being combined with another blob the certainty of the last
357  // blob is improved).
359  int col, int row, float best_choice_cert,
360  HEAP *pain_points, BestPathByColumn *best_path_by_column[],
361  CHUNKS_RECORD *chunks_record);
362 
363  // This function can be called after processing column col of the
364  // chunks_record->ratings matrix in order to find the promising paths
365  // that were terminated or made inconsistent by the character choices
366  // in column col. If such paths are identified, this function generates
367  // pain points to combine the problematic cells of the matrix.
369  int col,
370  const GenericVector<int> &non_empty_rows,
371  float best_choice_cert,
372  HEAP *pain_points,
373  BestPathByColumn *best_path_by_column[],
374  CHUNKS_RECORD *chunks_record);
375 
376  // Generates a pain point for each problematic point on the best choice
377  // path. Such problematic points could be a termination of a dicionary
378  // word, dip in ngram probability, invalid punctuation, inconsistent
379  // case/chartype/script or punctuation in the middle of a word.
381  HEAP *pain_points,
382  CHUNKS_RECORD *chunks_record,
383  BestChoiceBundle *best_choice_bundle);
384 
385  // Adds a pain point to the given pain_points queue that will cause
386  // the entry at chunks_record->ratings(col, row) to be classified.
387  // The priority of the pain point is set to be:
388  //
389  // priority_adjustment * sqrt(avg_parent_cost)
390  // ----------------------------------------------------
391  // sqrt(dict_parent_path_length) * |worst_piece_cert|
392  //
393  // The priority is further lowered if fragmented is true.
394 // Returns true if a new pain point was added to pain_points.
395  bool GeneratePainPoint(int col, int row, bool ok_to_extend,
396  float priority_adjustment,
397  float worst_piece_cert,
398  bool fragmented,
399  float best_choice_cert,
400  float max_char_wh_ratio,
401  BLOB_CHOICE *parent_b,
402  ViterbiStateEntry *parent_vse,
403  CHUNKS_RECORD *chunks_record,
404  HEAP *pain_points);
405 
406  // Returns true if an acceptable best choice was discovered.
408 
409  // Fills cert with the worst certainty of the top non-fragmented choice
410  // of the left and right neighbor of the given col,row.
411  // Sets fragmented if any of the neighbors have a fragmented character
412  // as the top choice.
413  inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings,
414  float *cert, bool *fragmented) {
415  *cert = 0.0f;
416  *fragmented = false;
417  if (row > 0) {
418  GetPieceCertainty(ratings->get(col, row-1), cert, fragmented);
419  }
420  if (col+1 < ratings->dimension()) {
421  GetPieceCertainty(ratings->get(col+1, row), cert, fragmented);
422  }
423  ASSERT_HOST(*cert < 0.0f);
424  }
425 
426  // Returns outline length of the given blob is computed as:
427  // rating_cert_scale * rating / certainty
428  // Since from Wordrec::SegSearch() in segsearch.cpp
429  // rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale
430  // And from Classify::ConvertMatchesToChoices() in adaptmatch.cpp
431  // Rating = Certainty = next.rating
432  // Rating *= rating_scale * Results->BlobLength
433  // Certainty *= -(getDict().certainty_scale)
435  return rating_cert_scale_ * b->rating() / b->certainty();
436  }
437 
438  protected:
439 
440  inline float CertaintyScore(float cert) {
442  // cert is assumed to be between 0 and -dict_->certainty_scale.
443  // If you enable language_model_use_sigmoidal_certainty, you
444  // need to adjust language_model_ngram_nonmatch_score as well.
445  cert = -cert / dict_->certainty_scale;
446  return 1.0f / (1.0f + exp(10.0f * cert));
447  } else {
448  return (-1.0f / cert);
449  }
450  }
451 
452  inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension,
453  UNICHAR_ID unichar_id) {
454  return (!dict_->getUnicharset().get_isalpha(unichar_id) &&
455  !dict_->getUnicharset().get_isdigit(unichar_id) &&
456  col > 0 && row+1 < dimension);
457  }
458 
459  inline bool IsFragment(BLOB_CHOICE *b) {
460  return dict_->getUnicharset().get_fragment(b->unichar_id());
461  }
462 
463  inline bool IsHan(int script_id) {
464  return ((dict_->getUnicharset().han_sid() !=
465  dict_->getUnicharset().null_sid()) &&
466  (script_id == dict_->getUnicharset().han_sid()));
467  }
468 
469  // Finds the first non-fragmented character in the given BLOB_CHOICE_LIST
470  // and updates cert if its certainty is less than the one recorded in cert.
471  // Sets fragmented if the first choice in BLOB_CHOICE_LIST is a fragment.
472  inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist,
473  float *cert, bool *fragmented) {
474  if (blist == NOT_CLASSIFIED || blist->empty()) return;
475  BLOB_CHOICE_IT bit(blist);
476  while (!bit.at_last() && IsFragment(bit.data())) {
477  *fragmented = true;
478  bit.forward(); // skip fragments
479  }
480  // Each classification must have at least one non-fragmented choice.
481  ASSERT_HOST(!IsFragment(bit.data()));
482  if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty();
483  }
484 
485  inline float ComputeAdjustment(int num_problems, float penalty) {
486  if (num_problems == 0) return 0.0f;
487  if (num_problems == 1) return penalty;
488  return (penalty + (language_model_penalty_increment *
489  static_cast<float>(num_problems-1)));
490  }
491 
492  // Computes the adjustment to the ratings sum based on the given
493  // consistency_info. The paths with invalid punctuation, inconsistent
494  // case and character type are penalized proportionally to the number
495  // of inconsistencies on the path.
497  const LanguageModelDawgInfo *dawg_info,
498  const LanguageModelConsistencyInfo &consistency_info) {
499  if (dawg_info != NULL) {
500  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
502  }
503  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
505  ComputeAdjustment(consistency_info.NumInconsistentCase(),
507  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
509  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
511  (consistency_info.inconsistent_script ?
513  (consistency_info.inconsistent_font ?
515  }
516 
517  // Returns an adjusted ratings sum that includes inconsistency penalties.
519  float ratings_sum,
520  const LanguageModelDawgInfo *dawg_info,
521  const LanguageModelConsistencyInfo &consistency_info) {
522  return (ratings_sum * (1.0f + ComputeConsistencyAdjustment(
523  dawg_info, consistency_info)));
524  }
525 
526  // Returns an adjusted ratings sum that includes inconsistency penalties,
527  // penalties for non-dictionary paths and paths with dips in ngram
528  // probability.
530  float ratings_sum, int length, float dawg_score,
531  const LanguageModelDawgInfo *dawg_info,
532  const LanguageModelNgramInfo *ngram_info,
533  const LanguageModelConsistencyInfo &consistency_info,
534  const AssociateStats &associate_stats,
535  ViterbiStateEntry *parent_vse);
536 
537  // Returns true if the given ViterbiStateEntry represents a problematic
538  // path. A path is considered problematic if the last unichar makes it
539  // inconsistent, introduces a dip in ngram probability or transforms a
540  // dictionary path into a non-dictionary one.
541  bool ProblematicPath(const ViterbiStateEntry &vse,
542  UNICHAR_ID unichar_id, bool word_end);
543 
544  // Finds the first lower and upper case character in curr_list.
545  // If none found, chooses the first character in the list.
547  BLOB_CHOICE_LIST *curr_list,
548  BLOB_CHOICE **first_lower,
549  BLOB_CHOICE **first_upper);
550 
551  // Helper function that computes the cost of the path composed of the
552  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
553  // Adds a new ViterbiStateEntry to the list of viterbi entries
554  // in the given BLOB_CHOICE if the new path looks good enough.
555  // Returns LanguageModelFlagsType that indicates which language
556  // model components were involved in creating the new entry.
558  LanguageModelFlagsType top_choice_flags,
559  float denom,
560  bool word_end,
561  int curr_col, int curr_row,
562  BLOB_CHOICE *b,
563  BLOB_CHOICE *parent_b,
564  ViterbiStateEntry *parent_vse,
565  HEAP *pain_points,
566  BestPathByColumn *best_path_by_column[],
567  CHUNKS_RECORD *chunks_record,
568  BestChoiceBundle *best_choice_bundle,
569  BlamerBundle *blamer_bundle);
570 
571  // Pretty print information in the given ViterbiStateEntry.
572  void PrintViterbiStateEntry(const char *msg,
573  ViterbiStateEntry *vse,
574  BLOB_CHOICE *b,
575  CHUNKS_RECORD *chunks_record);
576 
577  // Determines whether a potential entry is a true top choice and
578  // updates changed accordingly.
579  //
580  // Note: The function assumes that b, top_choice_flags and changed
581  // are not NULL.
583  float ratings_sum,
584  const LanguageModelDawgInfo *dawg_info,
585  const LanguageModelConsistencyInfo &consistency_info,
586  const ViterbiStateEntry *parent_vse,
587  BLOB_CHOICE *b,
588  LanguageModelFlagsType *top_choice_flags,
589  LanguageModelFlagsType *changed);
590 
591  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
592  // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
593  // with updated active dawgs, constraints and permuter.
594  //
595  // Note: the caller is responsible for deleting the returned pointer.
596  LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id,
597  int curr_col, int curr_row,
598  const BLOB_CHOICE &b,
599  const ViterbiStateEntry *parent_vse,
600  LanguageModelFlagsType *changed);
601 
602  // Computes p(unichar | parent context) and records it in ngram_cost.
603  // If b.unichar_id() is an unlikely continuation of the parent context
604  // sets found_small_prob to true and returns NULL.
605  // Otherwise creates a new LanguageModelNgramInfo entry containing the
606  // updated context (that includes b.unichar_id() at the end) and returns it.
607  //
608  // Note: the caller is responsible for deleting the returned pointer.
609  LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar,
610  float certainty, float denom,
611  int curr_col, int curr_row,
612  const ViterbiStateEntry *parent_vse,
613  BLOB_CHOICE *parent_b,
614  LanguageModelFlagsType *changed);
615 
616  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
617  // for the given unichar in the given context. If there are multiple
618  // unichars at one position - takes the average of their probabilities.
619  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
620  // since probability_in_context() can only handle one at a time (while
621  // unicharset might contain ngrams and glyphs composed from multiple UTF8
622  // characters).
623  float ComputeNgramCost(const char *unichar, float certainty, float denom,
624  const char *context, int *unichar_step_len,
625  bool *found_small_prob, float *ngram_prob);
626 
627  // Computes the normalization factors for the classifier confidences
628  // (used by ComputeNgramCost()).
629  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
630 
631  // Fills the given consistenty_info based on parent_vse.consistency_info
632  // and on the consistency of the given unichar_id with parent_vse.
633  void FillConsistencyInfo(
634  int curr_col, bool word_end, BLOB_CHOICE *b,
635  ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b,
636  CHUNKS_RECORD *chunks_record,
637  LanguageModelConsistencyInfo *consistency_info);
638 
639  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
640  // on the path represented by the given BLOB_CHOICE and language model
641  // state entries (lmse, dse). The path is re-constructed by following
642 // the parent pointers in the lang model state entries). If the
643  // constructed WERD_CHOICE is better than the best/raw choice recorded
644  // in the best_choice_bundle, this function updates the corresponding
645 // fields and sets best_choice_bundle->updated to true.
647  ViterbiStateEntry *vse,
648  HEAP *pain_points,
649  CHUNKS_RECORD *chunks_record,
650  BestChoiceBundle *best_choice_bundle,
651  BlamerBundle *blamer_bundle);
652 
653  // Fills the given floats array with raw features extracted from the
654  // path represented by the given ViterbiStateEntry.
655  // See ccstruct/params_training_featdef.h for feature information.
657  float *features);
658 
659  // Constructs a WERD_CHOICE by tracing parent pointers starting with
660  // the given LanguageModelStateEntry. Returns the constructed word.
661  // Updates best_char_choices, certainties and state if they are not
662  // NULL (best_char_choices and certainties are assumed to have the
663  // length equal to lmse->length).
664  // The caller is resposible for freeing memory associated with the
665  // returned WERD_CHOICE.
667  ViterbiStateEntry *vse,
668  CHUNKS_RECORD *chunks_record,
669  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
670  float certainties[],
671  float *dawg_score,
672  STATE *state,
673  BlamerBundle *blamer_bundle,
674  bool *truth_path);
675 
676  // This function is used for non-space delimited languages when looking
677  // for word endings recorded while trying to separate the path into words.
678  //
679  // The function increments covered if a valid word ending is found in
680  // active_dawgs (if covered is incremented, skip is set to the number
681  // of unichars that should be skipped because they are covered by the
682  // word whose ending was just discovered).
683  //
684  // dawg_score and dawg_score_done are updated if:
685  // -- at the end of the path we discover a valid word ending from a
686  // non-fixed length dawg (this means that the whole word is a
687  // valid word, so dawg_score is set to 1.0f
688  // -- word_start is true (dawg_score is set to covered / word length)
689  //
690  // Note: this function assumes that skip, covered, dawg_score and
691  // dawg_score_done are not NULL.
692  void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs,
693  int word_index, int word_length,
694  int *skip, int *covered,
695  float *dawg_score,
696  bool *dawg_score_done);
697 
698  // Wrapper around AssociateUtils::ComputeStats().
699  inline void ComputeAssociateStats(int col, int row,
700  float max_char_wh_ratio,
701  ViterbiStateEntry *parent_vse,
702  CHUNKS_RECORD *chunks_record,
703  AssociateStats *associate_stats) {
705  col, row,
706  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
707  (parent_vse != NULL) ? parent_vse->length : 0,
708  fixed_pitch_, max_char_wh_ratio,
709  chunks_record->word_res != NULL ? &chunks_record->word_res->denorm : NULL,
710  chunks_record, language_model_debug_level, associate_stats);
711  }
712 
713  // Returns true if the path with such top_choice_flags and dawg_info
714  // could be pruned out (i.e. is neither a system/user/frequent dictionary
715  // nor a top choice path).
716  // In non-space delimited languages all paths can be "somewhat" dictionary
717  // words. In such languages we can not do dictionary-driven path prunning,
718  // so paths with non-empty dawg_info are considered prunable.
719  inline bool PrunablePath(LanguageModelFlagsType top_choice_flags,
720  const LanguageModelDawgInfo *dawg_info) {
721  if (top_choice_flags) return false;
722  if (dawg_info != NULL &&
723  (dawg_info->permuter == SYSTEM_DAWG_PERM ||
724  dawg_info->permuter == USER_DAWG_PERM ||
725  dawg_info->permuter == FREQ_DAWG_PERM) &&
726  dict_->GetMaxFixedLengthDawgIndex() < 0) return false;
727  return true;
728  }
729 
730  // Returns true if the given ViterbiStateEntry represents an acceptable path.
731  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
732  return (vse.dawg_info != NULL || vse.Consistent() ||
733  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
734  }
735 
736  public:
737  // Parameters.
738  INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
740  "Turn on/off the use of character ngram model");
742  "Maximum order of the character ngram model");
744  "Maximum number of prunable (those for which PrunablePath() is true)"
745  "entries in each viterbi list recorded in BLOB_CHOICEs");
747  "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
749  "To avoid overly small denominators use this as the floor"
750  " of the probability returned by the ngram model");
752  "Average classifier score of a non-matching unichar");
754  "Use only the first UTF8 step of the given string"
755  " when computing log probabilities");
757  "Strength of the character ngram model relative to the"
758  " character classifier ");
760  "Words are delimited by space");
761 
763  "Minimum length of compound words");
765  "Depth of blob choice lists to explore"
766  " when fixed length dawgs are on");
767  // Penalties used for adjusting path costs and final word rating.
769  "Penalty for words not in the frequent word dictionary");
771  "Penalty for non-dictionary words");
773  "Penalty for inconsistent punctuation");
775  "Penalty for inconsistent case");
777  "Penalty for inconsistent script");
779  "Penalty for inconsistent character type");
781  "Penalty for inconsistent font");
783  "Penalty for inconsistent spacing");
784  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
786  "Use sigmoidal score for certainty");
787 
788  protected:
789  // Member Variables.
790 
791  // Temporary DawgArgs struct that is re-used across different words to
792  // avoid dynamic memory re-allocation (should be cleared before each use).
794  // List of pointers to updated flags used by Viterbi search to mark
795  // recently updated ViterbiStateEntries.
797  // Scaling for recovering blob outline length from rating and certainty.
799 
800  // The following variables are set at construction time.
801 
802  // Pointer to fontinfo table (not owned by LanguageModel).
804 
805  // Pointer to Dict class, that is used for querying the dictionaries
806  // (the pointer is not owned by LanguageModel).
808 
809  // TODO(daria): the following variables should become LanguageModel params
810  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
811  //
812  // Set to true if we are dealing with fixed pitch text
813  // (set to assume_fixed_pitch_char_segment).
815  // Max char width-to-height ratio allowed
816  // (set to segsearch_max_char_wh_ratio).
818 
819  // The following variables are initialized with InitForWord().
820 
821  // String representation of the classification of the previous word
822  // (since this is only used by the character ngram model component,
823  // only the last language_model_ngram_order of the word are stored).
826  // Active dawg and constraints vector.
831  // Maximum adjustment factor for character ngram choices.
833  // Set to true if acceptable choice was discovered.
834  // Note: it would be nice to use this to terminate the search once an
835  // acceptable choices is found. However we do not do that and once an
836  // acceptable choice is found we finish looking for alternative choices
837  // in the current segmentation graph and then exit the search (no more
838  // classifications are done after an acceptable choice is found).
839  // This is needed in order to let the search find the words very close to
840  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
841  // choices. This way the stopper will know that the best choice is not
842  // ambiguous (i.e. there are best choices in the best choice list that have
843  // ratings close to the very best one) and will be less likely to mis-adapt.
845  // Set to true if a choice representing correct segmentation was explored.
847 
848 };
849 
850 } // namespace tesseract
851 
852 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_