Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: Main program for merge of tess and editor.
4  * Author: Ray Smith
5  * Created: Tue Jan 07 15:21:46 GMT 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 //#include <osfcn.h>
22 //#include <signal.h>
23 //#include <time.h>
24 //#include <unistd.h>
25 #include "tfacep.h" //must be before main.h
26 //#include "fileerr.h"
27 #include "stderr.h"
28 #include "basedir.h"
29 #include "tessvars.h"
30 //#include "debgwin.h"
31 //#include "epapdest.h"
32 #include "control.h"
33 #include "imgs.h"
34 #include "reject.h"
35 #include "pageres.h"
36 //#include "gpapdest.h"
37 #include "nwmain.h"
38 #include "pgedit.h"
39 #include "tprintf.h"
40 //#include "ipeerr.h"
41 //#include "restart.h"
42 #include "tessedit.h"
43 //#include "fontfind.h"
44 #include "permute.h"
45 #include "stopper.h"
46 #include "intmatcher.h"
47 #include "chop.h"
48 #include "efio.h"
49 #include "danerror.h"
50 #include "globals.h"
51 #include "tesseractclass.h"
52 #include "params.h"
53 
54 #include "notdll.h" //phils nn stuff
55 
56 #define VARDIR "configs/" /*variables files */
57  //config under api
58 #define API_CONFIG "configs/api_config"
59 
60 ETEXT_DESC *global_monitor = NULL; // progress monitor
61 
62 namespace tesseract {
63 
64 // Read a "config" file containing a set of variable, value pairs.
65 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
66 // and also accepts a relative or absolute path name.
68  SetParamConstraint constraint) {
69  STRING path = datadir;
70  path += "configs/";
71  path += filename;
72  FILE* fp;
73  if ((fp = fopen(path.string(), "rb")) != NULL) {
74  fclose(fp);
75  } else {
76  path = datadir;
77  path += "tessconfigs/";
78  path += filename;
79  if ((fp = fopen(path.string(), "rb")) != NULL) {
80  fclose(fp);
81  } else {
82  path = filename;
83  }
84  }
85  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
86 }
87 
88 // Returns false if a unicharset file for the specified language was not found
89 // or was invalid.
90 // This function initializes TessdataManager. After TessdataManager is
91 // no longer needed, TessdataManager::End() should be called.
92 //
93 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
94 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
95 // from the language-specific config file (stored in [lang].traineddata), from
96 // the config files specified on the command line or left as the default
97 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
99  const char *arg0, const char *textbase, const char *language,
100  OcrEngineMode oem, char **configs, int configs_size,
101  const GenericVector<STRING> *vars_vec,
102  const GenericVector<STRING> *vars_values,
103  bool set_only_non_debug_params) {
104  // Set the basename, compute the data directory.
105  main_setup(arg0, textbase);
106 
107  // Set the language data path prefix
108  lang = language != NULL ? language : "eng";
112 
113  // Initialize TessdataManager.
114  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
115  if (!tessdata_manager.Init(tessdata_path.string(),
117  return false;
118  }
119 
120  // If a language specific config file (lang.config) exists, load it in.
127  tprintf("Loaded language config file\n");
128  }
129  }
130 
131  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
133  // Load tesseract variables from config files. This is done after loading
134  // language-specific variables from [lang].traineddata file, so that custom
135  // config files can override values in [lang].traineddata file.
136  for (int i = 0; i < configs_size; ++i) {
137  read_config_file(configs[i], set_params_constraint);
138  }
139 
140  // Set params specified in vars_vec (done after setting params from config
141  // files, so that params in vars_vec can override those from files).
142  if (vars_vec != NULL && vars_values != NULL) {
143  for (int i = 0; i < vars_vec->size(); ++i) {
144  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
145  (*vars_values)[i].string(),
146  set_params_constraint, this->params())) {
147  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
148  exit(1);
149  }
150  }
151  }
152 
153  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
154  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
155  if (params_file != NULL) {
156  ParamUtils::PrintParams(params_file, this->params());
157  fclose(params_file);
159  tprintf("Wrote parameters to %s\n",
160  tessedit_write_params_to_file.string());
161  }
162  } else {
163  tprintf("Failed to open %s for writing params.\n",
164  tessedit_write_params_to_file.string());
165  }
166  }
167 
168  // Determine which ocr engine(s) should be loaded and used for recognition.
169  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
171  tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
172  static_cast<int>(tessedit_ocr_engine_mode));
173  }
174 
175  // If we are only loading the config file (and so not planning on doing any
176  // recognition) then there's nothing else to do here.
179  tprintf("Returning after loading config file\n");
180  }
181  return true;
182  }
183 
184  // Load the unicharset
187  return false;
188  }
189  if (unicharset.size() > MAX_NUM_CLASSES) {
190  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
191  return false;
192  }
193  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
194  right_to_left_ = unicharset.major_right_to_left();
195 
202  if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
203  }
204 
205  // Load Cube objects if necessary.
209  tprintf("Loaded Cube w/out combiner\n");
213  tprintf("Loaded Cube with combiner\n");
214  }
215 
216  return true;
217 }
218 
219 // Helper returns true if the given string is in the vector of strings.
220 static bool IsStrInList(const STRING& str,
221  const GenericVector<STRING>& str_list) {
222  for (int i = 0; i < str_list.size(); ++i) {
223  if (str_list[i] == str)
224  return true;
225  }
226  return false;
227 }
228 
229 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
230 // Langs with no prefix get appended to to_load, provided they
231 // are not in there already.
232 // Langs with ~ prefix get appended to not_to_load, provided they are not in
233 // there already.
// lang_str is the language string to parse.
// to_load receives the lang codes that carry no prefix.
// not_to_load receives the lang codes that carry a ~ prefix.
// Both vectors are only appended to; existing entries are left in place.
234 void Tesseract::ParseLanguageString(const char* lang_str,
235  GenericVector<STRING>* to_load,
236  GenericVector<STRING>* not_to_load) {
 // remains holds the part of lang_str that has not been parsed yet. Each
 // pass of the loop consumes one lang code from its front.
237  STRING remains(lang_str);
238  while (remains.length() > 0) {
239  // Find the start of the lang code and which vector to add to.
240  const char* start = remains.string();
 // Skip any '+' separators in front of the lang code.
241  while (*start == '+')
242  ++start;
243  GenericVector<STRING>* target = to_load;
244  if (*start == '~') {
245  target = not_to_load;
246  ++start;
247  }
248  // Find the index of the end of the lang code in string start.
 // The code runs to the next '+' if there is one, otherwise to the end of
 // the string.
249  int end = strlen(start);
250  const char* plus = strchr(start, '+');
251  if (plus != NULL && plus - start < end)
252  end = plus - start;
253  STRING lang_code(start);
254  lang_code.truncate_at(end);
 // Everything from the terminating '+' onwards is left for the next pass.
255  STRING next(start + end);
256  remains = next;
257  // Check whether lang_code is already in the target vector and add.
 // NOTE(review): when only separators or a lone '~' remain (e.g. a trailing
 // '+'), lang_code appears to be empty here and is still appended - confirm
 // whether an empty lang code is intended.
258  if (!IsStrInList(lang_code, *target)) {
 // NOTE(review): this listing's numbering skips a line at this point, so a
 // statement (possibly a condition guarding the tprintf) may be elided -
 // confirm against the original source.
260  tprintf("Adding language '%s' to list\n", lang_code.string());
261  target->push_back(lang_code);
262  }
263  }
264 }
265 
266 // Initialize for potentially a set of languages defined by the language
267 // string and recursively any additional languages required by any language
268 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
269 // See init_tesseract_internal for args.
271  const char *arg0, const char *textbase, const char *language,
272  OcrEngineMode oem, char **configs, int configs_size,
273  const GenericVector<STRING> *vars_vec,
274  const GenericVector<STRING> *vars_values,
275  bool set_only_non_debug_params) {
276  GenericVector<STRING> langs_to_load;
277  GenericVector<STRING> langs_not_to_load;
278  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
279 
280  sub_langs_.delete_data_pointers();
281  sub_langs_.clear();
282  // Find the first loadable lang and load into this.
283  // Add any languages that this language requires
284  bool loaded_primary = false;
285  // Load the rest into sub_langs_.
286  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
287  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
288  const char *lang_str = langs_to_load[lang_index].string();
289  Tesseract *tess_to_init;
290  if (!loaded_primary) {
291  tess_to_init = this;
292  } else {
293  tess_to_init = new Tesseract;
294  }
295 
296  int result = tess_to_init->init_tesseract_internal(
297  arg0, textbase, lang_str, oem, configs, configs_size,
298  vars_vec, vars_values, set_only_non_debug_params);
299 
300  if (!loaded_primary) {
301  if (result < 0) {
302  tprintf("Failed loading language '%s'\n", lang_str);
303  } else {
305  tprintf("Loaded language '%s' as main language\n", lang_str);
306  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
307  &langs_to_load, &langs_not_to_load);
308  loaded_primary = true;
309  }
310  } else {
311  if (result < 0) {
312  tprintf("Failed loading language '%s'\n", lang_str);
313  delete tess_to_init;
314  } else {
316  tprintf("Loaded language '%s' as secondary language\n", lang_str);
317  sub_langs_.push_back(tess_to_init);
318  // Add any languages that this language requires
319  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
320  &langs_to_load, &langs_not_to_load);
321  }
322  }
323  }
324  }
325  if (!loaded_primary) {
326  tprintf("Tesseract couldn't load any languages!\n");
327  return -1; // Couldn't load any language!
328  }
330  return 0;
331 }
332 
333 // Common initialization for a single language.
334 // arg0 is the datapath for the tessdata directory, which could be the
335 // path of the tessdata directory with no trailing /, or (if tessdata
336 // lives in the same directory as the executable) the path of the executable,
337 // hence the name arg0.
338 // textbase is an optional output file basename (used only for training)
339 // language is the language code to load.
340 // oem controls which engine(s) will operate on the image
341 // configs (argv) is an array of config filenames to load variables from.
342 // May be NULL.
343 // configs_size (argc) is the number of elements in configs.
344 // vars_vec is an optional vector of variables to set.
345 // vars_values is an optional corresponding vector of values for the variables
346 // in vars_vec.
347 // If set_only_non_debug_params is true, then only the non-debug variables
348 // will be set.
350  const char *arg0, const char *textbase, const char *language,
351  OcrEngineMode oem, char **configs, int configs_size,
352  const GenericVector<STRING> *vars_vec,
353  const GenericVector<STRING> *vars_values,
354  bool set_only_non_debug_params) {
355  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
356  configs_size, vars_vec, vars_values,
357  set_only_non_debug_params)) {
358  return -1;
359  }
362  return 0;
363  }
364  // If only Cube will be used, skip loading Tesseract classifier's
365  // pre-trained templates.
366  bool init_tesseract_classifier =
369  // If only Cube will be used and if it has its own Unicharset,
370  // skip initializing permuter and loading Tesseract Dawgs.
371  bool init_dict =
374  program_editup(textbase, init_tesseract_classifier, init_dict);
376  return 0; //Normal exit
377 }
378 
379 // Helper builds the all_fonts table by adding new fonts from new_fonts.
380 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
381  UnicityTable<FontInfo>* all_fonts) {
382  for (int i = 0; i < new_fonts.size(); ++i) {
383  // UnicityTable uniques as we go.
384  all_fonts->push_back(new_fonts.get(i));
385  }
386 }
387 
388 // Helper assigns an id to lang_fonts using the index in all_fonts table.
389 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
390  UnicityTable<FontInfo>* lang_fonts) {
391  for (int i = 0; i < lang_fonts->size(); ++i) {
392  int index = all_fonts.get_id(lang_fonts->get(i));
393  lang_fonts->get_mutable(i)->universal_id = index;
394  }
395 }
396 
397 // Set the universal_id member of each font to be unique among all
398 // instances of the same font loaded.
400  // Note that we can get away with bitwise copying FontInfo in
401  // all_fonts, as it is a temporary structure and we avoid setting the
402  // delete callback.
403  UnicityTable<FontInfo> all_fonts;
405 
406  // Create the universal ID table.
407  CollectFonts(get_fontinfo_table(), &all_fonts);
408  for (int i = 0; i < sub_langs_.size(); ++i) {
409  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
410  }
411  // Assign ids from the table to each font table.
412  AssignIds(all_fonts, &get_fontinfo_table());
413  for (int i = 0; i < sub_langs_.size(); ++i) {
414  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
415  }
416  font_table_size_ = all_fonts.size();
417 }
418 
419 // init the LM component
// arg0, textbase and language are forwarded unchanged to
// init_tesseract_lang_data.
// Returns -1 if init_tesseract_lang_data fails, otherwise 0.
420 int Tesseract::init_tesseract_lm(const char *arg0,
421  const char *textbase,
422  const char *language) {
 // The engine mode is fixed to OEM_TESSERACT_ONLY. No config files
 // (NULL, 0) and no variable/value vectors (NULL, NULL) are supplied, and
 // set_only_non_debug_params is false.
423  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
424  NULL, 0, NULL, NULL, false))
425  return -1;
 // Language data is in place, so load the dictionary.
426  getDict().Load();
 // NOTE(review): this listing's numbering skips a line at this point, so a
 // statement may be elided - confirm against the original source.
428  return 0;
429 }
430 
432  end_recog();
433 }
434 
435 /* Define command type identifiers */
436 
438 {
443 };
444 
445 } // namespace tesseract