Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ScriptDetector Class Reference

#include <osdetect.h>

List of all members.

Public Member Functions

 ScriptDetector (OSResults *, tesseract::Tesseract *tess)
void detect_blob (BLOB_CHOICE_LIST *scores)
void get_script ()
bool must_stop (int orientation)

Detailed Description

Definition at line 91 of file osdetect.h.


Constructor & Destructor Documentation

ScriptDetector::ScriptDetector ( OSResults * osr,
tesseract::Tesseract * tess
)

Definition at line 419 of file osdetect.cpp.

{
  // Remember the caller-supplied results object and recognizer; the other
  // member functions reach both through these members.
  osr_ = osr;
  tess_ = tess;

  // Register every script name this detector refers to with the recognizer's
  // unicharset and cache the id returned by add_script(). The calls are kept
  // in their original order.
  // NOTE(review): presumably the ids handed out depend on registration order
  // -- confirm before reordering.
  katakana_id_ = tess->unicharset.add_script(katakana_script);
  hiragana_id_ = tess->unicharset.add_script(hiragana_script);
  han_id_      = tess->unicharset.add_script(han_script);
  hangul_id_   = tess->unicharset.add_script(hangul_script);
  japanese_id_ = tess->unicharset.add_script(japanese_script_);
  korean_id_   = tess->unicharset.add_script(korean_script_);
  latin_id_    = tess->unicharset.add_script(latin_script);
  fraktur_id_  = tess->unicharset.add_script(fraktur_script_);
}

Member Function Documentation

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 435 of file osdetect.cpp.

{
  // Updates the per-script tallies in osr_->scripts_na from the classifier
  // choices of one blob. `scores` is read as an array of four choice lists,
  // one per value of the outer index i (the loop's closing comment calls
  // these orientations). For each list, at most one tally update is made, and
  // only when the blob is judged non-ambiguous (see below).

  // done[s] is true once a choice with script id s has been seen in the
  // current list, so only the first choice of each script is considered.
  bool done[kMaxNumberOfScripts];
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < kMaxNumberOfScripts; ++j)
      done[j] = false;

    BLOB_CHOICE_IT choice_it;
    choice_it.set_to_list(scores + i);

    float prev_score = -1;        // -certainty of the first match; < 0 = unset
    int script_count = 0;         // number of competing scripts found so far
    int prev_id = -1;             // script id of the first match
    int prev_fontinfo_id = -1;    // font info id of the first match
    const char* prev_unichar = "";
    const char* unichar = "";

    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const int id = choice->script_id();
      // done[] has only kMaxNumberOfScripts entries and the id is used as an
      // index below, so ignore choices whose script id falls outside it
      // instead of reading/writing past the array.
      if (id < 0 || id >= kMaxNumberOfScripts) continue;
      // Script already processed before.
      if (done[id]) continue;
      done[id] = true;

      unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
      if (prev_score < 0) {
        // Save data from the first match.
        prev_score = -choice->certainty();
        script_count = 1;
        prev_id = id;
        prev_unichar = unichar;
        prev_fontinfo_id = choice->fontinfo_id();
      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
        // A different script scores within the margin of the first match.
        ++script_count;
      }

      // Stop scanning when the first match is a one-byte unichar and the
      // current unichar begins with an ASCII digit.
      if (strlen(prev_unichar) == 1 && unichar[0] >= '0' && unichar[0] <= '9')
        break;

      // if script_count is >= 2, character is ambiguous, skip other matches
      // since they are useless.
      if (script_count >= 2)
        break;
    }

    // Character is non ambiguous
    if (script_count == 1) {
      // Update the score of the winning script
      osr_->scripts_na[i][prev_id] += 1.0;

      // Workaround for Fraktur: when the winning script is Latin and the
      // font of the first match is flagged as Fraktur, move the point just
      // added from Latin to Fraktur.
      if (prev_id == latin_id_) {
        if (prev_fontinfo_id >= 0) {
          const tesseract::FontInfo &fi =
              tess_->get_fontinfo_table().get(prev_fontinfo_id);
          //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
          //       fi.is_serif(), fi.is_fraktur(),
          //       prev_unichar);
          if (fi.is_fraktur()) {
            osr_->scripts_na[i][prev_id] -= 1.0;
            osr_->scripts_na[i][fraktur_id_] += 1.0;
          }
        }
      }

      // Update Japanese / Korean pseudo-scripts
      if (prev_id == katakana_id_)
        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hiragana_id_)
        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hangul_id_)
        osr_->scripts_na[i][korean_id_] += 1.0;
      // Han counts towards both pseudo-scripts, weighted by the respective
      // ratio constants.
      if (prev_id == han_id_) {
        osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
        osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
      }
    }
  }  // iterate over each orientation
}
void ScriptDetector::get_script ( )

(No definition or description is shown for this member on this page.)

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 526 of file osdetect.cpp.

{
  // Refresh the best-script result for the given orientation before it is
  // inspected.
  osr_->update_best_script(orientation);

  // Report true when the script confidence of the best result is above 1.
  const bool confident_enough = osr_->best_result.sconfidence > 1;
  return confident_enough;
}

The documentation for this class was generated from the following files: