Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::Textord Class Reference

#include <textord.h>

List of all members.

Public Member Functions

 Textord (CCStruct *ccstruct)
 ~Textord ()
void TextordPage (PageSegMode pageseg_mode, int width, int height, Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
void CleanupSingleRowResult (PageSegMode pageseg_mode, PAGE_RES *page_res)
bool use_cjk_fp_model () const
void set_use_cjk_fp_model (bool flag)
void to_spacing (ICOORD page_tr, TO_BLOCK_LIST *blocks)
ROW * make_prop_words (TO_ROW *row, FCOORD rotation)
ROW * make_blob_words (TO_ROW *row, FCOORD rotation)
void find_components (Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
void filter_blobs (ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on)

Public Attributes

bool textord_single_height_mode = false
bool tosp_old_to_method = false
bool tosp_old_to_constrain_sp_kn = false
bool tosp_only_use_prop_rows = true
bool tosp_force_wordbreak_on_punct = false
bool tosp_use_pre_chopping = false
bool tosp_old_to_bug_fix = false
bool tosp_block_use_cert_spaces = true
bool tosp_row_use_cert_spaces = true
bool tosp_narrow_blobs_not_cert = true
bool tosp_row_use_cert_spaces1 = true
bool tosp_recovery_isolated_row_stats = true
bool tosp_only_small_gaps_for_kern = false
bool tosp_all_flips_fuzzy = false
bool tosp_fuzzy_limit_all = true
bool tosp_stats_use_xht_gaps = true
bool tosp_use_xht_gaps = true
bool tosp_only_use_xht_gaps = false
bool tosp_rule_9_test_punct = false
bool tosp_flip_fuzz_kn_to_sp = true
bool tosp_flip_fuzz_sp_to_kn = true
bool tosp_improve_thresh = false
int tosp_debug_level = 0
int tosp_enough_space_samples_for_median = 3
int tosp_redo_kern_limit = 10
int tosp_few_samples = 40
int tosp_short_row = 20
int tosp_sanity_method = 1
double tosp_old_sp_kn_th_factor = 2.0
double tosp_threshold_bias1 = 0
double tosp_threshold_bias2 = 0
double tosp_narrow_fraction = 0.3
double tosp_narrow_aspect_ratio = 0.48
double tosp_wide_fraction = 0.52
double tosp_wide_aspect_ratio = 0.0
double tosp_fuzzy_space_factor = 0.6
double tosp_fuzzy_space_factor1 = 0.5
double tosp_fuzzy_space_factor2 = 0.72
double tosp_gap_factor = 0.83
double tosp_kern_gap_factor1 = 2.0
double tosp_kern_gap_factor2 = 1.3
double tosp_kern_gap_factor3 = 2.5
double tosp_ignore_big_gaps = -1
double tosp_ignore_very_big_gaps = 3.5
double tosp_rep_space = 1.6
double tosp_enough_small_gaps = 0.65
double tosp_table_kn_sp_ratio = 2.25
double tosp_table_xht_sp_ratio = 0.33
double tosp_table_fuzzy_kn_sp_ratio = 3.0
double tosp_fuzzy_kn_fraction = 0.5
double tosp_fuzzy_sp_fraction = 0.5
double tosp_min_sane_kn_sp = 1.5
double tosp_init_guess_kn_mult = 2.2
double tosp_init_guess_xht_mult = 0.28
double tosp_max_sane_kn_thresh = 5.0
double tosp_flip_caution = 0.0
double tosp_large_kerning = 0.19
double tosp_dont_fool_with_small_kerns = -1
double tosp_near_lh_edge = 0
double tosp_silly_kn_sp_gap = 0.2
double tosp_pass_wide_fuzz_sp_to_context = 0.75
bool textord_no_rejects = false
bool textord_show_blobs = false
bool textord_show_boxes = false
int textord_max_noise_size = 7
double textord_blob_size_bigile = 95
double textord_noise_area_ratio = 0.7
double textord_blob_size_smallile = 20
double textord_initialx_ile = 0.75
double textord_initialasc_ile = 0.90
int textord_noise_sizefraction = 10
double textord_noise_sizelimit = 0.5
int textord_noise_translimit = 16
double textord_noise_normratio = 2.0
bool textord_noise_rejwords = true
bool textord_noise_rejrows = true
double textord_noise_syfract = 0.2
double textord_noise_sxfract = 0.4
double textord_noise_hfract = 1.0/64
int textord_noise_sncount = 1
double textord_noise_rowratio = 6.0
bool textord_noise_debug = FALSE
double textord_blshift_maxshift = 0.00
double textord_blshift_xfraction = 9.99

Detailed Description

Definition at line 39 of file textord.h.


Constructor & Destructor Documentation

tesseract::Textord::Textord ( CCStruct *  ccstruct)
explicit

Definition at line 34 of file textord.cpp.

// Member-initializer list and (empty) body of Textord::Textord.
// Stores the CCStruct pointer, clears the CJK fixed-pitch flag, then
// initializes every tunable parameter through the BOOL_MEMBER / INT_MEMBER /
// double_MEMBER macros, passing the parameter name, its default value, its
// help string and ccstruct_->params().
// NOTE(review): in this listing the leading "MACRO(name, default," line of
// most entries had been lost, leaving orphaned help strings. They are
// restored here from the names, types and defaults shown in the
// "Public Attributes" list, which enumerates the same 84 members in the same
// order. Confirm against textord.cpp before relying on exact line layout.
: ccstruct_(ccstruct), use_cjk_fp_model_(false),
  // makerow.cpp ///////////////////////////////////////////
  BOOL_MEMBER(textord_single_height_mode, false,
      "Script has no xheight, so use a single mode",
      ccstruct_->params()),
  // tospace.cpp ///////////////////////////////////////////
  BOOL_MEMBER(tosp_old_to_method, false, "Space stats use prechopping?",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false,
      "Constrain relative values of inter and intra-word gaps for "
      "old_to_method.",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_only_use_prop_rows, true,
      "Block stats to use fixed pitch rows?",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_force_wordbreak_on_punct, false,
      "Force word breaks on punct to break long lines in non-space "
      "delimited langs",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_use_pre_chopping, false,
      "Space stats use prechopping?",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_block_use_cert_spaces, true,
      "Only stat OBVIOUS spaces",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_narrow_blobs_not_cert, true,
      "Only stat OBVIOUS spaces",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_recovery_isolated_row_stats, true,
      "Use row alone when inadequate cert spaces",
      ccstruct_->params()),
  // NOTE(review): the help string of this entry was missing from the listing
  // altogether (only the trailing params() line survived). "Better guess" is
  // believed to be the upstream text - confirm against textord.cpp.
  BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, "Better guess",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_fuzzy_limit_all, true,
      "Dont restrict kn->sp fuzzy limit to tables",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_stats_use_xht_gaps, true,
      "Use within xht gap for wd breaks",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_only_use_xht_gaps, false,
      "Only use within xht gap for wd breaks",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_rule_9_test_punct, false,
      "Dont chng kn to space next to punct",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip",
      ccstruct_->params()),
  BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic",
      ccstruct_->params()),
  INT_MEMBER(tosp_debug_level, 0, "Debug data",
      ccstruct_->params()),
  INT_MEMBER(tosp_enough_space_samples_for_median, 3,
      "or should we use mean",
      ccstruct_->params()),
  INT_MEMBER(tosp_redo_kern_limit, 10,
      "No.samples reqd to reestimate for row",
      ccstruct_->params()),
  INT_MEMBER(tosp_few_samples, 40,
      "No.gaps reqd with 1 large gap to treat as a table",
      ccstruct_->params()),
  INT_MEMBER(tosp_short_row, 20,
      "No.gaps reqd with few cert spaces to use certs",
      ccstruct_->params()),
  INT_MEMBER(tosp_sanity_method, 1, "How to avoid being silly",
      ccstruct_->params()),
  double_MEMBER(tosp_old_sp_kn_th_factor, 2.0,
      "Factor for defining space threshold in terms of space and "
      "kern sizes",
      ccstruct_->params()),
  double_MEMBER(tosp_threshold_bias1, 0,
      "how far between kern and space?",
      ccstruct_->params()),
  double_MEMBER(tosp_threshold_bias2, 0,
      "how far between kern and space?",
      ccstruct_->params()),
  double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow",
      ccstruct_->params()),
  double_MEMBER(tosp_narrow_aspect_ratio, 0.48,
      "narrow if w/h less than this",
      ccstruct_->params()),
  double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide",
      ccstruct_->params()),
  double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this",
      ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_space_factor, 0.6,
      "Fract of xheight for fuzz sp",
      ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_space_factor1, 0.5,
      "Fract of xheight for fuzz sp",
      ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_space_factor2, 0.72,
      "Fract of xheight for fuzz sp",
      ccstruct_->params()),
  double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern",
      ccstruct_->params()),
  double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp",
      ccstruct_->params()),
  double_MEMBER(tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp",
      ccstruct_->params()),
  double_MEMBER(tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp",
      ccstruct_->params()),
  double_MEMBER(tosp_ignore_big_gaps, -1, "xht multiplier",
      ccstruct_->params()),
  double_MEMBER(tosp_ignore_very_big_gaps, 3.5, "xht multiplier",
      ccstruct_->params()),
  double_MEMBER(tosp_rep_space, 1.6, "rep gap multiplier for space",
      ccstruct_->params()),
  double_MEMBER(tosp_enough_small_gaps, 0.65,
      "Fract of kerns reqd for isolated row stats",
      ccstruct_->params()),
  double_MEMBER(tosp_table_kn_sp_ratio, 2.25,
      "Min difference of kn & sp in table",
      ccstruct_->params()),
  double_MEMBER(tosp_table_xht_sp_ratio, 0.33,
      "Expect spaces bigger than this",
      ccstruct_->params()),
  double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0,
      "Fuzzy if less than this",
      ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg",
      ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg",
      ccstruct_->params()),
  double_MEMBER(tosp_min_sane_kn_sp, 1.5,
      "Dont trust spaces less than this time kn",
      ccstruct_->params()),
  double_MEMBER(tosp_init_guess_kn_mult, 2.2,
      "Thresh guess - mult kn by this",
      ccstruct_->params()),
  double_MEMBER(tosp_init_guess_xht_mult, 0.28,
      "Thresh guess - mult xht by this",
      ccstruct_->params()),
  double_MEMBER(tosp_max_sane_kn_thresh, 5.0,
      "Multiplier on kn to limit thresh",
      ccstruct_->params()),
  double_MEMBER(tosp_flip_caution, 0.0,
      "Dont autoflip kn to sp when large separation",
      ccstruct_->params()),
  double_MEMBER(tosp_large_kerning, 0.19,
      "Limit use of xht gap with large kns",
      ccstruct_->params()),
  double_MEMBER(tosp_dont_fool_with_small_kerns, -1,
      "Limit use of xht gap with odd small kns",
      ccstruct_->params()),
  double_MEMBER(tosp_near_lh_edge, 0,
      "Dont reduce box if the top left is non blank",
      ccstruct_->params()),
  double_MEMBER(tosp_silly_kn_sp_gap, 0.2,
      "Dont let sp minus kn get too small",
      ccstruct_->params()),
  double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75,
      "How wide fuzzies need context",
      ccstruct_->params()),
  // tordmain.cpp ///////////////////////////////////////////
  BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs",
      ccstruct_->params()),
  BOOL_MEMBER(textord_show_blobs, false, "Display unsorted blobs",
      ccstruct_->params()),
  BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs",
      ccstruct_->params()),
  INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise",
      ccstruct_->params()),
  double_MEMBER(textord_blob_size_bigile, 95, "Percentile for large blobs",
      ccstruct_->params()),
  double_MEMBER(textord_noise_area_ratio, 0.7,
      "Fraction of bounding box for noise",
      ccstruct_->params()),
  double_MEMBER(textord_blob_size_smallile, 20,
      "Percentile for small blobs",
      ccstruct_->params()),
  double_MEMBER(textord_initialx_ile, 0.75,
      "Ile of sizes for xheight guess",
      ccstruct_->params()),
  double_MEMBER(textord_initialasc_ile, 0.90,
      "Ile of sizes for xheight guess",
      ccstruct_->params()),
  INT_MEMBER(textord_noise_sizefraction, 10,
      "Fraction of size for maxima",
      ccstruct_->params()),
  double_MEMBER(textord_noise_sizelimit, 0.5,
      "Fraction of x for big t count",
      ccstruct_->params()),
  INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob",
      ccstruct_->params()),
  double_MEMBER(textord_noise_normratio, 2.0,
      "Dot to norm ratio for deletion",
      ccstruct_->params()),
  BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words",
      ccstruct_->params()),
  BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows",
      ccstruct_->params()),
  double_MEMBER(textord_noise_syfract, 0.2,
      "xh fract height error for norm blobs",
      ccstruct_->params()),
  double_MEMBER(textord_noise_sxfract, 0.4,
      "xh fract width error for norm blobs",
      ccstruct_->params()),
  double_MEMBER(textord_noise_hfract, 1.0/64,
      "Height fraction to discard outlines as speckle noise",
      ccstruct_->params()),
  INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row",
      ccstruct_->params()),
  double_MEMBER(textord_noise_rowratio, 6.0,
      "Dot to norm ratio for deletion",
      ccstruct_->params()),
  BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector",
      ccstruct_->params()),
  double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift",
      ccstruct_->params()),
  double_MEMBER(textord_blshift_xfraction, 9.99,
      "Min size of baseline shift",
      ccstruct_->params()) {
}
tesseract::Textord::~Textord ( )

Definition at line 261 of file textord.cpp.

{
}

Member Function Documentation

void tesseract::Textord::CleanupSingleRowResult ( PageSegMode  pageseg_mode,
PAGE_RES *  page_res
)

Definition at line 339 of file textord.cpp.

{
// Reduces page_res to the words of its single most confident row.
// When PSM_LINE_FIND_ENABLED(pageseg_mode) holds, nothing is changed.
// Otherwise the mean of best_choice->certainty() is computed per row, the
// row with the greatest mean is kept, and every word in any other row is
// deleted through the PAGE_RES_IT.
if (PSM_LINE_FIND_ENABLED(pageseg_mode))
return; // No cleanup required.
PAGE_RES_IT it(page_res);
// Find the best row, being the greatest mean word conf.
float row_total_conf = 0.0f;
int row_word_count = 0;
ROW_RES* best_row = NULL;
float best_conf = 0.0f;
for (it.restart_page(); it.word() != NULL; it.forward()) {
WERD_RES* word = it.word();
// NOTE(review): best_choice is dereferenced without a NULL check; whether it
// can be NULL here is not visible from this listing.
row_total_conf += word->best_choice->certainty();
++row_word_count;
// The current word is the last of its row when the iterator's next row
// differs: turn the running total into a mean and compare.
if (it.next_row() != it.row()) {
row_total_conf /= row_word_count;
// The first completed row always becomes the initial best row.
if (best_row == NULL || best_conf < row_total_conf) {
best_row = it.row();
best_conf = row_total_conf;
}
// Reset the accumulators for the next row.
row_total_conf = 0.0f;
row_word_count = 0;
}
}
// Now eliminate any word not in the best row.
for (it.restart_page(); it.word() != NULL; it.forward()) {
if (it.row() != best_row)
it.DeleteCurrentWord();
}
}
void tesseract::Textord::filter_blobs ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks,
BOOL8  testing_on 
)

Definition at line 239 of file tordmain.cpp.

{ // for plotting
// For every TO_BLOCK in blocks, runs filter_noise_blobs over the block's
// blobs / noise_blobs / small_blobs / large_blobs lists and stores the result
// in block->line_size. When graphics are enabled and testing_on is set, a
// debug window is created on demand for textord_show_blobs /
// textord_show_boxes.
// NOTE(review): this listing is visibly incomplete (see the notes below), so
// it does not show everything the function does.
TO_BLOCK_IT block_it = blocks; // destination iterator
TO_BLOCK *block; // created block
#ifndef GRAPHICS_DISABLED
// NOTE(review): the statement controlled by this `if` is missing from the
// listing; as shown, the `if` would attach to the `for` loop below.
if (to_win != NULL)
#endif // GRAPHICS_DISABLED
for (block_it.mark_cycle_pt(); !block_it.cycled_list();
block_it.forward()) {
block = block_it.data();
block->line_size = filter_noise_blobs(&block->blobs,
&block->noise_blobs,
&block->small_blobs,
&block->large_blobs);
// NOTE(review): the right-hand side of this assignment is cut off in the
// listing; the remaining factor(s) are not visible here.
block->line_spacing = block->line_size *
#ifndef GRAPHICS_DISABLED
// NOTE(review): both branches below only create the window in this listing;
// any plotting calls that follow are presumably among the dropped lines.
if (textord_show_blobs && testing_on) {
if (to_win == NULL)
create_to_win(page_tr);
}
if (textord_show_boxes && testing_on) {
if (to_win == NULL)
create_to_win(page_tr);
}
#endif // GRAPHICS_DISABLED
}
}
void tesseract::Textord::find_components ( Pix *  pix,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks 
)

Definition at line 208 of file tordmain.cpp.

{
// Extracts connected components from pix for the given blocks and fills
// to_blocks with them.
// Steps: reject images whose width or height exceeds MAX_INT16, call
// extract_edges on each block that has no poly_block or whose poly_block
// reports IsText(), hand the result to assign_blobs_to_blocks2, then run
// filter_blobs over to_blocks.
int width = pixGetWidth(pix);
int height = pixGetHeight(pix);
if (width > MAX_INT16 || height > MAX_INT16) {
tprintf("Input image too large! (%d, %d)\n", width, height);
return; // Can't handle it.
}
BLOCK_IT block_it(blocks); // iterator
for (block_it.mark_cycle_pt(); !block_it.cycled_list();
block_it.forward()) {
BLOCK* block = block_it.data();
// Blocks with a poly_block that is not text are skipped.
if (block->poly_block() == NULL || block->poly_block()->IsText()) {
extract_edges(pix, block);
}
}
assign_blobs_to_blocks2(pix, blocks, to_blocks);
ICOORD page_tr(width, height);
// testing_on is passed as the negation of textord_test_landscape.
filter_blobs(page_tr, to_blocks, !textord_test_landscape);
}
ROW * tesseract::Textord::make_blob_words ( TO_ROW *  row,
FCOORD  rotation 
)

Definition at line 1183 of file tospace.cpp.

{
// Builds a new ROW from row's blob list, producing one WERD per blob: a blob
// flagged joined_to_prev() has its outlines merged into the preceding blob
// instead of starting a new one.
// Returns the newly allocated ROW, or NULL when row->blob_list() is empty.
// The `rotation` parameter is not referenced anywhere in this body.
bool bol; // start of line
ROW *real_row; // output row
C_OUTLINE_IT cout_it;
C_BLOB_LIST cblobs;
C_BLOB_IT cblob_it = &cblobs;
WERD_LIST words;
WERD_IT word_it; // new words
WERD *word; // new word
double coeffs[3]; // quadratic
BLOBNBOX *bblob; // current blob
TBOX blob_box; // bounding box
BLOBNBOX_IT box_it; // iterator
inT16 word_count = 0;
cblob_it.set_to_list(&cblobs);
box_it.set_to_list(row->blob_list());
word_it.set_to_list(&words);
bol = TRUE;
if (!box_it.empty()) {
do {
bblob = box_it.data();
// blob_box is assigned here and below but never read in this body.
blob_box = bblob->bounding_box();
if (bblob->joined_to_prev()) {
if (bblob->cblob() != NULL) {
// Append this blob's outlines to the end of the previous blob's outline
// list, then free the now-redundant C_BLOB.
cout_it.set_to_list(cblob_it.data()->out_list());
cout_it.move_to_last();
cout_it.add_list_after(bblob->cblob()->out_list());
delete bblob->cblob();
}
} else {
if (bblob->cblob() != NULL)
cblob_it.add_after_then_move(bblob->cblob());
}
box_it.forward(); // next one
bblob = box_it.data();
blob_box = bblob->bounding_box();
// The next blob starts a fresh blob, so flush what has been collected into a
// word. NOTE(review): the literal 1 is presumably the blank count, by analogy
// with the WERD construction in make_prop_words - confirm against WERD.
if (!bblob->joined_to_prev() && !cblobs.empty()) {
word = new WERD(&cblobs, 1, NULL);
word_count++;
word_it.add_after_then_move(word);
// Only the first word created gets the beginning-of-line flag.
if (bol) {
word->set_flag(W_BOL, TRUE);
bol = FALSE;
}
if (box_it.at_first()) { // at end of line
word->set_flag(W_EOL, TRUE);
}
}
}
while (!box_it.at_first()); // until back at start
/* Setup the row with created words. */
// coeffs is filled in but not used afterwards in this body.
coeffs[0] = 0;
coeffs[1] = row->line_m();
coeffs[2] = row->line_c();
real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size);
word_it.set_to_list(real_row->word_list());
//put words in row
word_it.add_list_after(&words);
real_row->recalc_bounding_box();
if (tosp_debug_level > 4) {
tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
word_count,
real_row->bounding_box().left(),
real_row->bounding_box().bottom(),
real_row->bounding_box().right(),
real_row->bounding_box().top());
}
return real_row;
}
return NULL;
}
ROW * tesseract::Textord::make_prop_words ( TO_ROW *  row,
FCOORD  rotation 
)

Definition at line 886 of file tospace.cpp.

{
// Builds a new ROW from row's blob list, grouping blobs into words: a word
// break is inserted where make_a_word_break() says so, where a repeated-char
// word from row->rep_words has to be spliced in, or at the end of the blob
// list. Each word carries the blank count decided for the gap in front of it.
// Returns the newly allocated ROW, or NULL when row->blob_list() is empty.
// The `rotation` parameter is not referenced anywhere in this body.
// NOTE(review): this listing has visibly lost several lines (flagged below),
// so it is not compilable as shown.
BOOL8 bol; //start of line
/* prev_ values are for start of word being built. non prev_ values are for
the gap between the word being built and the next one. */
BOOL8 prev_fuzzy_sp; //probably space
BOOL8 prev_fuzzy_non; //probably not
uinT8 prev_blanks; //in front of word
BOOL8 fuzzy_sp; //probably space
BOOL8 fuzzy_non; //probably not
uinT8 blanks; //in front of word
BOOL8 prev_gap_was_a_space = FALSE;
BOOL8 break_at_next_gap = FALSE;
ROW *real_row; //output row
C_OUTLINE_IT cout_it;
C_BLOB_LIST cblobs;
C_BLOB_IT cblob_it = &cblobs;
WERD_LIST words;
WERD_IT word_it; //new words
WERD *word; //new word
WERD_IT rep_char_it; //repeated char words
inT32 next_rep_char_word_right = MAX_INT32;
float repetition_spacing; //gap between repetitions
inT32 xstarts[2]; //row ends
double coeffs[3]; //quadratic
inT32 prev_x; //end of prev blob
BLOBNBOX *bblob; //current blob
TBOX blob_box; //bounding box
BLOBNBOX_IT box_it; //iterator
TBOX prev_blob_box;
TBOX next_blob_box;
inT16 prev_gap = MAX_INT16;
inT16 current_gap = MAX_INT16;
inT16 next_gap = MAX_INT16;
inT16 prev_within_xht_gap = MAX_INT16;
inT16 current_within_xht_gap = MAX_INT16;
inT16 next_within_xht_gap = MAX_INT16;
inT16 word_count = 0;
// next_rep_char_word_right stays at MAX_INT32 when there are no repeated-char
// words, so the "blob lies to the right of it" tests below never fire.
rep_char_it.set_to_list (&(row->rep_words));
if (!rep_char_it.empty ()) {
next_rep_char_word_right =
rep_char_it.data ()->bounding_box ().right ();
}
prev_x = -MAX_INT16;
cblob_it.set_to_list (&cblobs);
box_it.set_to_list (row->blob_list ());
word_it.set_to_list (&words);
bol = TRUE;
prev_blanks = 0;
prev_fuzzy_sp = FALSE;
prev_fuzzy_non = FALSE;
if (!box_it.empty ()) {
xstarts[0] = box_it.data ()->bounding_box ().left ();
if (xstarts[0] > next_rep_char_word_right) {
/* We need to insert a repeated char word at the start of the row */
word = rep_char_it.extract ();
word_it.add_after_then_move (word);
/* Set spaces before repeated char word */
word->set_flag (W_BOL, TRUE);
bol = FALSE;
word->set_blanks (0);
//NO uncertainty
xstarts[0] = word->bounding_box ().left ();
/* Set spaces after repeated char word (and leave current word set) */
repetition_spacing = find_mean_blob_spacing (word);
current_gap = box_it.data ()->bounding_box ().left () -
next_rep_char_word_right;
current_within_xht_gap = current_gap;
// A gap wider than tosp_rep_space times the repetition spacing counts as at
// least one blank; the count is the gap measured in row->space_size units.
if (current_gap > tosp_rep_space * repetition_spacing) {
prev_blanks = (uinT8) floor (current_gap / row->space_size);
if (prev_blanks < 1)
prev_blanks = 1;
}
else
prev_blanks = 0;
// NOTE(review): this print is unconditional as listed, while the row summary
// print at the end is guarded by tosp_debug_level; a guard line may have been
// dropped from the listing - confirm.
tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
box_it.data ()->bounding_box ().left (),
box_it.data ()->bounding_box ().bottom (),
repetition_spacing, current_gap);
prev_fuzzy_sp = FALSE;
prev_fuzzy_non = FALSE;
if (rep_char_it.empty ()) {
next_rep_char_word_right = MAX_INT32;
}
else {
rep_char_it.forward ();
next_rep_char_word_right =
rep_char_it.data ()->bounding_box ().right ();
}
}
peek_at_next_gap(row,
box_it,
next_blob_box,
next_gap,
next_within_xht_gap);
do {
bblob = box_it.data ();
blob_box = bblob->bounding_box ();
if (bblob->joined_to_prev ()) {
if (bblob->cblob () != NULL) {
// Append this blob's outlines to the previous blob's outline list, then free
// the now-redundant C_BLOB.
cout_it.set_to_list (cblob_it.data ()->out_list ());
cout_it.move_to_last ();
cout_it.add_list_after (bblob->cblob ()->out_list ());
delete bblob->cblob ();
}
} else {
if (bblob->cblob() != NULL)
cblob_it.add_after_then_move (bblob->cblob ());
prev_x = blob_box.right ();
}
box_it.forward (); //next one
bblob = box_it.data ();
blob_box = bblob->bounding_box ();
if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
/* Real Blob - not multiple outlines or pre-chopped */
// Shift the gap window along by one blob: current -> prev, next -> current,
// then peek ahead for the new next gap.
prev_gap = current_gap;
prev_within_xht_gap = current_within_xht_gap;
prev_blob_box = next_blob_box;
current_gap = next_gap;
current_within_xht_gap = next_within_xht_gap;
peek_at_next_gap(row,
box_it,
next_blob_box,
next_gap,
next_within_xht_gap);
// NOTE(review): as listed, the two overrides below are unconditional and the
// closing brace after them ends the enclosing `if`, leaving prev_gap_arg and
// next_gap_arg out of scope where they are used next. A conditional line
// around the overrides appears to have been dropped - confirm against
// tospace.cpp.
inT16 prev_gap_arg = prev_gap;
inT16 next_gap_arg = next_gap;
prev_gap_arg = prev_within_xht_gap;
next_gap_arg = next_within_xht_gap;
}
// Decide if a word-break should be inserted
if (blob_box.left () > next_rep_char_word_right ||
make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
current_gap, current_within_xht_gap,
next_blob_box, next_gap_arg,
blanks, fuzzy_sp, fuzzy_non,
prev_gap_was_a_space,
break_at_next_gap) ||
box_it.at_first()) {
/* Form a new word out of the blobs collected */
word = new WERD (&cblobs, prev_blanks, NULL);
word_count++;
word_it.add_after_then_move (word);
if (bol) {
word->set_flag (W_BOL, TRUE);
bol = FALSE;
}
// NOTE(review): both arms below have lost their statements in this listing;
// only the trailing comments remain. Presumably each set a fuzzy flag on
// `word` - confirm against tospace.cpp.
if (prev_fuzzy_sp)
//probably space
else if (prev_fuzzy_non)
//probably not
if (blob_box.left () > next_rep_char_word_right) {
/* We need to insert a repeated char word */
word = rep_char_it.extract ();
word_it.add_after_then_move (word);
/* Set spaces before repeated char word */
repetition_spacing = find_mean_blob_spacing (word);
current_gap = word->bounding_box ().left () - prev_x;
current_within_xht_gap = current_gap;
if (current_gap > tosp_rep_space * repetition_spacing) {
blanks =
(uinT8) floor (current_gap / row->space_size);
if (blanks < 1)
blanks = 1;
}
else
blanks = 0;
// NOTE(review): the callee of this format string (and any guard in front of
// it) is missing from the listing; as shown this is a discarded comma
// expression. By analogy with the neighbouring prints it is presumably a
// tprintf call - confirm.
("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
word->bounding_box ().left (),
word->bounding_box ().bottom (),
repetition_spacing, current_gap, blanks);
word->set_blanks (blanks);
//NO uncertainty
/* Set spaces after repeated char word (and leave current word set) */
current_gap =
blob_box.left () - next_rep_char_word_right;
if (current_gap > tosp_rep_space * repetition_spacing) {
// NOTE(review): unlike the three sibling computations, this one casts
// without calling floor().
blanks = (uinT8) (current_gap / row->space_size);
if (blanks < 1)
blanks = 1;
}
else
blanks = 0;
tprintf (" Rgap:%d (%d blanks)\n",
current_gap, blanks);
fuzzy_sp = FALSE;
fuzzy_non = FALSE;
if (rep_char_it.empty ()) {
next_rep_char_word_right = MAX_INT32;
}
else {
rep_char_it.forward ();
next_rep_char_word_right =
rep_char_it.data ()->bounding_box ().right ();
}
}
if (box_it.at_first () && rep_char_it.empty ()) {
//at end of line
word->set_flag (W_EOL, TRUE);
xstarts[1] = prev_x;
}
else {
// Carry this gap's decision forward: it describes the space in front of the
// next word to be built.
prev_blanks = blanks;
prev_fuzzy_sp = fuzzy_sp;
prev_fuzzy_non = fuzzy_non;
}
}
}
}
while (!box_it.at_first ()); //until back at start
/* Insert any further repeated char words */
while (!rep_char_it.empty ()) {
word = rep_char_it.extract ();
word_it.add_after_then_move (word);
/* Set spaces before repeated char word */
repetition_spacing = find_mean_blob_spacing (word);
current_gap = word->bounding_box ().left () - prev_x;
if (current_gap > tosp_rep_space * repetition_spacing) {
blanks = (uinT8) floor (current_gap / row->space_size);
if (blanks < 1)
blanks = 1;
}
else
blanks = 0;
// NOTE(review): the callee of this format string is missing from the listing
// (see the same note above). In addition, "rep spacing %d" is paired with the
// float repetition_spacing, whereas the other two messages print it with
// %5.2f - looks like a format/argument mismatch; confirm and fix upstream.
("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n",
word->bounding_box ().left (), word->bounding_box ().bottom (),
repetition_spacing, current_gap, blanks);
word->set_blanks (blanks);
//NO uncertainty
prev_x = word->bounding_box ().right ();
if (rep_char_it.empty ()) {
//at end of line
word->set_flag (W_EOL, TRUE);
xstarts[1] = prev_x;
}
else {
rep_char_it.forward ();
}
}
// xstarts and coeffs are filled in but not read anywhere in this body.
coeffs[0] = 0;
coeffs[1] = row->line_m ();
coeffs[2] = row->line_c ();
real_row = new ROW (row,
(inT16) row->kern_size, (inT16) row->space_size);
word_it.set_to_list (real_row->word_list ());
//put words in row
word_it.add_list_after (&words);
real_row->recalc_bounding_box ();
if (tosp_debug_level > 4) {
tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
word_count,
real_row->bounding_box ().left (),
real_row->bounding_box ().bottom (),
real_row->bounding_box ().right (),
real_row->bounding_box ().top ());
}
return real_row;
}
return NULL;
}
void tesseract::Textord::set_use_cjk_fp_model ( bool  flag)
inline

Definition at line 56 of file textord.h.

{
// Stores the caller-supplied flag in the use_cjk_fp_model_ member.
use_cjk_fp_model_ = flag;
}
void tesseract::Textord::TextordPage ( PageSegMode  pageseg_mode,
int  width,
int  height,
Pix *  pix,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks 
)

Definition at line 265 of file textord.cpp.

{
// Top-level text-ordering entry point for one page.
// Records the page size in page_tr_, makes sure to_blocks is populated
// (find_components when it arrives empty, filter_blobs otherwise), applies
// the vertical-text rotation for PSM_SINGLE_BLOCK_VERT_TEXT, makes and fits
// rows, makes words, removes empty blocks and finally computes row margins
// for every block.
// NOTE(review): a few statements in this listing are truncated (flagged
// below), so it is not compilable as shown.
page_tr_.set_x(width);
page_tr_.set_y(height);
if (to_blocks->empty()) {
// AutoPageSeg was not used, so we need to find_components first.
find_components(pix, blocks, to_blocks);
} else {
// AutoPageSeg does not need to find_components as it did that already.
// Filter_blobs sets up the TO_BLOCKs the same as find_components does.
filter_blobs(page_tr_, to_blocks, true);
}
ASSERT_HOST(!to_blocks->empty());
if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
const FCOORD anticlockwise90(0.0f, 1.0f);
const FCOORD clockwise90(0.0f, -1.0f);
TO_BLOCK_IT it(to_blocks);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
TO_BLOCK* to_block = it.data();
BLOCK* block = to_block->block;
// Create a fake poly_block in block from its bounding box.
// NOTE(review): the remaining POLY_BLOCK constructor argument(s) and the
// closing parentheses are missing from this listing.
block->set_poly_block(new POLY_BLOCK(block->bounding_box(),
// Rotate the to_block along with its contained block and blobnbox lists.
to_block->rotate(anticlockwise90);
// Set the block's rotation values to obey the convention followed in
// layout analysis for vertical text.
block->set_re_rotation(clockwise90);
block->set_classify_rotation(clockwise90);
}
}
TO_BLOCK_IT to_block_it(to_blocks);
TO_BLOCK* to_block = to_block_it.data();
// Make the rows in the block.
float gradient;
// Do it the old fashioned way.
if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
gradient = make_rows(page_tr_, to_blocks);
} else {
// SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
gradient = make_single_row(page_tr_, to_block, to_blocks);
}
// Now fit baselines. For now only old mode is available.
fit_rows(gradient, page_tr_, to_blocks);
// Now make the words in the lines.
if (PSM_WORD_FIND_ENABLED(pageseg_mode)) {
// SINGLE_LINE uses the old word maker on the single line.
make_words(this, page_tr_, gradient, blocks, to_blocks);
} else {
// SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
// single word, and in SINGLE_CHAR mode, all the outlines
// go in a single blob.
TO_BLOCK* to_block = to_block_it.data();
// NOTE(review): the start of this call (callee name and leading arguments)
// is missing from the listing; only its last two arguments remain.
to_block->get_rows(), to_block->block->row_list());
}
cleanup_blocks(blocks); // Remove empties.
// Compute the margins for each row in the block, to be used later for
// paragraph detection.
BLOCK_IT b_it(blocks);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
b_it.data()->compute_row_margins();
}
// NOTE(review): the body of this conditional section is empty in the
// listing; its contents were presumably dropped.
#ifndef GRAPHICS_DISABLED
#endif
}
void tesseract::Textord::to_spacing ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks 
)

Definition at line 35 of file tospace.cpp.

{
// Computes word-spacing statistics for every block and row.
// For each TO_BLOCK a GAPMAP is built and block_spacing_stats() fills in the
// block-level space / non-space gap widths; rows that pass the pitch test
// then get row_spacing_stats() with those block-level values. The GAPMAP is
// deleted at the end of each block iteration.
// NOTE(review): several conditions in this listing are truncated (flagged
// below), so it is not compilable as shown.
TO_BLOCK_IT block_it; //iterator
TO_BLOCK *block; //current block;
TO_ROW_IT row_it; //row iterator
TO_ROW *row; //current row
int block_index; //block number
int row_index; //row number
//estimated width of real spaces for whole block
inT16 block_space_gap_width;
//estimated width of non space gaps for whole block
inT16 block_non_space_gap_width;
BOOL8 old_text_ord_proportional;//old fixed/prop result
GAPMAP *gapmap = NULL; //map of big vert gaps in blk
block_it.set_to_list (blocks);
// Block and row indices are 1-based; they are only used for debug output and
// as arguments to row_spacing_stats.
block_index = 1;
for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
block_it.forward ()) {
block = block_it.data ();
gapmap = new GAPMAP (block);
block_spacing_stats(block,
gapmap,
old_text_ord_proportional,
block_space_gap_width,
block_non_space_gap_width);
// Make sure relative values of block-level space and non-space gap
// widths are reasonable. The ratio of 1:3 is also used in
// block_spacing_stats, to correct the block_space_gap_width
// Useful for arabic and hindi, when the non-space gap width is
// often over-estimated and should not be trusted. A similar ratio
// is found in block_spacing_stats.
// NOTE(review): the opening of this `if` (and any first operand of its
// condition) is missing from the listing; only the tail of the condition
// remains.
(float) block_space_gap_width / block_non_space_gap_width < 3.0) {
block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
}
row_it.set_to_list (block->get_rows ());
row_index = 1;
for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
row = row_it.data ();
// NOTE(review): the rest of this condition (after the `||`) is missing from
// the listing.
if ((row->pitch_decision == PITCH_DEF_PROP) ||
if ((tosp_debug_level > 0) && !old_text_ord_proportional)
tprintf ("Block %d Row %d: Now Proportional\n",
block_index, row_index);
row_spacing_stats(row,
gapmap,
block_index,
row_index,
block_space_gap_width,
block_non_space_gap_width);
}
else {
if ((tosp_debug_level > 0) && old_text_ord_proportional)
// NOTE(review): the callee of this format string is missing from the
// listing; by analogy with the "Now Proportional" message above it is
// presumably tprintf - confirm.
("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
block_index, row_index, row->pitch_decision,
row->fixed_pitch);
}
// NOTE(review): the body of this conditional section is empty in the
// listing; its contents were presumably dropped.
#ifndef GRAPHICS_DISABLED
#endif
row_index++;
}
delete gapmap;
block_index++;
}
}
bool tesseract::Textord::use_cjk_fp_model ( ) const
inline

Definition at line 53 of file textord.h.

{
// Returns the current value of the use_cjk_fp_model_ member.
return use_cjk_fp_model_;
}

Member Data Documentation

double tesseract::Textord::textord_blob_size_bigile = 95

"Percentile for large blobs"

Definition at line 330 of file textord.h.

double tesseract::Textord::textord_blob_size_smallile = 20

"Percentile for small blobs"

Definition at line 333 of file textord.h.

double tesseract::Textord::textord_blshift_maxshift = 0.00

"Max baseline shift"

Definition at line 350 of file textord.h.

double tesseract::Textord::textord_blshift_xfraction = 9.99

"Min size of baseline shift"

Definition at line 351 of file textord.h.

double tesseract::Textord::textord_initialasc_ile = 0.90

"Ile of sizes for xheight guess"

Definition at line 335 of file textord.h.

double tesseract::Textord::textord_initialx_ile = 0.75

"Ile of sizes for xheight guess"

Definition at line 334 of file textord.h.

int tesseract::Textord::textord_max_noise_size = 7

"Pixel size of noise"

Definition at line 329 of file textord.h.

bool tesseract::Textord::textord_no_rejects = false

"Don't remove noise blobs"

Definition at line 326 of file textord.h.

double tesseract::Textord::textord_noise_area_ratio = 0.7

"Fraction of bounding box for noise"

Definition at line 332 of file textord.h.

bool tesseract::Textord::textord_noise_debug = FALSE

"Debug row garbage detector"

Definition at line 349 of file textord.h.

double tesseract::Textord::textord_noise_hfract = 1.0/64

"Height fraction to discard outlines as speckle noise"

Definition at line 346 of file textord.h.

double tesseract::Textord::textord_noise_normratio = 2.0

"Dot to norm ratio for deletion"

Definition at line 339 of file textord.h.

bool tesseract::Textord::textord_noise_rejrows = true

"Reject noise-like rows"

Definition at line 341 of file textord.h.

bool tesseract::Textord::textord_noise_rejwords = true

"Reject noise-like words"

Definition at line 340 of file textord.h.

double tesseract::Textord::textord_noise_rowratio = 6.0

"Dot to norm ratio for deletion"

Definition at line 348 of file textord.h.

int tesseract::Textord::textord_noise_sizefraction = 10

"Fraction of size for maxima"

Definition at line 336 of file textord.h.

double tesseract::Textord::textord_noise_sizelimit = 0.5

"Fraction of x for big t count"

Definition at line 337 of file textord.h.

int tesseract::Textord::textord_noise_sncount = 1

"super norm blobs to save row"

Definition at line 347 of file textord.h.

double tesseract::Textord::textord_noise_sxfract = 0.4

"xh fract width error for norm blobs"

Definition at line 344 of file textord.h.

double tesseract::Textord::textord_noise_syfract = 0.2

"xh fract error for norm blobs"

Definition at line 342 of file textord.h.

int tesseract::Textord::textord_noise_translimit = 16

"Transitions for normal blob"

Definition at line 338 of file textord.h.

bool tesseract::Textord::textord_show_blobs = false

"Display unsorted blobs"

Definition at line 327 of file textord.h.

bool tesseract::Textord::textord_show_boxes = false

"Display boxes"

Definition at line 328 of file textord.h.

bool tesseract::Textord::textord_single_height_mode = false

"Script has no xheight, so use a single mode for horizontal text"

Definition at line 214 of file textord.h.

bool tesseract::Textord::tosp_all_flips_fuzzy = false

"Pass ANY flip to context?"

Definition at line 240 of file textord.h.

bool tesseract::Textord::tosp_block_use_cert_spaces = true

"Only stat OBVIOUS spaces"

Definition at line 230 of file textord.h.

int tesseract::Textord::tosp_debug_level = 0

"Debug data"

Definition at line 255 of file textord.h.

double tesseract::Textord::tosp_dont_fool_with_small_kerns = -1

"Limit use of xht gap with odd small kns"

Definition at line 318 of file textord.h.

double tesseract::Textord::tosp_enough_small_gaps = 0.65

"Fract of kerns reqd for isolated row stats"

Definition at line 296 of file textord.h.

int tesseract::Textord::tosp_enough_space_samples_for_median = 3

"or should we use mean"

Definition at line 257 of file textord.h.

int tesseract::Textord::tosp_few_samples = 40

"No.gaps reqd with 1 large gap to treat as a table"

Definition at line 261 of file textord.h.

double tesseract::Textord::tosp_flip_caution = 0.0

"Dont autoflip kn to sp when large separation"

Definition at line 314 of file textord.h.

bool tesseract::Textord::tosp_flip_fuzz_kn_to_sp = true

"Default flip"

Definition at line 251 of file textord.h.

bool tesseract::Textord::tosp_flip_fuzz_sp_to_kn = true

"Default flip"

Definition at line 252 of file textord.h.

bool tesseract::Textord::tosp_force_wordbreak_on_punct = false

"Force word breaks on punct to break long lines in non-space delimited langs"

Definition at line 224 of file textord.h.

double tesseract::Textord::tosp_fuzzy_kn_fraction = 0.5

"New fuzzy kn alg"

Definition at line 303 of file textord.h.

bool tesseract::Textord::tosp_fuzzy_limit_all = true

"Don't restrict kn->sp fuzzy limit to tables"

Definition at line 242 of file textord.h.

double tesseract::Textord::tosp_fuzzy_sp_fraction = 0.5

"New fuzzy sp alg"

Definition at line 304 of file textord.h.

double tesseract::Textord::tosp_fuzzy_space_factor = 0.6

"Fract of xheight for fuzz sp"

Definition at line 280 of file textord.h.

double tesseract::Textord::tosp_fuzzy_space_factor1 = 0.5

"Fract of xheight for fuzz sp"

Definition at line 282 of file textord.h.

double tesseract::Textord::tosp_fuzzy_space_factor2 = 0.72

"Fract of xheight for fuzz sp"

Definition at line 284 of file textord.h.

double tesseract::Textord::tosp_gap_factor = 0.83

"gap ratio to flip sp->kern"

Definition at line 285 of file textord.h.

double tesseract::Textord::tosp_ignore_big_gaps = -1

"xht multiplier"

Definition at line 292 of file textord.h.

double tesseract::Textord::tosp_ignore_very_big_gaps = 3.5

"xht multiplier"

Definition at line 293 of file textord.h.

bool tesseract::Textord::tosp_improve_thresh = false

"Enable improvement heuristic"

Definition at line 254 of file textord.h.

double tesseract::Textord::tosp_init_guess_kn_mult = 2.2

"Thresh guess - mult kn by this"

Definition at line 308 of file textord.h.

double tesseract::Textord::tosp_init_guess_xht_mult = 0.28

"Thresh guess - mult xht by this"

Definition at line 310 of file textord.h.

double tesseract::Textord::tosp_kern_gap_factor1 = 2.0

"gap ratio to flip kern->sp"

Definition at line 287 of file textord.h.

double tesseract::Textord::tosp_kern_gap_factor2 = 1.3

"gap ratio to flip kern->sp"

Definition at line 289 of file textord.h.

double tesseract::Textord::tosp_kern_gap_factor3 = 2.5

"gap ratio to flip kern->sp"

Definition at line 291 of file textord.h.

double tesseract::Textord::tosp_large_kerning = 0.19

"Limit use of xht gap with large kns"

Definition at line 316 of file textord.h.

double tesseract::Textord::tosp_max_sane_kn_thresh = 5.0

"Multiplier on kn to limit thresh"

Definition at line 312 of file textord.h.

double tesseract::Textord::tosp_min_sane_kn_sp = 1.5

"Don't trust spaces less than this times kn"

Definition at line 306 of file textord.h.

double tesseract::Textord::tosp_narrow_aspect_ratio = 0.48

"narrow if w/h less than this"

Definition at line 275 of file textord.h.

bool tesseract::Textord::tosp_narrow_blobs_not_cert = true

"Only stat OBVIOUS spaces"

Definition at line 234 of file textord.h.

double tesseract::Textord::tosp_narrow_fraction = 0.3

"Fract of xheight for narrow"

Definition at line 273 of file textord.h.

double tesseract::Textord::tosp_near_lh_edge = 0

"Don't reduce box if the top left is non blank"

Definition at line 320 of file textord.h.

double tesseract::Textord::tosp_old_sp_kn_th_factor = 2.0

"Factor for defining space threshold in terms of space and kern sizes"

Definition at line 267 of file textord.h.

bool tesseract::Textord::tosp_old_to_bug_fix = false

"Fix suspected bug in old code"

Definition at line 228 of file textord.h.

bool tesseract::Textord::tosp_old_to_constrain_sp_kn = false

"Constrain relative values of inter and intra-word gaps for old_to_method."

Definition at line 219 of file textord.h.

bool tesseract::Textord::tosp_old_to_method = false

"Space stats use prechopping?"

Definition at line 216 of file textord.h.

bool tesseract::Textord::tosp_only_small_gaps_for_kern = false

"Better guess"

Definition at line 239 of file textord.h.

bool tesseract::Textord::tosp_only_use_prop_rows = true

"Block stats to use fixed pitch rows?"

Definition at line 221 of file textord.h.

bool tesseract::Textord::tosp_only_use_xht_gaps = false

"Only use within xht gap for wd breaks"

Definition at line 248 of file textord.h.

double tesseract::Textord::tosp_pass_wide_fuzz_sp_to_context = 0.75

"How wide fuzzies need context"

Definition at line 324 of file textord.h.

bool tesseract::Textord::tosp_recovery_isolated_row_stats = true

"Use row alone when inadequate cert spaces"

Definition at line 238 of file textord.h.

int tesseract::Textord::tosp_redo_kern_limit = 10

"No.samples reqd to reestimate for row"

Definition at line 259 of file textord.h.

double tesseract::Textord::tosp_rep_space = 1.6

"rep gap multiplier for space"

Definition at line 294 of file textord.h.

bool tesseract::Textord::tosp_row_use_cert_spaces = true

"Only stat OBVIOUS spaces"

Definition at line 232 of file textord.h.

bool tesseract::Textord::tosp_row_use_cert_spaces1 = true

"Only stat OBVIOUS spaces"

Definition at line 236 of file textord.h.

bool tesseract::Textord::tosp_rule_9_test_punct = false

"Don't chng kn to space next to punct"

Definition at line 250 of file textord.h.

int tesseract::Textord::tosp_sanity_method = 1

"How to avoid being silly"

Definition at line 264 of file textord.h.

int tesseract::Textord::tosp_short_row = 20

"No.gaps reqd with few cert spaces to use certs"

Definition at line 263 of file textord.h.

double tesseract::Textord::tosp_silly_kn_sp_gap = 0.2

"Don't let sp minus kn get too small"

Definition at line 322 of file textord.h.

bool tesseract::Textord::tosp_stats_use_xht_gaps = true

"Use within xht gap for wd breaks"

Definition at line 244 of file textord.h.

double tesseract::Textord::tosp_table_fuzzy_kn_sp_ratio = 3.0

"Fuzzy if less than this"

Definition at line 302 of file textord.h.

double tesseract::Textord::tosp_table_kn_sp_ratio = 2.25

"Min difference of kn & sp in table"

Definition at line 298 of file textord.h.

double tesseract::Textord::tosp_table_xht_sp_ratio = 0.33

"Expect spaces bigger than this"

Definition at line 300 of file textord.h.

double tesseract::Textord::tosp_threshold_bias1 = 0

"how far between kern and space?"

Definition at line 269 of file textord.h.

double tesseract::Textord::tosp_threshold_bias2 = 0

"how far between kern and space?"

Definition at line 271 of file textord.h.

bool tesseract::Textord::tosp_use_pre_chopping = false

"Space stats use prechopping?"

Definition at line 226 of file textord.h.

bool tesseract::Textord::tosp_use_xht_gaps = true

"Use within xht gap for wd breaks"

Definition at line 246 of file textord.h.

double tesseract::Textord::tosp_wide_aspect_ratio = 0.0

"wide if w/h more than this"

Definition at line 278 of file textord.h.

double tesseract::Textord::tosp_wide_fraction = 0.52

"Fract of xheight for wide"

Definition at line 276 of file textord.h.


The documentation for this class was generated from the following files: