30 #pragma warning(disable:4244) // Conversion warnings
85 : left_margin_(-
MAX_INT32), right_margin_(MAX_INT32),
86 median_bottom_(MAX_INT32), median_top_(-MAX_INT32), median_size_(0),
87 median_left_(MAX_INT32), median_right_(-MAX_INT32), median_width_(0),
88 blob_type_(blob_type), flow_(
BTFT_NONE), good_blob_score_(0),
89 good_width_(false), good_column_(false),
90 left_key_tab_(false), right_key_tab_(false),
91 left_key_(0), right_key_(0), type_(
PT_UNKNOWN), vertical_(vertical),
92 working_set_(
NULL), last_add_was_vertical_(false), block_owned_(false),
93 desperately_merged_(false),
94 first_column_(-1), last_column_(-1), column_set_(NULL),
95 side_step_(0), top_spacing_(0), bottom_spacing_(0),
96 type_before_table_(PT_UNKNOWN), inside_table_column_(false),
97 nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
98 space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
100 memset(special_blobs_densities_, 0,
sizeof(special_blobs_densities_));
130 ColPartition_LIST* big_part_list) {
139 if (big_part_list !=
NULL) {
140 ColPartition_IT part_it(big_part_list);
141 part_it.add_to_end(single);
149 ColPartition_C_IT it(&upper_partners_);
150 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
151 it.data()->RemovePartner(
false,
this);
153 it.set_to_list(&lower_partners_);
154 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
155 it.data()->RemovePartner(
true,
this);
163 int left,
int bottom,
164 int right,
int top) {
166 part->bounding_box_ =
TBOX(left, bottom, right, top);
167 part->median_bottom_ = bottom;
168 part->median_top_ = top;
169 part->median_size_ = top - bottom;
170 part->median_width_ = right - left;
183 if (boxes_.length() == 0) {
186 bounding_box_ += box;
190 if (!last_add_was_vertical_) {
191 boxes_.sort(SortByBoxBottom<BLOBNBOX>);
192 last_add_was_vertical_ =
true;
194 boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>,
true, bbox);
196 if (last_add_was_vertical_) {
197 boxes_.sort(SortByBoxLeft<BLOBNBOX>);
198 last_add_was_vertical_ =
false;
200 boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>,
true, bbox);
207 tprintf(
"Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
209 bounding_box_.
left(), bounding_box_.
right());
214 BLOBNBOX_C_IT bb_it(&boxes_);
215 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
216 if (box == bb_it.data()) {
228 BLOBNBOX_C_IT bb_it(&boxes_);
229 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
232 if (biggest ==
NULL ||
236 if (biggest ==
NULL ||
247 BLOBNBOX_C_IT bb_it(&boxes_);
248 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
249 if (box != bb_it.data()) {
250 result += bb_it.data()->bounding_box();
259 BLOBNBOX_C_IT bb_it(&boxes_);
260 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
275 BLOBNBOX_C_IT bb_it(&boxes_);
276 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
288 for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
290 delete bblob->
cblob();
300 ColPartition_CLIST reversed_boxes;
301 ColPartition_C_IT reversed_it(&reversed_boxes);
303 BLOBNBOX_C_IT bb_it(&boxes_);
304 for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
305 reversed_it.add_before_then_move(bb_it.extract());
307 bb_it.add_list_after(&reversed_boxes);
309 int tmp = left_margin_;
310 left_margin_ = -right_margin_;
311 right_margin_ = -tmp;
322 if (bounding_box_.
left() > bounding_box_.
right()) {
324 tprintf(
"Bounding box invalid\n");
329 if (left_margin_ > bounding_box_.
left() ||
330 right_margin_ < bounding_box_.
right()) {
339 tprintf(
"Key inside box: %d v %d or %d v %d\n",
350 int y = (
MidY() + other.
MidY()) / 2;
393 if (bounding_box_.
right() < other.bounding_box_.
left() &&
396 if (other.bounding_box_.
right() < bounding_box_.
left() &&
399 if (bounding_box_.
left() > other.bounding_box_.
right() &&
402 if (other.bounding_box_.
left() > bounding_box_.
right() &&
410 double fractional_tolerance,
411 double constant_tolerance)
const {
413 int nonmatch_count = 0;
414 BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
415 BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
416 box_it.mark_cycle_pt();
417 other_it.mark_cycle_pt();
418 while (!box_it.cycled_list() && !other_it.cycled_list()) {
419 if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
420 fractional_tolerance,
428 return match_count > nonmatch_count;
439 BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
442 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
446 tprintf(
"Blob is not a diacritic:");
458 bool result = min_top > candidate.median_bottom_ &&
459 max_bottom < candidate.median_top_;
464 tprintf(
"y ranges don\'t overlap: %d-%d / %d-%d\n",
465 max_bottom, min_top, median_bottom_, median_top_);
474 if (tab_vector !=
NULL) {
478 left_key_tab_ =
false;
486 if (tab_vector !=
NULL) {
487 right_key_ = tab_vector->
sort_key();
490 right_key_tab_ =
false;
499 left_key_tab_ = take_box ?
false : src.left_key_tab_;
501 left_key_ = src.left_key_;
506 if (left_margin_ > bounding_box_.
left())
507 left_margin_ = src.left_margin_;
512 right_key_tab_ = take_box ?
false : src.right_key_tab_;
513 if (right_key_tab_) {
514 right_key_ = src.right_key_;
519 if (right_margin_ < bounding_box_.
right())
520 right_margin_ = src.right_margin_;
525 BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
526 return it.data()->left_rule();
530 BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
532 return it.data()->right_rule();
537 return special_blobs_densities_[
type];
542 BLOBNBOX_C_IT blob_it(&boxes_);
544 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
547 if (blob_type == type) {
558 special_blobs_densities_[
type] = density;
562 memset(special_blobs_densities_, 0,
sizeof(special_blobs_densities_));
563 if (boxes_.empty()) {
567 BLOBNBOX_C_IT blob_it(&boxes_);
568 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
571 special_blobs_densities_[
type]++;
575 special_blobs_densities_[
type] /= boxes_.length();
584 partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
586 upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
true, partner);
588 partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
590 lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
true, partner);
598 ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
599 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
600 if (it.data() == partner) {
609 ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
610 if (!partners->singleton())
612 ColPartition_C_IT it(partners);
624 bounding_box_.
bottom()) ||
626 other->bounding_box_.
bottom())) {
633 memset(special_blobs_densities_, 0,
sizeof(special_blobs_densities_));
635 int w1 = boxes_.length(), w2 = other->boxes_.length();
636 float new_val = special_blobs_densities_[
type] * w1 +
637 other->special_blobs_densities_[
type] * w2;
639 special_blobs_densities_[
type] = new_val / (w1 + w2);
644 BLOBNBOX_C_IT it(&boxes_);
645 BLOBNBOX_C_IT it2(&other->boxes_);
646 for (; !it2.empty(); it2.forward()) {
649 if (prev_owner != other && prev_owner !=
NULL) {
654 if (prev_owner == other)
656 it.add_to_end(bbox2);
658 left_margin_ =
MIN(left_margin_, other->left_margin_);
659 right_margin_ =
MAX(right_margin_, other->right_margin_);
660 if (other->left_key_ < left_key_) {
661 left_key_ = other->left_key_;
662 left_key_tab_ = other->left_key_tab_;
664 if (other->right_key_ > right_key_) {
665 right_key_ = other->right_key_;
666 right_key_tab_ = other->right_key_tab_;
671 flow_ = other->flow_;
672 blob_type_ = other->blob_type_;
676 boxes_.sort(SortByBoxBottom<BLOBNBOX>);
677 last_add_was_vertical_ =
true;
679 boxes_.sort(SortByBoxLeft<BLOBNBOX>);
680 last_add_was_vertical_ =
false;
685 for (
int upper = 0; upper < 2; ++upper) {
686 ColPartition_CLIST partners;
687 ColPartition_C_IT part_it(&partners);
688 part_it.add_list_after(upper ? &other->upper_partners_
689 : &other->lower_partners_);
690 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
715 int ok_box_overlap,
bool debug) {
719 tprintf(
"Vertical partition\n");
733 if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
734 merged_box.bottom() < bounding_box_.
top() - ok_box_overlap &&
735 merged_box.top() > bounding_box_.
bottom() + ok_box_overlap) {
737 tprintf(
"Excessive box overlap\n");
747 if (boxes_.empty() || boxes_.singleton())
749 BLOBNBOX_C_IT it(&boxes_);
750 TBOX left_box(it.data()->bounding_box());
751 for (it.forward(); !it.at_first(); it.forward()) {
754 if (left_box.overlap(box))
767 BLOBNBOX_C_IT it(&boxes_);
768 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
772 if (bbox == split_blob || !split_part->boxes_.empty()) {
773 split_part->
AddBox(it.extract());
785 right_key_tab_ =
false;
786 split_part->left_key_tab_ =
false;
801 if (split_x <= bounding_box_.
left() || split_x >= bounding_box_.
right())
805 BLOBNBOX_C_IT it(&boxes_);
806 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
811 if (box.
left() >= split_x) {
812 split_part->
AddBox(it.extract());
824 right_key_tab_ =
false;
825 split_part->left_key_tab_ =
false;
826 right_margin_ = split_x;
827 split_part->left_margin_ = split_x;
835 bounding_box_ =
TBOX();
836 BLOBNBOX_C_IT it(&boxes_);
838 int non_leader_count = 0;
840 bounding_box_.
set_left(left_margin_);
845 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
857 tprintf(
"Computed left-illegal partition\n");
863 tprintf(
"Computed right-illegal partition\n");
870 median_top_ = bounding_box_.
top();
871 median_bottom_ = bounding_box_.
bottom();
872 median_size_ = bounding_box_.
height();
873 median_left_ = bounding_box_.
left();
874 median_right_ = bounding_box_.
right();
875 median_width_ = bounding_box_.
width();
878 STATS bottom_stats(bounding_box_.
bottom(), bounding_box_.
top() + 1);
880 STATS left_stats(bounding_box_.
left(), bounding_box_.
right() + 1);
881 STATS right_stats(bounding_box_.
left(), bounding_box_.
right() + 1);
882 STATS width_stats(0, bounding_box_.
width() + 1);
883 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
887 int area = box.
area();
888 top_stats.add(box.
top(), area);
889 bottom_stats.add(box.
bottom(), area);
890 size_stats.add(box.
height(), area);
891 left_stats.add(box.
left(), area);
892 right_stats.add(box.
right(), area);
893 width_stats.add(box.
width(), area);
896 median_top_ =
static_cast<int>(top_stats.median() + 0.5);
897 median_bottom_ =
static_cast<int>(bottom_stats.median() + 0.5);
898 median_size_ =
static_cast<int>(size_stats.median() + 0.5);
899 median_left_ =
static_cast<int>(left_stats.median() + 0.5);
900 median_right_ =
static_cast<int>(right_stats.median() + 0.5);
901 median_width_ =
static_cast<int>(width_stats.median() + 0.5);
905 tprintf(
"Made partition with bad right coords");
909 tprintf(
"Made partition with bad left coords");
915 for (
int upper = 0; upper < 2; ++upper) {
916 ColPartition_CLIST partners;
917 ColPartition_C_IT part_it(&partners);
918 part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
919 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
926 bounding_box_.
bottom())) {
927 tprintf(
"Recomputed box for partition %p\n",
this);
934 BLOBNBOX_C_IT it(&boxes_);
935 int overlap_count = 0;
936 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
941 return overlap_count;
947 int first_spanned_col = -1;
950 bounding_box_.
left(), bounding_box_.
right(),
951 MidY(), left_margin_, right_margin_,
952 &first_column_, &last_column_,
954 column_set_ = columns;
955 if (first_column_ < last_column_ && span_type ==
CST_PULLOUT &&
959 if (first_spanned_col >= 0) {
960 first_column_ = first_spanned_col;
961 last_column_ = first_spanned_col;
963 if ((first_column_ & 1) == 0)
964 last_column_ = first_column_;
965 else if ((last_column_ & 1) == 0)
966 first_column_ = last_column_;
968 first_column_ = last_column_ = (first_column_ + last_column_) / 2;
986 switch (blob_type_) {
1029 int* first_col,
int* last_col) {
1030 int first_spanned_col = -1;
1033 bounding_box_.
left(), bounding_box_.
right(),
1034 MidY(), left_margin_, right_margin_,
1035 first_col, last_col,
1036 &first_spanned_col);
1044 good_width_ = cb->
Run(width);
1045 good_column_ = blob_type_ ==
BRT_TEXT && left_key_tab_ && right_key_tab_;
1055 bool result =
false;
1057 int part_width = bounding_box_.
width();
1058 STATS gap_stats(0, part_width);
1059 STATS width_stats(0, part_width);
1060 BLOBNBOX_C_IT it(&boxes_);
1065 for (it.forward(); !it.at_first(); it.forward()) {
1070 width_stats.
add(right - left, 1);
1075 double median_gap = gap_stats.
median();
1077 double max_width =
MAX(median_gap, median_width);
1078 double min_width =
MIN(median_gap, median_width);
1079 double gap_iqr = gap_stats.
ile(0.75
f) - gap_stats.
ile(0.25
f);
1081 tprintf(
"gap iqr = %g, blob_count=%d, limits=%g,%g\n",
1091 int offset =
static_cast<int>(ceil(gap_iqr * 2));
1092 int min_step =
static_cast<int>(median_gap + median_width + 0.5);
1093 int max_step = min_step + offset;
1096 int part_left = bounding_box_.
left() - min_step / 2;
1097 part_width += min_step;
1099 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1104 for (
int x = left; x < right; ++x) {
1110 part_width, projection);
1114 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1118 if (it.at_first()) {
1119 int gap = it.data_relative(1)->bounding_box().
left() -
1128 it.data_relative(-1)->bounding_box().right();
1140 if (best_end ==
NULL) {
1143 tprintf(
"Total cost = %d vs allowed %d\n",
1147 delete [] projection;
1161 int good_blob_score_ = 0;
1162 int noisy_count = 0;
1163 int hline_count = 0;
1164 int vline_count = 0;
1165 BLOBNBOX_C_IT it(&boxes_);
1166 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1176 if (hline_count > vline_count) {
1179 }
else if (vline_count > hline_count) {
1182 }
else if (value < -1 || 1 < value) {
1186 long_side = bounding_box_.
width();
1187 short_side = bounding_box_.
height();
1190 long_side = bounding_box_.
height();
1191 short_side = bounding_box_.
width();
1207 if (flow_ ==
BTFT_CHAIN && strong_score == 3)
1215 if (noisy_count >= blob_count) {
1221 bounding_box_.
bottom())) {
1222 tprintf(
"RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1223 blob_count, noisy_count, good_blob_score_);
1224 tprintf(
" Projection value=%d, flow=%d, blob_type=%d\n",
1225 value, flow_, blob_type_);
1236 BLOBNBOX_C_IT it(&boxes_);
1237 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1254 int total_height = 0;
1256 int height_count = 0;
1258 BLOBNBOX_C_IT it(&boxes_);
1259 TBOX box(it.data()->bounding_box());
1265 ICOORD first_pt(box.right(), box.bottom());
1268 linepoints.
Add(first_pt);
1269 for (it.forward(); !it.at_last(); it.forward()) {
1272 ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1273 linepoints.
Add(box_pt);
1274 total_height += box.width();
1275 coverage += box.height();
1278 box = it.data()->bounding_box();
1279 ICOORD last_pt(box.right(), box.top());
1280 linepoints.
Add(last_pt);
1281 width = last_pt.y() - first_pt.y();
1285 TBOX box(it.data()->bounding_box());
1288 ICOORD first_pt(box.left(), box.bottom());
1289 linepoints.
Add(first_pt);
1290 for (it.forward(); !it.at_last(); it.forward()) {
1293 ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1294 linepoints.
Add(box_pt);
1295 total_height += box.height();
1296 coverage += box.width();
1299 box = it.data()->bounding_box();
1300 ICOORD last_pt(box.right(), box.bottom());
1301 linepoints.
Add(last_pt);
1302 width = last_pt.x() - first_pt.x();
1307 double error = linepoints.
Fit(&start_pt, &end_pt);
1315 ColPartition_LIST* used_parts,
1316 WorkingPartSet_LIST* working_sets) {
1319 block_owned_ =
true;
1320 WorkingPartSet_IT it(working_sets);
1323 if (partner !=
NULL && partner->working_set_ !=
NULL) {
1324 working_set_ = partner->working_set_;
1329 tprintf(
"Partition with partner has no working set!:");
1337 for (it.mark_cycle_pt(); !it.cycled_list() &&
1338 col_index != first_column_;
1339 it.forward(), ++col_index);
1341 tprintf(
"Match is %s for:", (col_index & 1) ?
"Real" :
"Between");
1345 tprintf(
"Target column=%d, only had %d\n", first_column_, col_index);
1348 work_set = it.data();
1351 if (!it.cycled_list() && last_column_ != first_column_) {
1353 BLOCK_LIST completed_blocks;
1354 TO_BLOCK_LIST to_blocks;
1355 for (; !it.cycled_list() && col_index <= last_column_;
1356 it.forward(), ++col_index) {
1359 &completed_blocks, &to_blocks);
1361 work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1363 working_set_ = work_set;
1375 ColPartition_LIST* block_parts,
1376 ColPartition_LIST* used_parts,
1377 BLOCK_LIST* completed_blocks,
1378 TO_BLOCK_LIST* to_blocks) {
1379 int page_height = tright.
y() - bleft.
y();
1381 ColPartition_IT it(block_parts);
1383 int max_line_height = 0;
1389 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1395 BLOBNBOX_C_IT blob_it(part->
boxes());
1396 int prev_bottom = blob_it.data()->bounding_box().bottom();
1397 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1400 int step = bottom - prev_bottom;
1403 side_steps.add(step, 1);
1404 prev_bottom = bottom;
1406 part->
set_side_step(static_cast<int>(side_steps.median() + 0.5));
1407 if (!it.at_last()) {
1418 tprintf(
"side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1423 if (part_count == 0)
1426 SmoothSpacings(resolution, page_height, block_parts);
1429 BLOCK_IT block_it(completed_blocks);
1430 TO_BLOCK_IT to_block_it(to_blocks);
1431 ColPartition_LIST spacing_parts;
1432 ColPartition_IT sp_block_it(&spacing_parts);
1434 for (it.mark_cycle_pt(); !it.empty();) {
1436 sp_block_it.add_to_end(part);
1438 if (it.empty() || part->
bottom_spacing() > same_block_threshold ||
1439 !part->SpacingsEqual(*it.data(), resolution)) {
1442 if (!it.empty() && part->
bottom_spacing() <= same_block_threshold) {
1448 tprintf(
"Spacings unequal: upper:%d/%d, lower:%d/%d,"
1449 " sizes %d %d %d\n",
1457 if (part->SizesSimilar(*next_part) &&
1458 next_part->
median_size() * kMaxSameBlockLineSpacing >
1464 if (third_part ==
NULL ||
1465 !next_part->SizesSimilar(*third_part) ||
1466 third_part->
median_size() * kMaxSameBlockLineSpacing <=
1468 next_part->
median_size() * kMaxSameBlockLineSpacing <=
1472 sp_block_it.add_to_end(it.extract());
1475 tprintf(
"Added line to current block.\n");
1481 if (to_block !=
NULL) {
1482 to_block_it.add_to_end(to_block);
1483 block_it.add_to_end(to_block->
block);
1485 sp_block_it.set_to_list(&spacing_parts);
1489 tprintf(
"Spacings equal: upper:%d/%d, lower:%d/%d\n",
1500 if (pos->
x() < bleft.
x())
1502 if (pos->
x() > tright.
x())
1504 if (pos->
y() < bleft.
y())
1506 if (pos->
y() > tright.
y())
1514 static TO_BLOCK* MoveBlobsToBlock(
bool vertical_text,
int line_spacing,
1516 ColPartition_LIST* block_parts,
1517 ColPartition_LIST* used_parts) {
1523 STATS sizes(0,
MAX(block_box.width(), block_box.height()));
1525 ColPartition_IT it(block_parts);
1527 BLOBNBOX_IT blob_it(&to_block->
blobs);
1528 ColPartition_IT used_it(used_parts);
1529 for (it.move_to_first(); !it.empty(); it.forward()) {
1530 ColPartition* part = it.extract();
1534 for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
1537 if (bblob->
owner() != part) {
1538 tprintf(
"Ownership incorrect for blob:");
1555 C_OUTLINE_IT ol_it(outlines);
1556 if (outlines->singleton()) {
1557 ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1562 blob_it.add_after_then_move(bblob);
1566 for (;!ol_it.empty(); ol_it.forward()) {
1573 blob_it.add_after_then_move(blob);
1575 delete bblob->
cblob();
1579 used_it.add_to_end(part);
1581 if (text_type && blob_it.empty()) {
1587 if (vertical_text) {
1589 if (block_width < line_spacing)
1590 line_spacing = block_width;
1591 to_block->
line_spacing =
static_cast<float>(line_spacing);
1592 to_block->
max_blob_size =
static_cast<float>(block_width + 1);
1595 if (block_height < line_spacing)
1596 line_spacing = block_height;
1597 to_block->
line_spacing =
static_cast<float>(line_spacing);
1598 to_block->
max_blob_size =
static_cast<float>(block_height + 1);
1606 ColPartition_LIST* block_parts,
1607 ColPartition_LIST* used_parts) {
1608 if (block_parts->empty())
1610 ColPartition_IT it(block_parts);
1621 ICOORDELT_LIST vertices;
1622 ICOORDELT_IT vert_it(&vertices);
1631 ColPartition::LeftEdgeRun(&it, &start, &end);
1633 ColPartition::RightEdgeRun(&it, &start, &end);
1634 ClipCoord(bleft, tright, &start);
1635 ClipCoord(bleft, tright, &end);
1636 vert_it.add_after_then_move(
new ICOORDELT(start));
1637 vert_it.add_after_then_move(
new ICOORDELT(end));
1642 if ((iteration == 0 && it.at_first()) ||
1643 (iteration == 1 && it.at_last())) {
1647 }
while (iteration < 2);
1649 tprintf(
"Making block at (%d,%d)->(%d,%d)\n",
1650 min_x, min_y, max_x, max_y);
1651 BLOCK* block =
new BLOCK(
"",
true, 0, 0, min_x, min_y, max_x, max_y);
1653 return MoveBlobsToBlock(
false, line_spacing, block, block_parts, used_parts);
1660 ColPartition_LIST* block_parts,
1661 ColPartition_LIST* used_parts) {
1662 if (block_parts->empty())
1664 ColPartition_IT it(block_parts);
1667 int line_spacing = block_box.
width();
1669 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1670 block_box += it.data()->bounding_box();
1677 block_box.
right(), block_box.
top());
1679 return MoveBlobsToBlock(
true, line_spacing, block, block_parts, used_parts);
1686 part->left_margin_ = left_margin_;
1687 part->right_margin_ = right_margin_;
1688 part->bounding_box_ = bounding_box_;
1689 memcpy(part->special_blobs_densities_, special_blobs_densities_,
1690 sizeof(special_blobs_densities_));
1691 part->median_bottom_ = median_bottom_;
1692 part->median_top_ = median_top_;
1693 part->median_size_ = median_size_;
1694 part->median_left_ = median_left_;
1695 part->median_right_ = median_right_;
1696 part->median_width_ = median_width_;
1697 part->good_width_ = good_width_;
1698 part->good_column_ = good_column_;
1699 part->left_key_tab_ = left_key_tab_;
1700 part->right_key_tab_ = right_key_tab_;
1701 part->type_ = type_;
1702 part->flow_ = flow_;
1703 part->left_key_ = left_key_;
1704 part->right_key_ = right_key_;
1705 part->first_column_ = first_column_;
1706 part->last_column_ = last_column_;
1707 part->owns_blobs_ =
false;
1714 BLOBNBOX_C_IT inserter(copy->
boxes());
1715 BLOBNBOX_C_IT traverser(
boxes());
1716 for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
1717 inserter.add_after_then_move(traverser.data());
1721 #ifndef GRAPHICS_DISABLED
1729 #endif // GRAPHICS_DISABLED
1732 static char kBlobTypes[
BRT_COUNT + 1] =
"NHSRIUVT";
1737 tprintf(
"ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1738 " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1739 " ts=%d bs=%d ls=%d rs=%d\n",
1740 boxes_.empty() ?
'E' :
' ',
1741 left_margin_, left_key_tab_ ?
'T' :
'B',
LeftAtY(y),
1742 bounding_box_.
left(), median_left_,
1743 bounding_box_.
bottom(), median_bottom_,
1744 bounding_box_.
right(),
RightAtY(y), right_key_tab_ ?
'T' :
'B',
1745 right_margin_, median_right_, bounding_box_.
top(), median_top_,
1746 good_width_, good_column_, type_,
1747 kBlobTypes[blob_type_], flow_,
1748 first_column_, last_column_, boxes_.length(),
1749 space_above_, space_below_, space_to_left_, space_to_right_);
1754 tprintf(
"Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
1755 color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
1756 color1_[L_ALPHA_CHANNEL],
1757 color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1762 STATS left_stats(0, working_set_count);
1763 STATS right_stats(0, working_set_count);
1768 if (partner->type_ > max_type)
1769 max_type = partner->type_;
1770 if (column_set_ == partner->column_set_) {
1771 left_stats.
add(partner->first_column_, 1);
1772 right_stats.
add(partner->last_column_, 1);
1780 first_column_ = left_stats.
mode();
1781 last_column_ = right_stats.
mode();
1782 if (last_column_ < first_column_)
1783 last_column_ = first_column_;
1788 partner->type_ = max_type;
1789 #if 0 // See TODO above
1790 if (column_set_ == partner->column_set_) {
1791 partner->first_column_ = first_column_;
1792 partner->last_column_ = last_column_;
1833 RefinePartnersInternal(
true, get_desperate, grid);
1834 RefinePartnersInternal(
false, get_desperate, grid);
1838 RefinePartnersByType(
true, &upper_partners_);
1839 RefinePartnersByType(
false, &lower_partners_);
1843 if (!upper_partners_.empty() && !upper_partners_.singleton())
1844 RefinePartnersByOverlap(
true, &upper_partners_);
1845 if (!lower_partners_.empty() && !lower_partners_.singleton())
1846 RefinePartnersByOverlap(
false, &lower_partners_);
1855 void ColPartition::RefinePartnersInternal(
bool upper,
bool get_desperate,
1857 ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
1858 if (!partners->empty() && !partners->singleton()) {
1859 RefinePartnersByType(upper, partners);
1860 if (!partners->empty() && !partners->singleton()) {
1862 RefinePartnerShortcuts(upper, partners);
1863 if (!partners->empty() && !partners->singleton()) {
1867 RefineTextPartnersByMerge(upper,
false, partners, grid);
1868 if (!partners->empty() && !partners->singleton())
1869 RefineTextPartnersByMerge(upper,
true, partners, grid);
1872 if (!partners->empty() && !partners->singleton())
1873 RefinePartnersByOverlap(upper, partners);
1882 void ColPartition::RefinePartnersByType(
bool upper,
1883 ColPartition_CLIST* partners) {
1887 tprintf(
"Refining %d %s partners by type for:\n",
1888 partners->length(), upper ?
"Upper" :
"Lower");
1891 ColPartition_C_IT it(partners);
1897 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1904 partner->RemovePartner(!upper,
this);
1913 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1921 partner->RemovePartner(!upper,
this);
1936 void ColPartition::RefinePartnerShortcuts(
bool upper,
1937 ColPartition_CLIST* partners) {
1938 bool done_any =
false;
1941 ColPartition_C_IT it(partners);
1942 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1946 ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
1947 for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
1952 a->RemovePartner(!upper,
this);
1955 ColPartition_C_IT it2(partners);
1956 for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
1961 b2->RemovePartner(!upper,
this);
1974 }
while (done_any && !partners->empty() && !partners->singleton());
1985 void ColPartition::RefineTextPartnersByMerge(
bool upper,
bool desperate,
1986 ColPartition_CLIST* partners,
1987 ColPartitionGrid* grid) {
1991 tprintf(
"Refining %d %s partners by merge for:\n",
1992 partners->length(), upper ?
"Upper" :
"Lower");
1995 while (!partners->empty() && !partners->singleton()) {
1998 ColPartition_C_IT it(partners);
2002 ColPartition_CLIST candidates;
2003 ColPartition_C_IT cand_it(&candidates);
2004 for (it.forward(); !it.at_first(); it.forward()) {
2006 if (part->first_column_ == candidate->last_column_ &&
2007 part->last_column_ == candidate->first_column_)
2008 cand_it.add_after_then_move(it.data());
2010 int overlap_increase;
2011 ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
2012 NULL, &overlap_increase);
2013 if (candidate !=
NULL && (overlap_increase <= 0 || desperate)) {
2015 tprintf(
"Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2016 part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2020 grid->RemoveBBox(candidate);
2021 grid->RemoveBBox(part);
2022 part->Absorb(candidate,
NULL);
2024 grid->InsertBBox(
true,
true, part);
2025 if (overlap_increase > 0)
2026 part->desperately_merged_ =
true;
2035 void ColPartition::RefinePartnersByOverlap(
bool upper,
2036 ColPartition_CLIST* partners) {
2040 tprintf(
"Refining %d %s partners by overlap for:\n",
2041 partners->length(), upper ?
"Upper" :
"Lower");
2044 ColPartition_C_IT it(partners);
2047 int best_overlap = 0;
2048 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2050 int overlap =
MIN(bounding_box_.
right(), partner->bounding_box_.right())
2051 -
MAX(bounding_box_.
left(), partner->bounding_box_.left());
2052 if (overlap > best_overlap) {
2053 best_overlap = overlap;
2054 best_partner = partner;
2058 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2060 if (partner != best_partner) {
2065 partner->RemovePartner(!upper,
this);
2072 bool ColPartition::ThisPartitionBetter(
BLOBNBOX* bbox,
2073 const ColPartition& other) {
2076 int left = box.
left();
2077 int right = box.
right();
2078 if (left < left_margin_ || right > right_margin_)
2080 if (left < other.left_margin_ || right > other.right_margin_)
2082 int top = box.
top();
2083 int bottom = box.
bottom();
2084 int this_overlap =
MIN(top, median_top_) -
MAX(bottom, median_bottom_);
2085 int other_overlap =
MIN(top, other.median_top_) -
2086 MAX(bottom, other.median_bottom_);
2087 int this_miss = median_top_ - median_bottom_ - this_overlap;
2088 int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2090 tprintf(
"Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2092 this_overlap, other_overlap, this_miss, other_miss,
2093 median_top_, other.median_top_);
2095 if (this_miss < other_miss)
2097 if (this_miss > other_miss)
2099 if (this_overlap > other_overlap)
2101 if (this_overlap < other_overlap)
2103 return median_top_ >= other.median_top_;
2110 static int MedianSpacing(
int page_height, ColPartition_IT it) {
2111 STATS stats(0, page_height);
2112 while (!it.cycled_list()) {
2113 ColPartition* part = it.data();
2115 stats.add(part->bottom_spacing(), 1);
2116 stats.add(part->top_spacing(), 1);
2118 return static_cast<int>(stats.median() + 0.5);
2132 return (last_column_ >= part.first_column_) &&
2133 (first_column_ <= part.last_column_);
2139 void ColPartition::SmoothSpacings(
int resolution,
int page_height,
2140 ColPartition_LIST* parts) {
2148 ColPartition_IT it(parts);
2155 int median_space = MedianSpacing(page_height, it);
2156 ColPartition_IT start_it(it);
2157 ColPartition_IT end_it(it);
2158 for (
int i = 0; i < PN_COUNT; ++i) {
2159 if (i < PN_UPPER || it.cycled_list()) {
2160 neighbourhood[i] =
NULL;
2164 neighbourhood[i] = it.data();
2168 while (neighbourhood[PN_UPPER] !=
NULL) {
2190 if (neighbourhood[PN_LOWER] ==
NULL ||
2191 (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2193 !OKSpacingBlip(resolution, median_space, neighbourhood) &&
2194 (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
2195 !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2196 (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
2197 !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2200 ColPartition_IT sum_it(start_it);
2202 double total_bottom = 0.0;
2203 double total_top = 0.0;
2204 int total_count = 0;
2207 while (upper != last_part) {
2208 total_bottom += upper->bottom_spacing();
2209 total_top += upper->top_spacing();
2212 upper = sum_it.data();
2214 if (total_count > 0) {
2216 int top_spacing =
static_cast<int>(total_top / total_count + 0.5);
2217 int bottom_spacing =
static_cast<int>(total_bottom / total_count + 0.5);
2219 tprintf(
"Spacing run ended. Cause:");
2220 if (neighbourhood[PN_LOWER] ==
NULL) {
2223 tprintf(
"Spacing change. Spacings:\n");
2224 for (
int i = 0; i < PN_COUNT; ++i) {
2225 if (neighbourhood[i] ==
NULL) {
2227 if (i > 0 && neighbourhood[i - 1] !=
NULL) {
2232 tprintf(
" NULL lower partner:\n");
2238 tprintf(
"Top = %d, bottom = %d\n",
2244 tprintf(
"Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
2247 upper = sum_it.data();
2248 while (upper != last_part) {
2249 upper->set_top_spacing(top_spacing);
2250 upper->set_bottom_spacing(bottom_spacing);
2256 upper = sum_it.data();
2263 median_space = MedianSpacing(page_height, end_it);
2266 for (
int j = 1; j < PN_COUNT; ++j) {
2267 neighbourhood[j - 1] = neighbourhood[j];
2269 if (it.cycled_list()) {
2270 neighbourhood[PN_COUNT - 1] =
NULL;
2272 neighbourhood[PN_COUNT - 1] = it.data();
// Tests the partitions in parts (indexed by the PN_* constants) against
// median_spacing. The visible result is true when the PN_UPPER/PN_LOWER
// pair passes SummedSpacingOK and at least one non-NULL outer neighbour
// (PN_ABOVE1 or PN_BELOW1) passes SpacingEqual, both against median_spacing.
// resolution is forwarded unchanged to those helpers.
// NOTE(review): the statement guarded by the NULL check below is not visible
// in this listing; presumably it returns false - confirm.
2282 bool ColPartition::OKSpacingBlip(
int resolution,
int median_spacing,
2283 ColPartition** parts) {
2284 if (parts[PN_UPPER] ==
NULL || parts[PN_LOWER] ==
NULL)
2288 return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
2289 median_spacing, resolution) &&
// Either outer neighbour is enough; each is NULL-checked before use.
2290 ((parts[PN_ABOVE1] !=
NULL &&
2291 parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2292 (parts[PN_BELOW1] !=
NULL &&
2293 parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
// Compares this partition's stored spacing with the given spacing value.
// Tolerances come from BottomSpacingMargin(resolution) and
// TopSpacingMargin(resolution).
// NOTE(review): only the bottom_spacing_ comparison is visible; the operand
// after "&&" is missing from this listing. Presumably it compares
// top_spacing_ with spacing using top_error - confirm.
2298 bool ColPartition::SpacingEqual(
int spacing,
int resolution)
const {
2299 int bottom_error = BottomSpacingMargin(resolution);
2300 int top_error = TopSpacingMargin(resolution);
2301 return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
// Compares the stored spacings of this partition with those of other.
// Each tolerance is the larger of the two partitions' margins for the given
// resolution. The bottom spacings must be nearly equal, and additionally
// either the top spacings are nearly equal or the sum of the two top
// spacings is nearly equal to twice this partition's bottom spacing.
// NOTE(review): the tolerance argument of the last NearlyEqual call and the
// closing of the expression are not visible in this listing.
2307 bool ColPartition::SpacingsEqual(
const ColPartition& other,
2308 int resolution)
const {
// Use the more permissive (larger) margin of the two partitions.
2309 int bottom_error =
MAX(BottomSpacingMargin(resolution),
2310 other.BottomSpacingMargin(resolution));
2311 int top_error =
MAX(TopSpacingMargin(resolution),
2312 other.TopSpacingMargin(resolution));
2313 return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2314 (
NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2315 NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
// Tests the summed spacings of this partition and other against spacing.
// bottom_total and top_total are the sums of the two partitions' bottom and
// top spacings; tolerances are the larger of the two partitions' margins.
// The visible comparisons check bottom_total against spacing and against
// spacing * 2.
// NOTE(review): the second half of each conjunction and the operator joining
// the two parenthesised groups are missing from this listing. Presumably each
// group also checks top_total with top_error and the groups are OR-ed -
// confirm.
2322 bool ColPartition::SummedSpacingOK(
const ColPartition& other,
2323 int spacing,
int resolution)
const {
2324 int bottom_error =
MAX(BottomSpacingMargin(resolution),
2325 other.BottomSpacingMargin(resolution));
2326 int top_error =
MAX(TopSpacingMargin(resolution),
2327 other.TopSpacingMargin(resolution));
2328 int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2329 int top_total = top_spacing_ + other.top_spacing_;
2330 return (
NearlyEqual(spacing, bottom_total, bottom_error) &&
2332 (
NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
// Returns an int margin for the given resolution. SpacingEqual,
// SpacingsEqual and SummedSpacingOK pass it to NearlyEqual as the tolerance
// for bottom-spacing comparisons.
// NOTE(review): the body is not visible in this listing, so the formula is
// not documented here.
2338 int ColPartition::BottomSpacingMargin(
int resolution)
const {
// Returns an int margin for the given resolution. SpacingEqual,
// SpacingsEqual and SummedSpacingOK pass it to NearlyEqual as the tolerance
// for top-spacing comparisons.
// NOTE(review): only the tail of the result expression is visible; it ends
// with BottomSpacingMargin(resolution). The term(s) preceding it are missing
// from this listing.
2344 int ColPartition::TopSpacingMargin(
int resolution)
const {
2346 BottomSpacingMargin(resolution);
// Compares median_size_ of this partition with that of other.
// The visible condition requires this partition's median_size_ to be no more
// than other's median_size_ multiplied by kMaxSizeRatio.
// NOTE(review): the operand after "&&" is missing from this listing;
// presumably it is the symmetric check with the roles swapped - confirm.
2351 bool ColPartition::SizesSimilar(
const ColPartition& other)
const {
2352 return median_size_ <= other.median_size_ *
kMaxSizeRatio &&
// Helper for LeftEdgeRun: intersects the running sort-key interval
// [*margin_left, *margin_right] with the interval spanned by part's left
// margin and the left edge of its bounding box.
// Sort keys are computed via part.SortKey() at both the top and the bottom of
// the box; the larger of the two margin keys and the smaller of the two
// box-edge keys form the partition's own interval.
// When the two intervals overlap, *margin_left and *margin_right are
// narrowed to the intersection.
// NOTE(review): the return statements are not visible in this listing.
// LeftEdgeRun uses the result as a loop-continue condition, so presumably
// true means the intervals overlapped - confirm.
2359 static bool UpdateLeftMargin(
const ColPartition& part,
2360 int* margin_left,
int* margin_right) {
2361 const TBOX& part_box = part.bounding_box();
2362 int top = part_box.
top();
2363 int bottom = part_box.
bottom();
// Keys for the left margin (tl/bl) and the box's left edge (tr/br).
2364 int tl_key = part.SortKey(part.left_margin(), top);
2365 int tr_key = part.SortKey(part_box.
left(), top);
2366 int bl_key = part.SortKey(part.left_margin(), bottom);
2367 int br_key = part.SortKey(part_box.
left(), bottom);
// Tightest interval that holds at both the top and the bottom of the box.
2368 int left_key =
MAX(tl_key, bl_key);
2369 int right_key =
MIN(tr_key, br_key);
// Overlap test, then shrink the running interval to the intersection.
2370 if (left_key <= *margin_right && right_key >= *margin_left) {
2372 *margin_right =
MIN(*margin_right, right_key);
2373 *margin_left =
MAX(*margin_left, left_key);
// Computes the start and end points of a run of left edges, beginning at the
// partition under part_it.
// Visible behaviour:
//  - start_y begins at the top of the first partition's bounding box and is
//    adjusted using the bottom of the previous partition when part_it is not
//    at the first element.
//  - UpdateLeftMargin accumulates a common margin interval over successive
//    partitions; the loop conditions stop at the list start or when
//    UpdateLeftMargin returns false.
//  - If the partition that ended the run has a margin interval lying wholly
//    to the right of the accumulated one, a copy of the iterator (next_it)
//    is run ahead and part_it is then stepped backward while the partitions
//    still fit that next interval.
//  - start and end are filled in using XAtY(margin_right, y) for the x
//    coordinates, and a debug line is printed with tprintf.
// NOTE(review): the rest of the signature, the declarations of part,
// start_part and the margin variables, the loop heads, the end->set_y call
// and the condition guarding tprintf are not visible in this listing.
2384 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
2388 int start_y = part->bounding_box_.top();
2389 if (!part_it->at_first()) {
2390 int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
// Previous bottom below our top: start there. Above our top: take the
// midpoint of the two values.
2391 if (prev_bottom < start_y)
2392 start_y = prev_bottom;
2393 else if (prev_bottom > start_y)
2394 start_y = (start_y + prev_bottom) / 2;
2396 int end_y = part->bounding_box_.bottom();
2399 UpdateLeftMargin(*part, &margin_left, &margin_right);
2402 part = part_it->data();
2403 }
while (!part_it->at_first() &&
2404 UpdateLeftMargin(*part, &margin_left, &margin_right));
2410 UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2411 if (next_margin_left > margin_right) {
2412 ColPartition_IT next_it(*part_it);
2415 part = next_it.data();
2416 }
while (!next_it.at_first() &&
2417 UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2421 part_it->backward();
2422 part = part_it->data();
2423 }
while (part != start_part &&
2424 UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
// The run ends at the partition just before part_it's current position.
2428 part = part_it->data_relative(-1);
2429 end_y = part->bounding_box_.bottom();
2430 if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
2431 end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2432 start->
set_y(start_y);
2433 start->
set_x(part->XAtY(margin_right, start_y));
2435 end->
set_x(part->XAtY(margin_right, end_y));
2437 tprintf(
"Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2438 start_y, end_y, part->XAtY(margin_left, end_y),
2439 end->
x(), part->left_margin_, part->bounding_box_.left());
// Helper for RightEdgeRun: intersects the running sort-key interval
// [*margin_left, *margin_right] with the interval spanned by the right edge
// of part's bounding box and part's right margin.
// Sort keys are computed via part.SortKey() at both the top and the bottom of
// the box; the larger of the two box-edge keys and the smaller of the two
// margin keys form the partition's own interval.
// When the two intervals overlap, *margin_left and *margin_right are
// narrowed to the intersection.
// NOTE(review): the return statements are not visible in this listing.
// RightEdgeRun uses the result as a loop-continue condition, so presumably
// true means the intervals overlapped - confirm.
2445 static bool UpdateRightMargin(
const ColPartition& part,
2446 int* margin_left,
int* margin_right) {
2447 const TBOX& part_box = part.bounding_box();
2448 int top = part_box.
top();
2449 int bottom = part_box.
bottom();
// Keys for the box's right edge (tl/bl) and the right margin (tr/br).
2450 int tl_key = part.SortKey(part_box.
right(), top);
2451 int tr_key = part.SortKey(part.right_margin(), top);
2452 int bl_key = part.SortKey(part_box.
right(), bottom);
2453 int br_key = part.SortKey(part.right_margin(), bottom);
// Tightest interval that holds at both the top and the bottom of the box.
2454 int left_key =
MAX(tl_key, bl_key);
2455 int right_key =
MIN(tr_key, br_key);
// Overlap test, then shrink the running interval to the intersection.
2456 if (left_key <= *margin_right && right_key >= *margin_left) {
2458 *margin_right =
MIN(*margin_right, right_key);
2459 *margin_left =
MAX(*margin_left, left_key);
// Computes the start and end points of a run of right edges, beginning at
// the partition under part_it. The visible code mirrors LeftEdgeRun with the
// direction reversed.
// Visible behaviour:
//  - start_y begins at the bottom of the first partition's bounding box and
//    is adjusted using the top of the next partition when part_it is not at
//    the last element.
//  - part_it is stepped backward while UpdateRightMargin keeps accumulating a
//    common margin interval; the loop conditions stop at the list end
//    (at_last) or when UpdateRightMargin returns false.
//  - If the partition that ended the run has a margin interval lying wholly
//    to the left of the accumulated one, a copy of the iterator (next_it) is
//    run ahead and part_it is then moved while the partitions still fit that
//    next interval.
//  - start and end are filled in using XAtY(margin_left, y) for the x
//    coordinates, and a debug line is printed with tprintf.
// NOTE(review): the rest of the signature, the declarations of part,
// start_part and the margin variables, the loop heads, the assignment under
// "next_y > start_y", the end->set_y call and the condition guarding tprintf
// are not visible in this listing.
2471 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
2475 int start_y = part->bounding_box_.bottom();
2476 if (!part_it->at_last()) {
2477 int next_y = part_it->data_relative(1)->bounding_box_.top();
2478 if (next_y > start_y)
// Next top below our bottom: take the midpoint of the two values.
2480 else if (next_y < start_y)
2481 start_y = (start_y + next_y) / 2;
2483 int end_y = part->bounding_box_.top();
2486 UpdateRightMargin(*part, &margin_left, &margin_right);
2488 part_it->backward();
2489 part = part_it->data();
2490 }
while (!part_it->at_last() &&
2491 UpdateRightMargin(*part, &margin_left, &margin_right));
2496 UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2497 if (next_margin_right < margin_left) {
2498 ColPartition_IT next_it(*part_it);
2501 part = next_it.data();
2502 }
while (!next_it.at_last() &&
2503 UpdateRightMargin(*part, &next_margin_left,
2504 &next_margin_right));
2509 part = part_it->data();
2510 }
while (part != start_part &&
2511 UpdateRightMargin(*part, &next_margin_left,
2512 &next_margin_right));
2513 part_it->backward();
// The run ends at the partition just after part_it's current position.
2516 part = part_it->data_relative(1);
2517 end_y = part->bounding_box().top();
2518 if (!part_it->at_last() &&
2519 part_it->data()->bounding_box_.bottom() > end_y)
2520 end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2521 start->
set_y(start_y);
2522 start->
set_x(part->XAtY(margin_left, start_y));
2524 end->
set_x(part->XAtY(margin_left, end_y));
2526 tprintf(
"Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2527 start_y, end_y, end->
x(), part->XAtY(margin_right, end_y),
2528 part->bounding_box_.right(), part->right_margin_);