20 #include "config_auto.h" 27 #include "allheaders.h" 142 static BOOL_VAR(textord_show_tables,
false,
"Show table regions");
143 static BOOL_VAR(textord_tablefind_show_mark,
false,
144 "Debug table marking steps in detail");
145 static BOOL_VAR(textord_tablefind_show_stats,
false,
146 "Show page stats used in table finding");
147 static BOOL_VAR(textord_tablefind_recognize_tables,
false,
148 "Enables the table recognizer for table layout and filtering.");
161 global_median_xheight_(0),
162 global_median_blob_width_(0),
163 global_median_ledding_(0),
164 left_to_right_language_(true) {
182 const ICOORD& top_right) {
221 BLOBNBOX_CLIST* part_boxes = part->
boxes();
222 BLOBNBOX_C_IT pit(part_boxes);
223 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
230 if (leader_part ==
nullptr) {
234 leader_part->
AddBox(pblob);
236 clean_part->
AddBox(pblob);
243 if (leader_part !=
nullptr) {
266 #ifndef GRAPHICS_DISABLED 267 if (textord_show_tables) {
275 table_win =
MakeWindow(100, 300,
"Fragmented Text");
278 #endif // GRAPHICS_DISABLED 285 ColSegment_LIST column_blocks;
299 ColSegment_LIST table_columns;
305 ColSegment_LIST table_regions;
308 #ifndef GRAPHICS_DISABLED 309 if (textord_tablefind_show_mark) {
314 #endif // GRAPHICS_DISABLED 326 if (textord_tablefind_recognize_tables) {
330 #ifndef GRAPHICS_DISABLED 331 if (textord_show_tables) {
337 #endif // GRAPHICS_DISABLED 344 #ifndef GRAPHICS_DISABLED 345 if (textord_show_tables) {
351 #endif // GRAPHICS_DISABLED 359 #ifndef GRAPHICS_DISABLED 360 if (textord_show_tables) {
366 #endif // GRAPHICS_DISABLED 439 if (part->
boxes()->empty()) {
449 bool found_split =
true;
450 while (found_split) {
452 BLOBNBOX_C_IT box_it(right_part->
boxes());
457 int previous_right = INT32_MIN;
460 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461 const TBOX& box = box_it.data()->bounding_box();
462 if (previous_right != INT32_MIN &&
463 box.
left() - previous_right > kThreshold) {
466 int mid_x = (box.
left() + previous_right) / 2;
468 right_part = left_part->
SplitAt(mid_x);
476 previous_right = std::max(previous_right, static_cast<int>(box.
right()));
509 return box.
height() > kHeightRequired &&
510 box.
width() > kWidthRequired &&
511 box.
area() > kAreaRequired;
524 ColSegment_LIST* column_blocks) {
527 if (columns !=
nullptr) {
528 ColSegment_LIST new_blocks;
539 ColSegment_LIST* column_blocks) {
540 ColSegment_IT src_it(new_blocks);
541 ColSegment_IT dest_it(column_blocks);
543 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
546 bool match_found =
false;
548 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
556 delete src_it.extract();
562 dest_it.add_after_then_move(src_it.extract());
571 return (abs(b1.
left() - b2.
left()) < x_margin) &&
595 int y = part->
MidY();
600 int left_space = std::max(0, box.
left() - left_column->
LeftAtY(y));
605 int right_space = std::max(0, right_column->
RightAtY(y) - box.
right());
619 if (right < box.
left()) {
632 if (left > box.
right()) {
684 if (neighbor == part)
690 if (neighbor_box.
top() < part_box.
bottom() &&
691 gap < min_space_below) {
692 min_space_below = gap;
693 below_neighbor = neighbor;
695 else if (part_box.
top() < neighbor_box.
bottom() &&
696 gap < min_space_above) {
697 min_space_above = gap;
698 above_neighbor = neighbor;
731 BLOBNBOX_C_IT it(part->
boxes());
732 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733 xheight_stats.
add(it.data()->bounding_box().height(), 1);
734 width_stats.
add(it.data()->bounding_box().width(), 1);
745 #ifndef GRAPHICS_DISABLED 746 if (textord_tablefind_show_stats) {
747 const char* kWindowName =
"X-height (R), X-width (G), and ledding (B)";
753 #endif // GRAPHICS_DISABLED 791 if (textord_tablefind_show_mark) {
798 if (textord_tablefind_show_mark) {
805 if (textord_tablefind_show_mark) {
812 if (textord_tablefind_show_mark || textord_show_tables) {
861 BLOBNBOX_CLIST* part_boxes = part->
boxes();
862 BLOBNBOX_C_IT it(part_boxes);
872 int previous_x1 = -1;
874 int largest_partition_gap_found = -1;
881 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
885 if (previous_x1 != -1) {
886 int gap = current_x0 - previous_x1;
898 previous_x1 = std::max(previous_x1, current_x1);
912 if (gap > largest_partition_gap_found)
913 largest_partition_gap_found = gap;
915 previous_x1 = current_x1;
929 if (largest_partition_gap_found == -1)
935 return largest_partition_gap_found < min_gap;
955 const int top = box.
top() + search_size;
956 const int bottom = box.
bottom() - search_size;
958 for (
int direction = 0; direction < 2; ++direction) {
959 bool right_to_left = (direction == 0);
960 int x = right_to_left ? box.
right() : box.
left();
963 while ((leader = hsearch.
NextSideSearch(right_to_left)) !=
nullptr) {
1022 int current_spacing = 0;
1023 int upper_spacing = 0;
1029 current_spacing = mid - left;
1030 upper_spacing = upper_mid - left;
1036 current_spacing = right - mid;
1037 upper_spacing = right - upper_mid;
1078 int max_top = INT32_MIN;
1079 int min_bottom = INT32_MAX;
1088 if (top > max_top) {
1092 if (bottom < min_bottom) {
1093 min_bottom = bottom;
1118 if (!upper_part || !lower_part)
1144 ColSegment_IT it(column_blocks);
1145 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1148 int num_table_cells = 0;
1149 int num_text_cells = 0;
1164 if (!num_table_cells && !num_text_cells) {
1165 delete it.extract();
1178 ColSegment_IT it(segments);
1179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1206 bool neighbor_found =
false;
1207 bool modified =
false;
1213 int top_range = std::min(box.
top() + margin,
static_cast<int>(
tright().
y()));
1214 int bottom_range = std::max(box.
bottom() - margin,
static_cast<int>(
bleft().
y()));
1217 neighbor_found =
false;
1223 if (neighbor == seg)
1245 neighbor_found =
true;
1252 }
while (neighbor_found);
1274 ColSegment_IT it(table_columns);
1285 col->InsertBox(box);
1294 bool found_neighbours =
false;
1308 col->InsertBox(neighbor_box);
1310 found_neighbours =
true;
1312 if (found_neighbours) {
1313 it.add_after_then_move(col);
1324 ColSegment_LIST* table_regions) {
1325 ColSegment_IT cit(table_columns);
1326 ColSegment_IT rit(table_regions);
1335 bool* table_region =
new bool[page_height];
1339 for (
int i = 0; i < page_height; i++) {
1340 table_region[i] =
false;
1344 cit.move_to_first();
1345 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346 TBOX col_box = cit.data()->bounding_box();
1350 for (
int i = intersection_box.
bottom(); i < intersection_box.
top(); i++) {
1351 table_region[i -
bleft().
y()] =
true;
1355 TBOX current_table_box;
1360 for (
int i = 1; i < page_height; i++) {
1362 if (!table_region[i - 1] && table_region[i]) {
1367 if (table_region[i - 1] && !table_region[i]) {
1369 if (!current_table_box.
null_box()) {
1371 seg->InsertBox(current_table_box);
1372 rit.add_after_then_move(seg);
1377 delete[] table_region;
1393 bool neighbor_found =
false;
1394 bool modified =
false;
1398 TBOX search_region(box);
1401 neighbor_found =
false;
1407 if (neighbor == seg)
1425 neighbor_found =
true;
1432 }
while (neighbor_found);
1488 ColSegment_CLIST adjusted_tables;
1489 ColSegment_C_IT it(&adjusted_tables);
1495 TBOX grown_box = table_box;
1502 col->InsertBox(grown_box);
1503 it.add_after_then_move(col);
1514 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1529 TBOX search_box = table_box;
1543 const TBOX& search_range,
1547 for (
int i = 0; i < 2; ++i) {
1571 const TBOX& search_range,
1586 if (result_box->
contains(part_box))
1601 const TBOX& table_box) {
1619 int num_extra_partitions = 0;
1620 int extra_space_to_right = 0;
1621 int extra_space_to_left = 0;
1624 for (
int i = 0; i < 2; ++i) {
1641 num_extra_partitions++;
1645 extra_space_to_right++;
1646 extra_space_to_left++;
1651 extra_space_to_right++;
1653 extra_space_to_left++;
1658 return (extra_space_to_right > num_extra_partitions / 2) ||
1659 (extra_space_to_left > num_extra_partitions / 2);
1675 int table_top = table_box->
top();
1678 if (box.
bottom() - table_top > max_distance)
1684 previous_neighbor =
nullptr;
1689 if (previous_neighbor ==
nullptr) {
1690 previous_neighbor = neighbor;
1707 int* table_xprojection =
new int[page_width];
1716 for (
int i = 0; i < page_width; i++) {
1717 table_xprojection[i] = 0;
1734 BLOBNBOX_CLIST* part_boxes = part->
boxes();
1735 BLOBNBOX_C_IT pit(part_boxes);
1742 int next_position_to_write = 0;
1744 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1751 xstart = std::max(xstart, next_position_to_write);
1752 for (
int i = xstart; i < xend; i++)
1753 table_xprojection[i -
bleft().
x()]++;
1754 next_position_to_write = xend;
1763 delete[] table_xprojection;
1771 for (
int i = 0; i < length; i++) {
1772 if (xprojection[i] > peak_value) {
1773 peak_value = xprojection[i];
1785 for (
int i = 0; i < length; i++) {
1786 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1789 int largest_gap = 0;
1791 for (
int i = 1; i < length; i++) {
1793 if (xprojection[i - 1] && !xprojection[i]) {
1797 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1798 int gap = i - run_start;
1799 if (gap > largest_gap)
1819 if (textord_show_tables) {
1820 table_win =
MakeWindow(0, 0,
"Table Structure");
1835 ColSegment_CLIST good_tables;
1836 ColSegment_C_IT good_it(&good_tables);
1851 if (table_structure !=
nullptr) {
1852 if (textord_show_tables) {
1856 delete table_structure;
1857 good_it.add_after_then_move(found_table);
1866 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1872 ColSegment_LIST *segments,
1874 #ifndef GRAPHICS_DISABLED 1877 ColSegment_IT it(segments);
1878 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1881 int left_x = box.
left();
1882 int right_x = box.
right();
1883 int top_y = box.
top();
1884 int bottom_y = box.
bottom();
1885 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1893 #ifndef GRAPHICS_DISABLED 1901 int left_x = box.
left();
1902 int right_x = box.
right();
1903 int top_y = box.
top();
1904 int bottom_y = box.
bottom();
1907 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1920 #ifndef GRAPHICS_DISABLED 1928 color = default_color;
1930 color = table_color;
1933 int left_x = box.
left();
1934 int right_x = box.
right();
1935 int top_y = box.
top();
1936 int bottom_y = box.
bottom();
1939 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1954 #ifndef GRAPHICS_DISABLED 1962 int left_x = box.
left();
1963 int right_x = box.
right();
1964 int top_y = box.
top();
1965 int bottom_y = box.
bottom();
1970 int mid_x = (left_x + right_x) / 2;
1971 int mid_y = (top_y + bottom_y) / 2;
1972 int other_x = (upper_box.
left() + upper_box.
right()) / 2;
1973 int other_y = (upper_box.
top() + upper_box.
bottom()) / 2;
1976 win->
Line(mid_x, mid_y, other_x, other_y);
1981 int mid_x = (left_x + right_x) / 2;
1982 int mid_y = (top_y + bottom_y) / 2;
1983 int other_x = (lower_box.
left() + lower_box.
right()) / 2;
1984 int other_y = (lower_box.
top() + lower_box.
bottom()) / 2;
1987 win->
Line(mid_x, mid_y, other_x, other_y);
2035 if (table_partition) {
2036 table_partition->
Absorb(part, width_cb);
2038 table_partition = part;
2043 if (table_partition) {
2055 grid->
InsertBBox(
true,
true, table_partition);
2064 num_table_cells_(0),
2077 return kBoxColors[type_];
2090 else if (num_text_cells_ > num_table_cells_)
ColPartitionGrid clean_part_grid_
#define ELISTIZE(CLASSNAME)
void set_num_text_cells(int n)
void Rectangle(int x1, int y1, int x2, int y2)
void set_inside_table_column(bool val)
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
ColPartition * nearest_neighbor_below() const
int RightAtY(int y) const
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
ColSegmentGrid table_grid_
bool major_x_overlap(const TBOX &box) const
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
void set_text_grid(ColPartitionGrid *text)
void FilterParagraphEndings()
bool IsHorizontalLine() const
void set_min_height(int height)
void set_global_median_xheight(int xheight)
void set_bounding_box(const TBOX &other)
bool left_to_right_language_
const ICOORD & tright() const
ColPartition * ColumnContaining(int x, int y)
bool contains(const FCOORD pt) const
void add(int32_t value, int32_t count)
const double kStrokeWidthConstantTolerance
int global_median_ledding_
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
ScrollView::Color BoxColor() const
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
void MarkTablePartitions()
const TBOX & bounding_box() const
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
ColSegmentGrid col_seg_grid_
const TBOX & bounding_box() const
bool inside_table_column()
void DisplayBoxes(ScrollView *window)
bool IsInSameColumnAs(const ColPartition &part) const
ColPartitionGrid fragmented_text_grid_
const double kMaxBlobOverlapFactor
double overlap_fraction(const TBOX &box) const
void set_space_to_right(int space)
bool GapInXProjection(int *xprojection, int length)
const int kMaxVerticalSpacing
void DeleteSingleColumnTables()
void Absorb(ColPartition *other, WidthCallback *cb)
void SetUniqueMode(bool mode)
const double kAllowTextWidth
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
void set_global_median_blob_width(int width)
void FindPartitionPartners()
void DisplayColSegmentGrid(ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
void Display(ScrollView *window, ScrollView::Color color)
BBC * NextSideSearch(bool right_to_left)
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_left_to_right_language(bool order)
int median_height() const
void RepositionIterator()
const int kAdjacentLeaderSearchPadding
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void set_global_median_ledding(int ledding)
ScrollView * MakeWindow(int x, int y, const char *window_name)
void GetTableColumns(ColSegment_LIST *table_columns)
int median_bottom() const
void InitializePartitions(ColPartitionSet **all_columns)
const double kParagraphEndingPreviousLineRatio
void set_num_table_cells(int n)
bool HasLeaderAdjacent(const ColPartition &part)
void GridMergeTableRegions()
const int kMaxBoxesInDataPartition
#define BOOL_VAR(name, val, comment)
BlobTextFlowType flow() const
void SetGlobalSpacings(ColPartitionGrid *grid)
const double kMinOverlapWithTable
BBC * NextVerticalSearch(bool top_to_bottom)
void InsertTextPartition(ColPartition *part)
bool AllowBlob(const BLOBNBOX &blob) const
const int kMaxColumnHeaderDistance
const double kSplitPartitionSize
const double kMaxXProjectionGapFactor
BlobRegionType region_type() const
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
void InsertImagePartition(ColPartition *part)
void set_flow(BlobTextFlowType f)
void set_line_grid(ColPartitionGrid *lines)
bool AllowTextPartition(const ColPartition &part) const
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
ColPartition * SingletonPartner(bool upper)
const double kTableColumnThreshold
int global_median_xheight_
int16_t x() const
access function
const TBOX & bounding_box() const
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
const double kAllowBlobArea
TBOX intersection(const TBOX &box) const
void set_nearest_neighbor_above(ColPartition *part)
void Line(int x1, int y1, int x2, int y2)
void AddBox(BLOBNBOX *box)
ColPartition * CopyButDontOwnBlobs()
void set_max_text_height(int height)
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
const double kMinMaxGapInTextPartition
bool major_y_overlap(const TBOX &box) const
const int kSideSpaceMargin
PolyBlockType type() const
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
void DeleteObject(T *object)
void ClearGridData(void(*free_method)(BBC *))
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void GridMergeColumnBlocks()
const int kLargeTableRowCount
const ICOORD & tright() const
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
const int kMinRowsInTable
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
void SetColumnsType(ColSegment_LIST *col_segments)
bool VSignificantCoreOverlap(const ColPartition &other) const
StructuredTable * RecognizeTable(const TBOX &guess_box)
const double kAllowBlobWidth
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
const double kMaxTableCellXheight
BlobTextFlowType flow() const
TBOX bounding_union(const TBOX &box) const
void StartVerticalSearch(int xmin, int xmax, int y)
void FilterHeaderAndFooter()
int global_median_blob_width_
const double kAllowTextArea
void InsertLeaderPartition(ColPartition *part)
void SetVerticalSpacing(ColPartition *part)
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
void set_nearest_neighbor_below(ColPartition *part)
void SmoothTablePartitionRuns()
const double kSmallTableProjectionThreshold
const double kMaxParagraphEndingLeftSpaceMultiple
bool overlap(const TBOX &box) const
void InsertRulingPartition(ColPartition *part)
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
void InsertFragmentedTextPartition(ColPartition *part)
int16_t y() const
access_function
BlobRegionType blob_type() const
void set_space_below(int space)
ColPartition * ShallowCopy() const
const double kMaxGapInTextPartition
ColPartitionGrid leader_and_ruling_grid_
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
void RefinePartitionPartners(bool get_desperate)
void set_space_above(int space)
int space_to_left() const
const double kLargeTableProjectionThreshold
bool MatchingSizes(const ColPartition &other) const
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
void IncludeLeftOutColumnHeaders(TBOX *table_box)
ColPartition * nearest_neighbor_above() const
void AdjustTableBoundaries()
const ICOORD & bleft() const
const TBOX & bounding_box() const
void set_space_to_left(int space)
int space_to_right() const
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
bool HasWideOrNoInterWordGap(ColPartition *part) const
const int kMinBoxesInTextPartition
ScrollView * MakeWindow(int x, int y, const char *window_name)
void MarkPartitionsUsingLocalInformation()
ColPartition * SplitAt(int split_x)
const double kMinParagraphEndingTextToWhitespaceRatio
void set_blob_type(BlobRegionType t)
const double kAllowBlobHeight
void StartSideSearch(int x, int ymin, int ymax)
const double kAllowTextHeight
const double kStrokeWidthFractionalTolerance
void InsertBox(const TBOX &other)
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
void StartRectSearch(const TBOX &rect)
const ICOORD & bleft() const
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)