From 3bb9d66028df56b4535a75a56f93949cf4b1763e Mon Sep 17 00:00:00 2001
From: Marek Kasik <mkasik@redhat.com>
Date: Tue, 16 Nov 2010 14:53:09 +0100
Subject: [PATCH 2/2] Improve efficiency of searching for tables

The patch searches for left/right/up/down neighbour of
each block. These blocks are then used to determine whether
the actual block belong to a table or not (#3188).
---
 poppler/TextOutputDev.cc |  236 ++++++++++++++++++++++++++++++++++++----------
 poppler/TextOutputDev.h  |    4 +
 2 files changed, 189 insertions(+), 51 deletions(-)

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 1c3863b..3c035a7 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -1150,6 +1150,8 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) {
   rot = rotA;
   xMin = yMin = 0;
   xMax = yMax = -1;
+  ExMin = EyMin = 0;
+  ExMax = EyMax = -1;
   priMin = 0;
   priMax = page->pageWidth;
   pool = new TextPool();
@@ -1157,6 +1159,10 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) {
   curLine = NULL;
   next = NULL;
   stackNext = NULL;
+  left = NULL;
+  right = NULL;
+  up = NULL;
+  down = NULL;
   tableId = -1;
   tableEnd = gFalse;
 }
@@ -2319,6 +2325,22 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) {
   links->append(new TextLink(xMin, yMin, xMax, yMax, link));
 }
 
+typedef struct {
+  TextBlock *blk;
+  TextBlock *blk_end;
+  double value;
+} Range;
+
+int cmpRange(const void *p1, const void *p2) {
+  Range *r1 = *(Range **)p1;
+  Range *r2 = *(Range **)p2;
+  double cmp = 0.0;
+
+  cmp = r1->value - r2->value;
+
+  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
 void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   UnicodeMap *uMap;
   TextPool *pool;
@@ -3001,6 +3023,126 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
     visited[i] = gFalse;
   }
 
+  /*
+   * Find adjacent blocks
+   */
+  Range **ranges = new Range* [2 * nBlocks];
+  Range *range1 = NULL, *range2 = NULL;
+  int k = 0, l = 0;
+
+  for (blk1 = blkList; blk1; blk1 = blk1->next) {
+    range1 = new Range[1];
+    range2 = new Range[1];
+
+    range1->blk = blk1;
+    range2->blk = NULL;
+
+    range1->blk_end = NULL;
+    range2->blk_end = blk1;
+
+    range1->value = blk1->yMin;
+    range2->value = blk1->yMax;
+
+    ranges[k] = range1;
+    ranges[k + 1] = range2;
+
+    k += 2;
+  }
+
+  /*
+   * Sort all y coordinates
+   */
+  qsort (ranges, 2 * nBlocks, sizeof (Range *), cmpRange);
+
+  /*
+   * Find the closest neighbour (left/right) of each block
+   */
+  for (k = 0; k < (2 * nBlocks); k++) {
+    range1 = ranges[k];
+
+    if (range1->blk) {
+      blk1 = range1->blk;
+      l = k + 1;
+
+      while (ranges[l]->value < range1->blk->yMax) {
+        range2 = ranges[l];
+
+        if (range2->blk) {
+          blk2 = range2->blk;
+
+          if (blk2->xMin > blk1->xMax) {
+            if (blk1->right == NULL || blk2->xMin < blk1->right->xMin)
+              blk1->right = blk2;
+
+            if (blk2->left == NULL || blk1->xMax > blk2->left->xMax)
+              blk2->left = blk1;
+          }
+          else if (blk2->xMax < blk1->xMin) {
+            if (blk1->left == NULL || blk2->xMax > blk1->left->xMax)
+              blk1->left = blk2;
+
+            if (blk2->right == NULL || blk1->xMin < blk2->right->xMin)
+              blk2->right = blk1;
+          }
+        }
+
+        l++;
+      }
+    }
+  }
+
+  for (k = 0; k < (2 * nBlocks); k++) {
+    range1 = ranges[k];
+
+    if (range1->blk)
+      range1->value = range1->blk->xMin;
+
+    if (range1->blk_end)
+      range1->value = range1->blk_end->xMax;
+  }
+
+  /*
+   * Sort all x coordinates
+   */
+  qsort (ranges, 2 * nBlocks, sizeof (Range *), cmpRange);
+
+  /*
+   * Find the closest neighbour (up/down) of each block
+   */
+  for (k = 0; k < (2 * nBlocks); k++) {
+    range1 = ranges[k];
+
+    if (range1->blk) {
+      blk1 = range1->blk;
+      l = k + 1;
+
+      while (ranges[l]->value < range1->blk->xMax) {
+        range2 = ranges[l];
+
+        if (range2->blk) {
+          blk2 = range2->blk;
+
+          if (blk2->yMin > blk1->yMax) {
+            if (blk1->down == NULL || blk2->yMin < blk1->down->yMin)
+              blk1->down = blk2;
+
+            if (blk2->up == NULL || blk1->yMax > blk2->up->yMax)
+              blk2->up = blk1;
+          }
+          else if (blk2->yMax < blk1->yMin) {
+            if (blk1->up == NULL || blk2->yMax > blk1->up->yMax)
+              blk1->up = blk2;
+
+            if (blk2->down == NULL || blk1->yMin < blk2->down->yMin)
+              blk2->down = blk1;
+          }
+        }
+
+        l++;
+      }
+    }
+  }
+
   double bxMin0, byMin0, bxMin1, byMin1;
   int numTables = 0;
   int tableId = -1;
@@ -3028,33 +3170,11 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
     /*  find fblk2, fblk3 and fblk4 so that
      *  fblk2 is on the right of blk1 and overlap with blk1 in y axis
      *  fblk3 is under blk1 and overlap with blk1 in x axis
-     *  fblk4 is under blk1 and on the right of blk1
-     *  and they are closest to blk1
+     *  fblk4 is under fblk2 and on the right of fblk3
      */
-    for (blk2 = blkList; blk2; blk2 = blk2->next) {
-      if (blk2 != blk1) {
-        if (blk2->yMin <= blk1->yMax &&
-            blk2->yMax >= blk1->yMin &&
-            blk2->xMin > blk1->xMax &&
-            blk2->xMin < bxMin0) {
-          bxMin0 = blk2->xMin;
-          fblk2 = blk2;
-        } else if (blk2->xMin <= blk1->xMax &&
-                   blk2->xMax >= blk1->xMin &&
-                   blk2->yMin > blk1->yMax &&
-                   blk2->yMin < byMin0) {
-          byMin0 = blk2->yMin;
-          fblk3 = blk2;
-        } else if (blk2->xMin > blk1->xMax &&
-                   blk2->xMin < bxMin1 &&
-                   blk2->yMin > blk1->yMax &&
-                   blk2->yMin < byMin1) {
-          bxMin1 = blk2->xMin;
-          byMin1 = blk2->yMin;
-          fblk4 = blk2;
-        }
-      }
-    }
+    fblk2 = blk1->right;
+    fblk3 = blk1->down;
+    fblk4 = (fblk2 && fblk3 && fblk2->down == fblk3->right) ? fblk2->down : NULL;
 
     /*  fblk4 can not overlap with fblk3 in x and with fblk2 in y
      *  fblk2 can not overlap with fblk3 in x and y
@@ -3224,42 +3344,56 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) {
   /*  set extended bounding boxes of all other blocks
    *  so that they extend in x without hitting neighbours
    */
-  for (blk1 = blkList; blk1; blk1 = blk1->next) {
-    if (!blk1->tableId >= 0) {
-      double xMax = DBL_MAX;
-      double xMin = DBL_MIN;
+  for (k = 0; k < (2 * nBlocks); k++) {
+    range1 = ranges[k];
+    double xMaxValue, xMinValue;
 
-      for (blk2 = blkList; blk2; blk2 = blk2->next) {
-        if (blk2 == blk1)
-           continue;
+    if (range1->blk) {
+      blk1 = range1->blk;
+      l = k + 1;
 
-        if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
-          if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
-            xMax = blk2->xMin;
+      if (blk1->right)
+        xMaxValue = blk1->right->xMin;
+      else
+        xMaxValue = DBL_MAX;
 
-          if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
-            xMin = blk2->xMax;
-        }
+      if (blk1->left)
+        xMinValue = blk1->left->xMax;
+      else
+        xMinValue = DBL_MIN;
+
+      while (l < (2 * nBlocks) && ranges[l]->value <= xMaxValue) {
+        range2 = ranges[l];
+
+        if (range2->blk_end &&
+            range2->value > blk1->ExMax &&
+            range2->blk_end->yMin >= blk1->yMax)
+          blk1->ExMax = range2->value;
+
+        l++;
       }
 
-      for (blk2 = blkList; blk2; blk2 = blk2->next) {
-        if (blk2 == blk1)
-           continue;
+      l = k - 1;
 
-        if (blk2->xMax > blk1->ExMax &&
-            blk2->xMax <= xMax &&
-            blk2->yMin >= blk1->yMax) {
-          blk1->ExMax = blk2->xMax;
-        }
+      while (l >= 0 && ranges[l]->value >= xMinValue) {
+        range2 = ranges[l];
 
-        if (blk2->xMin < blk1->ExMin &&
-            blk2->xMin >= xMin &&
-            blk2->yMin >= blk1->yMax)
-          blk1->ExMin = blk2->xMin;
+        if (range2->blk &&
+            range2->value < blk1->ExMin &&
+            range2->blk->yMin >= blk1->yMax)
+          blk1->ExMin = range2->value;
+
+        l--;
       }
     }
   }
 
+  for (k = 0; k < (2 * nBlocks); k++) {
+    delete ranges[k];
+  }
+  delete ranges;
+
+
   // sort blocks into yx order (in preparation for reading order sort)
   TextBlock **yxordered;
   yxordered = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 553c526..0b2c806 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -390,6 +390,10 @@ private:
 
   TextBlock *next;
   TextBlock *stackNext;
+  TextBlock *left;
+  TextBlock *right;
+  TextBlock *up;
+  TextBlock *down;
 
   friend class TextLine;
   friend class TextLineFrag;
-- 
1.7.3.2