From 3bb9d66028df56b4535a75a56f93949cf4b1763e Mon Sep 17 00:00:00 2001 From: Marek Kasik Date: Tue, 16 Nov 2010 14:53:09 +0100 Subject: [PATCH 2/2] Improve efficiency of searching for tables The patch searches for left/right/up/down neighbour of each block. These blocks are then used to determine whether the actual block belong to a table or not (#3188). --- poppler/TextOutputDev.cc | 236 ++++++++++++++++++++++++++++++++++++---------- poppler/TextOutputDev.h | 4 + 2 files changed, 189 insertions(+), 51 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 1c3863b..3c035a7 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -1150,6 +1150,8 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) { rot = rotA; xMin = yMin = 0; xMax = yMax = -1; + ExMin = EyMin = 0; + ExMax = EyMax = -1; priMin = 0; priMax = page->pageWidth; pool = new TextPool(); @@ -1157,6 +1159,10 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) { curLine = NULL; next = NULL; stackNext = NULL; + left = NULL; + right = NULL; + up = NULL; + down = NULL; tableId = -1; tableEnd = gFalse; } @@ -2319,6 +2325,22 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) { links->append(new TextLink(xMin, yMin, xMax, yMax, link)); } +typedef struct { + TextBlock *blk; + TextBlock *blk_end; + double value; +} Range; + +int cmpRange(const void *p1, const void *p2) { + Range *r1 = *(Range **)p1; + Range *r2 = *(Range **)p2; + double cmp = 0.0; + + cmp = r1->value - r2->value; + + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + void TextPage::coalesce(GBool physLayout, GBool doHTML) { UnicodeMap *uMap; TextPool *pool; @@ -3001,6 +3023,126 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { visited[i] = gFalse; } + /* + * Find adjacent blocks + */ + Range **ranges = new Range* [2 * nBlocks]; + Range *range1 = NULL, *range2 = NULL; + int k = 0, l = 0; + + for (blk1 = blkList; blk1; blk1 = blk1->next) { + range1 = new Range[1]; + range2 = new Range[1]; + + range1->blk = blk1; + range2->blk = NULL; + + range1->blk_end = NULL; + range2->blk_end = blk1; + + range1->value = blk1->yMin; + range2->value = blk1->yMax; + + ranges[k] = range1; + ranges[k + 1] = range2; + + k += 2; + } + + /* + * Sort all y coordinates + */ + qsort (ranges, 2 * nBlocks, sizeof (Range *), cmpRange); + + /* + * Find the closest neighbour (left/right) of each block + */ + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + + if (range1->blk) { + blk1 = range1->blk; + l = k + 1; + + while (ranges[l]->value < range1->blk->yMax) { + range2 = ranges[l]; + + if (range2->blk) { + blk2 = range2->blk; + + if (blk2->xMin > blk1->xMax) { + if (blk1->right == NULL || blk2->xMin < blk1->right->xMin) + blk1->right = blk2; + + if (blk2->left == NULL || blk1->xMax > blk2->left->xMax) + blk2->left = blk1; + } + else if (blk2->xMax < blk1->xMin) { + if (blk1->left == NULL || blk2->xMax > blk1->left->xMax) + blk1->left = blk2; + + if (blk2->right == NULL || blk1->xMin < blk2->right->xMin) + blk2->right = blk1; + } + } + + l++; + } + } + } + + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + + if (range1->blk) + range1->value = range1->blk->xMin; + + if (range1->blk_end) + range1->value = range1->blk_end->xMax; + } + + /* + * Sort all x coordinates + */ + qsort (ranges, 2 * nBlocks, sizeof (Range *), cmpRange); + + /* + * Find the closest neighbour (up/down) of each block + */ + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + + if (range1->blk) { + blk1 = range1->blk; + l = k + 1; + + while (ranges[l]->value < range1->blk->xMax) { + range2 = ranges[l]; + + if (range2->blk) { + blk2 = range2->blk; + + if (blk2->yMin > blk1->yMax) { + if (blk1->down == NULL || blk2->yMin < blk1->down->yMin) + blk1->down = blk2; + + if (blk2->up == NULL || blk1->yMax > blk2->up->yMax) + blk2->up = blk1; + } + else if (blk2->yMax < blk1->yMin) { + if (blk1->up == NULL || blk2->yMax > blk1->up->yMax) + blk1->up = blk2; + + if (blk2->down == NULL || blk1->yMin < blk2->down->yMin) + blk2->down = blk1; + } + } + + l++; + } + } + } + double bxMin0, byMin0, bxMin1, byMin1; int numTables = 0; int tableId = -1; @@ -3028,33 +3170,11 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { /* find fblk2, fblk3 and fblk4 so that * fblk2 is on the right of blk1 and overlap with blk1 in y axis * fblk3 is under blk1 and overlap with blk1 in x axis - * fblk4 is under blk1 and on the right of blk1 - * and they are closest to blk1 + * fblk4 is under fblk2 and on the right of fblk3 */ - for (blk2 = blkList; blk2; blk2 = blk2->next) { - if (blk2 != blk1) { - if (blk2->yMin <= blk1->yMax && - blk2->yMax >= blk1->yMin && - blk2->xMin > blk1->xMax && - blk2->xMin < bxMin0) { - bxMin0 = blk2->xMin; - fblk2 = blk2; - } else if (blk2->xMin <= blk1->xMax && - blk2->xMax >= blk1->xMin && - blk2->yMin > blk1->yMax && - blk2->yMin < byMin0) { - byMin0 = blk2->yMin; - fblk3 = blk2; - } else if (blk2->xMin > blk1->xMax && - blk2->xMin < bxMin1 && - blk2->yMin > blk1->yMax && - blk2->yMin < byMin1) { - bxMin1 = blk2->xMin; - byMin1 = blk2->yMin; - fblk4 = blk2; - } - } - } + fblk2 = blk1->right; + fblk3 = blk1->down; + fblk4 = (fblk2 && fblk3 && fblk2->down == fblk3->right) ? fblk2->down : NULL; /* fblk4 can not overlap with fblk3 in x and with fblk2 in y * fblk2 can not overlap with fblk3 in x and y @@ -3224,42 +3344,56 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { /* set extended bounding boxes of all other blocks * so that they extend in x without hitting neighbours */ - for (blk1 = blkList; blk1; blk1 = blk1->next) { - if (!blk1->tableId >= 0) { - double xMax = DBL_MAX; - double xMin = DBL_MIN; + for (k = 0; k < (2 * nBlocks); k++) { + range1 = ranges[k]; + double xMaxValue, xMinValue; - for (blk2 = blkList; blk2; blk2 = blk2->next) { - if (blk2 == blk1) - continue; + if (range1->blk) { + blk1 = range1->blk; + l = k + 1; - if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) { - if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) - xMax = blk2->xMin; + if (blk1->right) + xMaxValue = blk1->right->xMin; + else + xMaxValue = DBL_MAX; - if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) - xMin = blk2->xMax; - } + if (blk1->left) + xMinValue = blk1->left->xMax; + else + xMinValue = DBL_MIN; + + while (l < (2 * nBlocks) && ranges[l]->value <= xMaxValue) { + range2 = ranges[l]; + + if (range2->blk_end && + range2->value > blk1->ExMax && + range2->blk_end->yMin >= blk1->yMax) + blk1->ExMax = range2->value; + + l++; } - for (blk2 = blkList; blk2; blk2 = blk2->next) { - if (blk2 == blk1) - continue; + l = k - 1; - if (blk2->xMax > blk1->ExMax && - blk2->xMax <= xMax && - blk2->yMin >= blk1->yMax) { - blk1->ExMax = blk2->xMax; - } + while (l >= 0 && ranges[l]->value >= xMinValue) { + range2 = ranges[l]; - if (blk2->xMin < blk1->ExMin && - blk2->xMin >= xMin && - blk2->yMin >= blk1->yMax) - blk1->ExMin = blk2->xMin; + if (range2->blk && + range2->value < blk1->ExMin && + range2->blk->yMin >= blk1->yMax) + blk1->ExMin = range2->value; + + l--; } } } + for (k = 0; k < (2 * nBlocks); k++) { + delete ranges[k]; + } + delete ranges; + + // sort blocks into yx order (in preparation for reading order sort) TextBlock **yxordered; yxordered = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *)); diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 553c526..0b2c806 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -390,6 +390,10 @@ private: TextBlock *next; TextBlock *stackNext; + TextBlock *left; + TextBlock *right; + TextBlock *up; + TextBlock *down; friend class TextLine; friend class TextLineFrag; -- 1.7.3.2