From e1dd515d8373e233a8b81812d453e6daf7d164ae Mon Sep 17 00:00:00 2001 From: Brian Ewins Date: Thu, 12 Nov 2009 02:50:29 +0000 Subject: [PATCH] Use a reading-order sort to order blocks This switches the block sort from XY to reading order, using the rules from T. Breuel's "High Performance Document Layout Analysis". Block rotation and RTL are not yet handled. Signed-off-by: Brian Ewins --- poppler/TextOutputDev.cc | 92 ++++++++++++++++++++++++++++++++++++++++++++-- poppler/TextOutputDev.h | 5 ++ 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index aeb4f37..0bc022f 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -2178,6 +2178,77 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) { links->append(new TextLink(xMin, yMin, xMax, yMax, link)); } +// Sort into reading order by performing a topological sort using the rules +// given in "High Performance Document Layout Analysis", T.M. Breuel, 2003. +// See http://pubs.iupr.org/#2003-breuel-sdiut +// Topological sort is done by depth first search, see +// http://en.wikipedia.org/wiki/Topological_sorting +int TextPage::depth_first_visit(TextBlock *blkList, int pos1, + TextBlock *blk1, + TextBlock **sorted, int sortPos, + GBool* visited) { + int pos2, pos3; + TextBlock *blk2, *blk3; + GBool before; + if (visited[pos1]) { + return sortPos; + } + visited[pos1] = gTrue; + pos2 = -1; + for (blk2 = blkList; blk2; blk2 = blk2->next) { + pos2++; + if (visited[pos2]) { + // skip visited nodes + continue; + } + before = gFalse; + // The xy comparisons here only work for LTR. + // I've omitted RTL for clarity. I think these + // are wrong anyway, I should be performing + // appropriate comparisons for blk->rot. + if (blk2->rot < blk1->rot) { + before = gTrue; + } else if (blk2->rot != blk1->rot) { + continue; + } else if (((blk1->xMin <= blk2->xMin && + blk2->xMin <= blk1->xMax) || + (blk2->xMin <= blk1->xMin && + blk1->xMin <= blk2->xMax)) && + blk2->yMin < blk1->yMin) { + // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1. + before = gTrue; + } else if (blk2->xMax <= blk1->xMin) { + // Rule (2) blk2 left of blk1, and no intervening blk3 + // such that blk3 overlaps blk1 and blk2. + // (blk2 right of blk1 for RTL text) + before = gTrue; + pos3 = -1; + for (blk3 = blkList; blk3; blk3 = blk3->next) { + pos3++; + if (pos3 == pos1 || pos3 == pos2) { + // not an intervening block + continue; + } + if (blk3->xMin <= blk2->xMax && + blk1->xMin <= blk3->xMax && + blk1->yMin < blk3->yMin && + blk3->yMin < blk2->yMin) { + // blk3 between blk1 and blk2, overlapping both + before = gFalse; + break; + } + } + } + if (before) { + // blk2 is before blk1, so it needs to be visited + // before we can add blk1 to the sorted list. + sortPos = this->depth_first_visit(blkList, pos2, blk2, sorted, sortPos, visited); + } + } + sorted[sortPos++] = blk1; + return sortPos; +} + void TextPage::coalesce(GBool physLayout, GBool doHTML) { UnicodeMap *uMap; TextPool *pool; @@ -2839,9 +2910,6 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { //----- reading order sort - // sort blocks into xy order (in preparation for reading order sort) - qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); - // compute space on left and right sides of each block for (i = 0; i < nBlocks; ++i) { blk0 = blocks[i]; @@ -2853,13 +2921,28 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { } } + int sortPos = 0; + GBool *visited = (GBool *)gmallocn(nBlocks, sizeof(GBool *)); + for (i = 0; i < nBlocks; i++) { + visited[i] = gFalse; + } + i = -1; + for (blk1 = blkList; blk1; blk1 = blk1->next) { + i++; + sortPos = this->depth_first_visit(blkList, i, blk1, blocks, sortPos, visited); + } + if (visited) { + gfree(visited); + } + #if 0 // for debugging - printf("*** blocks, after yx sort ***\n"); + printf("*** blocks, after ro sort ***\n"); for (i = 0; i < nBlocks; ++i) { blk = blocks[i]; printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n", blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->priMin, blk->priMax); + for (line = blk->lines; line; line = line->next) { printf(" line:\n"); for (word0 = line->words; word0; word0 = word0->next) { @@ -2872,6 +2955,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { printf("'\n"); } } + } printf("\n"); #endif diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index d3ae2f9..e52a319 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -568,6 +568,11 @@ private: void assignColumns(TextLineFrag *frags, int nFrags, int rot); int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s); + int depth_first_visit(TextBlock *blkList, int pos1, + TextBlock *blk1, + TextBlock **sorted, int sortPos, + GBool* visited); + GBool rawOrder; // keep text in content stream order double pageWidth, pageHeight; // width and height of current page -- 1.5.6.6