From 5aede949034ebabdeace4ea069d99342f420b082 Mon Sep 17 00:00:00 2001 From: Brian Ewins Date: Thu, 12 Nov 2009 02:50:29 +0000 Subject: [PATCH] Use a reading-order sort to order blocks This switches the block sort from XY to reading order, using the rules from T. Breuel's "High Performance Document Layout Analysis". Block rotation and RTL are not yet handled. Something of a work-in-progress; this is the first build I've had thats close to doing the right thing. Signed-off-by: Brian Ewins --- poppler/TextOutputDev.cc | 99 ++++++++++++++++++++++++++++++++++++++++++++- poppler/TextOutputDev.h | 5 ++ 2 files changed, 101 insertions(+), 3 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index aeb4f37..4717e09 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -2178,6 +2178,88 @@ void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) { links->append(new TextLink(xMin, yMin, xMax, yMax, link)); } +// Sort into reading order by performing a topological sort using the rules +// given in "High Performance Document Layout Analysis", T.M. Breuel, 2003. +// See http://pubs.iupr.org/#2003-breuel-sdiut +// Topological sort is done by depth first search, see +// http://en.wikipedia.org/wiki/Topological_sorting +int TextPage::depth_first_visit(TextBlock *blkList, int pos1, + TextBlock *blk1, + TextBlock **sorted, int sortPos, + GBool* visited) { + int pos2, pos3; + TextBlock *blk2, *blk3; + GBool before; + if (visited[pos1]) { + return sortPos; + } + visited[pos1] = true; + pos2 = -1; + for (blk2 = blkList; blk2; blk2 = blk2->next) { + pos2++; + if (visited[pos2]) { + // skip visited nodes + continue; + } + before = gFalse; + // The xy comparisons here only work for LTR. + // I've omitted RTL for clarity. I think these + // are wrong anyway, I should be performing + // appropriate comparisons for blk->rot. + if (blk2->rot < blk1->rot) { + before = gTrue; + } else if (blk2->rot != blk1->rot) { + continue; + } else if ((blk1->xMin <= blk2->xMin && + blk2->xMin <= blk1->xMax) || + (blk2->xMin <= blk1->xMin && + blk1->xMin <= blk2->xMax) && + blk1->yMin < blk2->yMin) { + // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1. + before = gTrue; + /* + } else if (blk2->yMin <= blk1->yMin) { + if (blk2->xMax <= blk1->xMin) { + // Rule (2) blk2 left of blk1, and no intervening blk3 + // such that blk3 overlaps blk1 and blk2. + // (blk2 right of blk1 for RTL text) + // In this case, blk2 is above blk3 and blk3 is above + // blk1; hence blk2 is before blk1 even if we find + // a candidate blk3. Hence, bypass the expensive check. + before = gTrue; + }*/ + } else if (blk1->xMax <= blk2->xMin) { + // Rule (2) blk2 left of blk1, and no intervening blk3 + // such that blk3 overlaps blk1 and blk2. + // (blk2 right of blk1 for RTL text) + before = gTrue; + pos3 = -1; + for (blk3 = blkList; blk3; blk3 = blk3->next) { + pos3++; + if (pos3 == pos1 || pos3 == pos2) { + // not an intervening block + continue; + } + if (blk3->xMin <= blk1->xMax && + blk2->xMin <= blk3->xMax && + blk2->yMin < blk3->yMin && + blk3->yMin < blk1->yMin) { + // blk3 between blk1 and blk2, overlapping both + before = gFalse; + break; + } + } + } + if (before) { + // blk2 is before blk1, so it needs to be visited + // before we can add blk1 to the sorted list. + sortPos = this->depth_first_visit(blkList, pos2, blk2, sorted, sortPos, visited); + } + } + sorted[sortPos--] = blk1; + return sortPos; +} + void TextPage::coalesce(GBool physLayout, GBool doHTML) { UnicodeMap *uMap; TextPool *pool; @@ -2839,9 +2921,6 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { //----- reading order sort - // sort blocks into xy order (in preparation for reading order sort) - qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); - // compute space on left and right sides of each block for (i = 0; i < nBlocks; ++i) { blk0 = blocks[i]; @@ -2853,6 +2932,20 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { } } + int sortPos = nBlocks - 1; + GBool *visited = (GBool *)gmallocn(nBlocks, sizeof(GBool *)); + for (i = 0; i < nBlocks; i++) { + visited[i] = gFalse; + } + i = -1; + for (blk1 = blkList; blk1; blk1 = blk1->next) { + i++; + sortPos = this->depth_first_visit(blkList, i, blk1, blocks, sortPos, visited); + } + if (visited) { + gfree(visited); + } + #if 0 // for debugging printf("*** blocks, after yx sort ***\n"); for (i = 0; i < nBlocks; ++i) { diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index d3ae2f9..e52a319 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -568,6 +568,11 @@ private: void assignColumns(TextLineFrag *frags, int nFrags, int rot); int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s); + int depth_first_visit(TextBlock *blkList, int pos1, + TextBlock *blk1, + TextBlock **sorted, int sortPos, + GBool* visited); + GBool rawOrder; // keep text in content stream order double pageWidth, pageHeight; // width and height of current page -- 1.5.6.6