From fc6874c455ea2930bc6e0ec3a2aea7df924634f7 Mon Sep 17 00:00:00 2001 From: Marek Kasik Date: Fri, 12 Feb 2010 14:31:01 +0100 Subject: [PATCH] Distinguish between columns and tables when selecting text This commit add ability to detect tables in text by checking borders of 4 neighbouring text blocks for arrangement (to the left, to the right, center, ...). Detected border of whole table is then stored in ExMin, ExMax, EyMin and EyMax of each block together with id of detected table. Sorting of blocks is then performed on the these borders to be able to distinguish tables from columns. Pasting of selected text was modified so that tables are pasted correctly (even with multi line cells). --- poppler/TextOutputDev.cc | 490 ++++++++++++++++++++++++++++++++++++++++------ poppler/TextOutputDev.h | 5 + 2 files changed, 437 insertions(+), 58 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 94a86b6..a307830 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -38,6 +38,7 @@ #include #include #include +#include #include #ifdef _WIN32 #include // for O_BINARY @@ -1154,6 +1155,8 @@ TextBlock::TextBlock(TextPage *pageA, int rotA) { curLine = NULL; next = NULL; stackNext = NULL; + tableId = -1; + tableEnd = gFalse; } TextBlock::~TextBlock() { @@ -1622,31 +1625,31 @@ GBool TextBlock::isBeforeByRule1(TextBlock *blk1) { switch (this->page->primaryRot) { case 0: case 2: - overlap = ((this->xMin <= blk1->xMin) && - (blk1->xMin <= this->xMax)) || - ((blk1->xMin <= this->xMin) && - (this->xMin <= blk1->xMax)); + overlap = ((this->ExMin <= blk1->ExMin) && + (blk1->ExMin <= this->ExMax)) || + ((blk1->ExMin <= this->ExMin) && + (this->ExMin <= blk1->ExMax)); break; case 1: case 3: - overlap = ((this->yMin <= blk1->yMin) && - (blk1->yMin <= this->yMax)) || - ((blk1->yMin <= this->yMin) && - (this->yMin <= blk1->yMax)); + overlap = ((this->EyMin <= blk1->EyMin) && + (blk1->EyMin <= this->EyMax)) || + ((blk1->EyMin <= this->EyMin) && + (this->EyMin <= blk1->EyMax)); break; } switch (this->page->primaryRot) { case 0: - before = overlap && this->yMin < blk1->yMin; + before = overlap && this->EyMin < blk1->EyMin; break; case 1: - before = overlap && this->xMax > blk1->xMax; + before = overlap && this->ExMax > blk1->ExMax; break; case 2: - before = overlap && this->yMax > blk1->yMax; + before = overlap && this->EyMax > blk1->EyMax; break; case 3: - before = overlap && this->xMin < blk1->xMin; + before = overlap && this->ExMin < blk1->ExMin; break; } return before; @@ -1662,16 +1665,16 @@ GBool TextBlock::isBeforeByRule2(TextBlock *blk1) { switch (rotLR) { case 0: - cmp = xMax - blk1->xMin; + cmp = ExMax - blk1->ExMin; break; case 1: - cmp = yMin - blk1->yMax; + cmp = EyMin - blk1->EyMax; break; case 2: - cmp = blk1->xMax - xMin; + cmp = blk1->ExMax - ExMin; break; case 3: - cmp = blk1->yMin - yMax; + cmp = blk1->EyMin - EyMax; break; } return cmp <= 0; @@ -1697,7 +1700,7 @@ int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, #if 0 // for debugging printf("visited: %d %.2f..%.2f %.2f..%.2f\n", - sortPos, blk1->xMin, blk1->xMax, blk1->yMin, blk1->yMax); + sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); #endif visited[pos1] = gTrue; pos2 = -1; @@ -1708,36 +1711,55 @@ int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, continue; } before = gFalse; - if (blk2->isBeforeByRule1(blk1)) { - // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1. - before = gTrue; -#if 0 // for debugging - printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n", - blk2->xMin, blk2->xMax, blk2->yMin, blk2->yMax, - blk1->xMin, blk1->xMax, blk1->yMin, blk1->yMax); -#endif - } else if (blk2->isBeforeByRule2(blk1)) { - // Rule (2) blk2 left of blk1, and no intervening blk3 - // such that blk1 is before blk3 by rule 1, - // and blk3 is before blk2 by rule 1. - before = gTrue; - for (blk3 = blkList; blk3; blk3 = blk3->next) { - if (blk3 == blk2 || blk3 == blk1) { - continue; - } - if (blk1->isBeforeByRule1(blk3) && - blk3->isBeforeByRule1(blk2)) { - before = gFalse; - break; - } + + // is blk2 before blk1? (for table entries) + if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) { + if (page->primaryLR) { + if (blk2->xMax <= blk1->xMin && + blk2->yMin <= blk1->yMax && + blk2->yMax >= blk1->yMin) + before = gTrue; + } else { + if (blk2->xMin >= blk1->xMax && + blk2->yMin <= blk1->yMax && + blk2->yMax >= blk1->yMin) + before = gTrue; } + + if (blk2->yMax <= blk1->yMin) + before = gTrue; + } else { + if (blk2->isBeforeByRule1(blk1)) { + // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1. + before = gTrue; +#if 0 // for debugging + printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n", + blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax, + blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); +#endif + } else if (blk2->isBeforeByRule2(blk1)) { + // Rule (2) blk2 left of blk1, and no intervening blk3 + // such that blk1 is before blk3 by rule 1, + // and blk3 is before blk2 by rule 1. + before = gTrue; + for (blk3 = blkList; blk3; blk3 = blk3->next) { + if (blk3 == blk2 || blk3 == blk1) { + continue; + } + if (blk1->isBeforeByRule1(blk3) && + blk3->isBeforeByRule1(blk2)) { + before = gFalse; + break; + } + } #if 0 // for debugging - if (before) { - printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n", - blk1->xMin, blk1->xMax, blk1->yMin, blk1->yMax, - blk2->xMin, blk2->xMax, blk2->yMin, blk2->yMax); - } + if (before) { + printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n", + blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax, + blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax); + } #endif + } } if (before) { // blk2 is before blk1, so it needs to be visited @@ -1747,7 +1769,7 @@ int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, } #if 0 // for debugging printf("sorted: %d %.2f..%.2f %.2f..%.2f\n", - sortPos, blk1->xMin, blk1->xMax, blk1->yMin, blk1->yMax); + sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); #endif sorted[sortPos++] = blk1; return sortPos; @@ -2321,7 +2343,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { TextPool *pool; TextWord *word0, *word1, *word2; TextLine *line; - TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1; + TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2; TextFlow *flow, *lastFlow; TextUnderline *underline; TextLink *link; @@ -2997,6 +3019,263 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { for (i = 0; i < nBlocks; i++) { visited[i] = gFalse; } + + double bxMin0, byMin0, bxMin1, byMin1; + int numTables = 0; + int tableId = -1; + int correspondenceX, correspondenceY; + double xCentre1, yCentre1, xCentre2, yCentre2; + double xCentre3, yCentre3, xCentre4, yCentre4; + double deltaX, deltaY; + TextBlock *fblk2 = NULL, *fblk3 = NULL, *fblk4 = NULL; + + for (blk1 = blkList; blk1; blk1 = blk1->next) { + blk1->ExMin = blk1->xMin; + blk1->ExMax = blk1->xMax; + blk1->EyMin = blk1->yMin; + blk1->EyMax = blk1->yMax; + + bxMin0 = std::numeric_limits::max(); + byMin0 = std::numeric_limits::max(); + bxMin1 = std::numeric_limits::max(); + byMin1 = std::numeric_limits::max(); + + fblk2 = NULL; + fblk3 = NULL; + fblk4 = NULL; + + /* find fblk2, fblk3 and fblk4 so that + * fblk2 is on the right of blk1 and overlap with blk1 in y axis + * fblk3 is under blk1 and overlap with blk1 in x axis + * fblk4 is under blk1 and on the right of blk1 + * and they are closest to blk1 + */ + for (blk2 = blkList; blk2; blk2 = blk2->next) { + if (blk2 != blk1) { + if (blk2->yMin <= blk1->yMax && + blk2->yMax >= blk1->yMin && + blk2->xMin > blk1->xMax && + blk2->xMin < bxMin0) { + bxMin0 = blk2->xMin; + fblk2 = blk2; + } else if (blk2->xMin <= blk1->xMax && + blk2->xMax >= blk1->xMin && + blk2->yMin > blk1->yMax && + blk2->yMin < byMin0) { + byMin0 = blk2->yMin; + fblk3 = blk2; + } else if (blk2->xMin > blk1->xMax && + blk2->xMin < bxMin1 && + blk2->yMin > blk1->yMax && + blk2->yMin < byMin1) { + bxMin1 = blk2->xMin; + byMin1 = blk2->yMin; + fblk4 = blk2; + } + } + } + + /* fblk4 can not overlap with fblk3 in x and with fblk2 in y + * fblk4 has to overlap with fblk3 in y and with fblk2 in x + */ + if (fblk2 != NULL && + fblk3 != NULL && + fblk4 != NULL) { + if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || + (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin)) || + !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && + fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) { + fblk2 = NULL; + fblk3 = NULL; + fblk4 = NULL; + } + } + + // if we found any then look whether they form a table + if (fblk2 != NULL && + fblk3 != NULL && + fblk4 != NULL) { + tableId = -1; + correspondenceX = 0; + correspondenceY = 0; + deltaX = 0.0; + deltaY = 0.0; + + if (blk1->lines && blk1->lines->words) + deltaX = blk1->lines->words->getFontSize(); + if (fblk2->lines && fblk2->lines->words) + deltaX = deltaX < fblk2->lines->words->getFontSize() ? + deltaX : fblk2->lines->words->getFontSize(); + if (fblk3->lines && fblk3->lines->words) + deltaX = deltaX < fblk3->lines->words->getFontSize() ? + deltaX : fblk3->lines->words->getFontSize(); + if (fblk4->lines && fblk4->lines->words) + deltaX = deltaX < fblk4->lines->words->getFontSize() ? + deltaX : fblk4->lines->words->getFontSize(); + + deltaY = deltaX; + + deltaX *= minColSpacing1; + deltaY *= maxIntraLineDelta; + + xCentre1 = (blk1->xMax + blk1->xMin) / 2.0; + yCentre1 = (blk1->yMax + blk1->yMin) / 2.0; + xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0; + yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0; + xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0; + yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0; + xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0; + yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0; + + // are blocks centrally aligned in x ? + if (abs (xCentre1 - xCentre3) <= deltaX && + abs (xCentre2 - xCentre4) <= deltaX) + correspondenceX++; + + // are blocks centrally aligned in y ? + if (abs (yCentre1 - yCentre2) <= deltaY && + abs (yCentre3 - yCentre4) <= deltaY) + correspondenceY++; + + // are blocks aligned to the left ? + if (abs (blk1->xMin - fblk3->xMin) <= deltaX && + abs (fblk2->xMin - fblk4->xMin) <= deltaX) + correspondenceX++; + + // are blocks aligned to the right ? + if (abs (blk1->xMax - fblk3->xMax) <= deltaX && + abs (fblk2->xMax - fblk4->xMax) <= deltaX) + correspondenceX++; + + // are blocks aligned to the top ? + if (abs (blk1->yMin - fblk2->yMin) <= deltaY && + abs (fblk3->yMin - fblk4->yMin) <= deltaY) + correspondenceY++; + + // are blocks aligned to the bottom ? + if (abs (blk1->yMax - fblk2->yMax) <= deltaY && + abs (fblk3->yMax - fblk4->yMax) <= deltaY) + correspondenceY++; + + // are blocks aligned in x and y ? + if (correspondenceX > 0 && + correspondenceY > 0) { + + // find maximal tableId + tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId; + tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId; + tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId; + tableId = tableId < blk1->tableId ? blk1->tableId : tableId; + + // if the tableId is -1, then we found new table + if (tableId < 0) { + tableId = numTables; + numTables++; + } + + blk1->tableId = tableId; + fblk2->tableId = tableId; + fblk3->tableId = tableId; + fblk4->tableId = tableId; + } + } + } + + /* set extended bounding boxes of all table entries + * so that they contain whole table + * (we need to process whole table size when comparing it + * with regular text blocks) + */ + PDFRectangle *envelopes = new PDFRectangle [numTables]; + TextBlock **ending_blocks = new TextBlock* [numTables]; + + for (i = 0; i < numTables; i++) { + envelopes[i].x1 = std::numeric_limits::max(); + envelopes[i].x2 = std::numeric_limits::min(); + envelopes[i].y1 = std::numeric_limits::max(); + envelopes[i].y2 = std::numeric_limits::min(); + } + + for (blk1 = blkList; blk1; blk1 = blk1->next) { + if (blk1->tableId >= 0) { + if (blk1->ExMin < envelopes[blk1->tableId].x1) { + envelopes[blk1->tableId].x1 = blk1->ExMin; + if (!blk1->page->primaryLR) + ending_blocks[blk1->tableId] = blk1; + } + + if (blk1->ExMax > envelopes[blk1->tableId].x2) { + envelopes[blk1->tableId].x2 = blk1->ExMax; + if (blk1->page->primaryLR) + ending_blocks[blk1->tableId] = blk1; + } + + envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? + blk1->EyMin : envelopes[blk1->tableId].y1; + envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? + blk1->EyMax : envelopes[blk1->tableId].y2; + } + } + + for (blk1 = blkList; blk1; blk1 = blk1->next) { + if (blk1->tableId >= 0 && + blk1->xMin <= ending_blocks[blk1->tableId]->xMax && + blk1->xMax >= ending_blocks[blk1->tableId]->xMin) { + blk1->tableEnd = gTrue; + } + } + + for (blk1 = blkList; blk1; blk1 = blk1->next) { + if (blk1->tableId >= 0) { + blk1->ExMin = envelopes[blk1->tableId].x1; + blk1->ExMax = envelopes[blk1->tableId].x2; + blk1->EyMin = envelopes[blk1->tableId].y1; + blk1->EyMax = envelopes[blk1->tableId].y2; + } + } + delete[] envelopes; + delete[] ending_blocks; + + + /* set extended bounding boxes of all other blocks + * so that they extend in x without hitting neighbours + */ + for (blk1 = blkList; blk1; blk1 = blk1->next) { + if (!blk1->tableId >= 0) { + double xMax = std::numeric_limits::max(); + double xMin = std::numeric_limits::min(); + + for (blk2 = blkList; blk2; blk2 = blk2->next) { + if (blk2 == blk1) + continue; + + if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) { + if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) + xMax = blk2->xMin; + + if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) + xMin = blk2->xMax; + } + } + + for (blk2 = blkList; blk2; blk2 = blk2->next) { + if (blk2 == blk1) + continue; + + if (blk2->xMax > blk1->ExMax && + blk2->xMax <= xMax && + blk2->yMin >= blk1->yMax) { + blk1->ExMax = blk2->xMax; + } + + if (blk2->xMin < blk1->ExMin && + blk2->xMin >= xMin && + blk2->yMin >= blk1->yMax) + blk1->ExMin = blk2->xMin; + } + } + } + i = -1; for (blk1 = blkList; blk1; blk1 = blk1->next) { i++; @@ -3072,7 +3351,7 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { flow->priMin, flow->priMax); for (blk = flow->blocks; blk; blk = blk->next) { printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n", - blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, + blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax, blk->priMin, blk->priMax); for (line = blk->lines; line; line = line->next) { printf(" line:\n"); @@ -3615,11 +3894,16 @@ GooString *TextSelectionDumper::getText (void) { GooString *s; TextLineFrag *frag; - int i; + int i, j; GBool multiLine; UnicodeMap *uMap; char space[8], eol[16]; int spaceLen, eolLen; + GooList *strings = NULL; + int actual_table = -1; + int actual_line = -1; + int last_length = 0; + TextBlock *actual_block = NULL; s = new GooString(); @@ -3636,8 +3920,84 @@ GooString *TextSelectionDumper::getText (void) for (i = 0; i < nFrags; ++i) { frag = &frags[i]; - page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); - s->append(eol, eolLen); + if (actual_table >= 0 && frag->line->blk->tableId < 0) { + for (j = 0; j < strings->getLength (); j++) { + s->append ((GooString*) strings->get (j)); + s->append (eol, eolLen); + delete ((GooString*) strings->get (j)); + } + delete strings; + strings = NULL; + actual_table = -1; + actual_line = -1; + actual_block = NULL; + } + + // a table + if (frag->line->blk->tableId >= 0) { + if (actual_table == -1) { + strings = new GooList(); + actual_table = frag->line->blk->tableId; + actual_block = frag->line->blk; + actual_line = -1; + } + + // the same block + if (actual_block == frag->line->blk) { + actual_line++; + if (actual_line >= strings->getLength ()) { + GooString *t = new GooString (); + // add some spaces to have this block correctly aligned + if (actual_line > 0) + for (j = 0; j < ((GooString*) (strings->get (actual_line - 1)))->getLength() - last_length - 1; j++) + t->append (space, spaceLen); + strings->append (t); + } + } + // another block + else { + // previous block ended its row + if (actual_block->tableEnd) { + for (j = 0; j < strings->getLength (); j++) { + s->append ((GooString*) strings->get (j)); + s->append (eol, eolLen); + delete ((GooString*) strings->get (j)); + } + delete strings; + + strings = new GooList(); + GooString *t = new GooString (); + strings->append (t); + } + actual_block = frag->line->blk; + actual_line = 0; + } + + page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line))); + last_length = frag->len; + + if (!frag->line->blk->tableEnd) { + ((GooString*) strings->get (actual_line))->append (space, spaceLen); + } + } + // not a table + else { + page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s); + s->append (eol, eolLen); + } + } + + if (strings != NULL) { + for (j = 0; j < strings->getLength (); j++) { + s->append((GooString*) strings->get (j)); + s->append(eol, eolLen); + delete ((GooString*) strings->get (j)); + } + delete strings; + strings = NULL; + actual_table = -1; + actual_line = -1; + actual_block = NULL; } } @@ -3867,14 +4227,28 @@ void TextLine::visitSelection(TextSelectionVisitor *visitor, end = NULL; current = NULL; for (p = words; p != NULL; p = p->next) { - if ((selection->x1 < p->xMax && selection->y1 < p->yMax) || - (selection->x2 < p->xMax && selection->y2 < p->yMax)) - if (begin == NULL) - begin = p; - if (((selection->x1 > p->xMin && selection->y1 > p->yMin) || - (selection->x2 > p->xMin && selection->y2 > p->yMin)) && (begin != NULL)) { - end = p->next; - current = p; + if (blk->page->primaryLR) { + if ((selection->x1 < p->xMax && selection->y1 < p->yMax) || + (selection->x2 < p->xMax && selection->y2 < p->yMax)) + if (begin == NULL) + begin = p; + + if (((selection->x1 > p->xMin && selection->y1 > p->yMin) || + (selection->x2 > p->xMin && selection->y2 > p->yMin)) && (begin != NULL)) { + end = p->next; + current = p; + } + } else { + if ((selection->x1 > p->xMin && selection->y1 < p->yMax) || + (selection->x2 > p->xMin && selection->y2 < p->yMax)) + if (begin == NULL) + begin = p; + + if (((selection->x1 < p->xMax && selection->y1 > p->yMin) || + (selection->x2 < p->xMax && selection->y2 > p->yMin)) && (begin != NULL)) { + end = p->next; + current = p; + } } } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 12453c3..62e2d4c 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -374,6 +374,10 @@ private: double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double priMin, priMax; // whitespace bounding box along primary axis + double ExMin, ExMax; // extended bounding box x coordinates + double EyMin, EyMax; // extended bounding box y coordinates + int tableId; // id of table to which this block belongs + GBool tableEnd; // is this block at end of line of actual table TextPool *pool; // pool of words (used only until lines // are built) @@ -393,6 +397,7 @@ private: friend class TextWordList; friend class TextPage; friend class TextSelectionPainter; + friend class TextSelectionDumper; }; //------------------------------------------------------------------------ -- 1.6.6