From 3fa49a48e8dee94ed82edb9c693bb6f4dc8ef36a Mon Sep 17 00:00:00 2001 From: Jason Crain Date: Sat, 3 Mar 2012 10:45:33 -0600 Subject: [PATCH 3/5] displayed chars can map to any number of text chars Modified TextWord, TextLine, TextLineFrag, TextBlock, and TextPage classes to support displayed characters that can be mapped to any number of text characters. --- poppler/TextOutputDev.cc | 176 ++++++++++++++++++++++++++++------------------ poppler/TextOutputDev.h | 15 +++-- 2 files changed, 118 insertions(+), 73 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 332d7ee..b7fd582 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -328,6 +328,8 @@ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, } } text = NULL; + textLen = textSize = 0; + textOffset = NULL; charcode = NULL; edge = NULL; charPos = NULL; @@ -354,6 +356,7 @@ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, TextWord::~TextWord() { gfree(text); + gfree(textOffset); gfree(charcode); gfree(edge); gfree(charPos); @@ -361,17 +364,29 @@ TextWord::~TextWord() { void TextWord::addChar(GfxState *state, double x, double y, double dx, double dy, int charPosA, int charLen, - CharCode c, Unicode u) { + CharCode c, Unicode *u, int uLen) { int wMode; if (len == size) { size += 16; - text = (Unicode *)greallocn(text, size, sizeof(Unicode)); + textOffset = (int *)greallocn(textOffset, size + 1, sizeof(int)); + if (len == 0) + textOffset[0] = 0; charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); } - text[len] = u; + if (textLen + uLen > textSize) { + textSize = (textSize + 16 > textLen + uLen) ? 
textSize + 16 : textLen + uLen; + text = (Unicode *)greallocn(text, textSize, sizeof(Unicode)); + } + + for (int i = 0; i < uLen; ++i) { + text[textLen] = u[i]; + textLen++; + } + + textOffset[len + 1] = textLen; charcode[len] = c; charPos[len] = charPosA; charPos[len + 1] = charPosA + charLen; @@ -461,17 +476,27 @@ void TextWord::merge(TextWord *word) { } if (len + word->len > size) { size = len + word->len; - text = (Unicode *)greallocn(text, size, sizeof(Unicode)); + textOffset = (int *)greallocn(textOffset, size + 1, sizeof(int)); + if (len == 0) + textOffset[0] = 0; charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); } + if (textLen + word->textLen > textSize) { + textSize = textLen + word->textLen; + text = (Unicode *)greallocn(text, textSize, sizeof(Unicode)); + } for (i = 0; i < word->len; ++i) { - text[len + i] = word->text[i]; + textOffset[len + i] = textLen + word->textOffset[i]; charcode[len + i] = word->charcode[i]; edge[len + i] = word->edge[i]; charPos[len + i] = word->charPos[i]; } + for (i = 0; i < word->textLen; i++) { + text[textLen++] = word->text[i]; + } + textOffset[len + word->len] = textLen; edge[len + word->len] = word->edge[word->len]; charPos[len + word->len] = word->charPos[word->len]; len += word->len; @@ -533,6 +558,11 @@ int TextWord::cmpYX(const void *p1, const void *p2) { #if TEXTOUT_WORD_LIST +const Unicode *TextWord::getChars(int idx, int *length) { + *length = textOffset[idx + 1] - textOffset[idx]; + return &text[textOffset[idx]]; +} + GooString *TextWord::getText() { GooString *s; UnicodeMap *uMap; @@ -543,7 +573,7 @@ GooString *TextWord::getText() { if (!(uMap = globalParams->getTextEncoding())) { return s; } - for (i = 0; i < len; ++i) { + for (i = 0; i < textLen; ++i) { n = uMap->mapUnicode(text[i], buf, sizeof(buf)); s->append(buf, n); } @@ -713,6 +743,7 @@ TextLine::~TextLine() { 
delete word; } gfree(text); + gfree(textOffset); gfree(edge); gfree(col); if (normalized) { @@ -824,7 +855,8 @@ void TextLine::coalesce(UnicodeMap *uMap) { double space, delta, minSpace; GBool isUnicode; char buf[8]; - int i, j; + int i, j, k; + int textPos; if (words->next) { @@ -881,32 +913,43 @@ void TextLine::coalesce(UnicodeMap *uMap) { // build the line text isUnicode = uMap ? uMap->isUnicode() : gFalse; len = 0; + textLen = 0; for (word1 = words; word1; word1 = word1->next) { len += word1->len; + textLen += word1->textLen; if (word1->spaceAfter) { ++len; + ++textLen; } } - text = (Unicode *)gmallocn(len, sizeof(Unicode)); + text = (Unicode *)gmallocn(textLen, sizeof(Unicode)); edge = (double *)gmallocn(len + 1, sizeof(double)); + textOffset = (int *)gmallocn(len + 1, sizeof(int)); i = 0; + textPos = 0; for (word1 = words; word1; word1 = word1->next) { for (j = 0; j < word1->len; ++j) { - text[i] = word1->text[j]; + textOffset[i] = textPos; edge[i] = word1->edge[j]; ++i; + for (k = word1->textOffset[j]; k < word1->textOffset[j+1]; ++k) { + text[textPos++] = word1->text[k]; + } } + edge[i] = word1->edge[word1->len]; if (word1->spaceAfter) { - text[i] = (Unicode)0x0020; + textOffset[i] = textPos; + text[textPos++] = (Unicode)0x0020; ++i; } } + textOffset[i] = textPos; // compute convertedLen and set up the col array - col = (int *)gmallocn(len + 1, sizeof(int)); + col = (int *)gmallocn(textLen + 1, sizeof(int)); convertedLen = 0; - for (i = 0; i < len; ++i) { + for (i = 0; i < textLen; ++i) { col[i] = convertedLen; if (isUnicode) { ++convertedLen; @@ -914,11 +957,12 @@ void TextLine::coalesce(UnicodeMap *uMap) { convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); } } - col[len] = convertedLen; + col[textLen] = convertedLen; // check for hyphen at end of line //~ need to check for other chars used as hyphens - hyphenated = text[len - 1] == (Unicode)'-'; + if (textLen > 0) + hyphenated = text[textLen - 1] == (Unicode)'-'; } 
//------------------------------------------------------------------------ @@ -948,8 +992,8 @@ public: void TextLineFrag::init(TextLine *lineA, int startA, int lenA) { line = lineA; - start = startA; - len = lenA; + start = line->textOffset[startA]; + len = line->textOffset[startA + lenA] - start; col = line->col[start]; } @@ -1307,9 +1351,9 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { word2 = pool->getPool(idx1); } for (; word2; word1 = word2, word2 = word2->next) { - if (word2->len == word0->len && + if (word2->textLen == word0->textLen && !memcmp(word2->text, word0->text, - word0->len * sizeof(Unicode))) { + word0->textLen * sizeof(Unicode))) { switch (rot) { case 0: case 2: @@ -1491,11 +1535,11 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5); break; } - for (k = 0; k <= line0->len; ++k) { + for (k = 0; k <= line0->textLen; ++k) { line0->col[k] += col1; } - if (line0->col[line0->len] > nColumns) { - nColumns = line0->col[line0->len]; + if (line0->col[line0->textLen] > nColumns) { + nColumns = line0->col[line0->textLen]; } } } else { @@ -1505,7 +1549,7 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { for (j = 0; j < i; ++j) { line1 = lineArray[j]; if (line1->primaryDelta(line0) >= 0) { - col2 = line1->col[line1->len] + 1; + col2 = line1->col[line1->textLen] + 1; } else { k = 0; // make gcc happy switch (rot) { @@ -1540,11 +1584,11 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { col1 = col2; } } - for (k = 0; k <= line0->len; ++k) { + for (k = 0; k <= line0->textLen; ++k) { line0->col[k] += col1; } - if (line0->col[line0->len] > nColumns) { - nColumns = line0->col[line0->len]; + if (line0->col[line0->textLen] > nColumns) { + nColumns = line0->col[line0->textLen]; } } } @@ -2281,7 +2325,6 @@ void TextPage::addChar(GfxState *state, double x, double y, CharCode c, int nBytes, Unicode *u, int uLen) { double x1, y1, w1, h1, dx2, dy2, base, 
sp, delta; GBool overlap; - int i; // subtract char and word spacing from the dx,dy values sp = state->getCharSpace(); @@ -2366,34 +2409,27 @@ void TextPage::addChar(GfxState *state, double x, double y, lastCharOverlap = gFalse; } - if (uLen != 0) { - // start a new word if needed - if (!curWord) { - beginWord(state, x, y); - } - - // page rotation and/or transform matrices can cause text to be - // drawn in reverse order -- in this case, swap the begin/end - // coordinates and break text into individual chars - if ((curWord->rot == 0 && w1 < 0) || - (curWord->rot == 1 && h1 < 0) || - (curWord->rot == 2 && w1 > 0) || - (curWord->rot == 3 && h1 > 0)) { - endWord(); - beginWord(state, x + dx, y + dy); - x1 += w1; - y1 += h1; - w1 = -w1; - h1 = -h1; - } + // start a new word if needed + if (!curWord) { + beginWord(state, x, y); + } - // add the characters to the current word - w1 /= uLen; - h1 /= uLen; - for (i = 0; i < uLen; ++i) { - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]); - } + // page rotation and/or transform matrices can cause text to be + // drawn in reverse order -- in this case, swap the begin/end + // coordinates and break text into individual chars + if ((curWord->rot == 0 && w1 < 0) || + (curWord->rot == 1 && h1 < 0) || + (curWord->rot == 2 && w1 > 0) || + (curWord->rot == 3 && h1 > 0)) { + endWord(); + beginWord(state, x + dx, y + dy); + x1 += w1; + y1 += h1; + w1 = -w1; + h1 = -h1; } + + curWord->addChar(state, x1, y1, w1, h1, charPos, nBytes, c, u, uLen); charPos += nBytes; } @@ -2483,7 +2519,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, rot*90, word0->link); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -2973,7 +3009,7 @@ void 
TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -2988,7 +3024,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { for (blk = blkList; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for (word0 = line->words; word0; word0 = word0->next) { - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { if (unicodeTypeL(word0->text[i])) { ++lrCount; } else if (unicodeTypeR(word0->text[i])) { @@ -3033,7 +3069,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { } blk->col = col1; for (line = blk->lines; line; line = line->next) { - for (j = 0; j <= line->len; ++j) { + for (j = 0; j <= line->textLen; ++j) { line->col[j] += col1; } } @@ -3107,7 +3143,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { } blk0->col = col1; for (line = blk0->lines; line; line = line->next) { - for (j = 0; j <= line->len; ++j) { + for (j = 0; j <= line->textLen; ++j) { line->col[j] += col1; } } @@ -3127,7 +3163,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -3441,7 +3477,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, 
word0->base, word0->fontSize, word0->spaceAfter); - for (j = 0; j < word0->len; ++j) { + for (j = 0; j < word0->textLen; ++j) { fputc(word0->text[j] & 0xff, stdout); } printf("'\n"); @@ -3502,7 +3538,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -3614,7 +3650,7 @@ GBool TextPage::findText(Unicode *s, int len, } if (!line->normalized) - line->normalized = unicodeNormalizeNFKC(line->text, line->len, + line->normalized = unicodeNormalizeNFKC(line->text, line->textLen, &line->normalized_len, &line->normalized_idx); // convert the line to uppercase @@ -3749,7 +3785,7 @@ GooString *TextPage::getText(double xMin, double yMin, int spaceLen, eolLen; int lastRot; double x, y, delta; - int col, idx0, idx1, i, j; + int col, idx0, idx1, i, j, k; GBool multiLine, oneRot; s = new GooString(); @@ -3770,8 +3806,12 @@ GooString *TextPage::getText(double xMin, double yMin, word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax); if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) { - mbc_len = uMap->mapUnicode( *(word->getChar(j)), mbc, sizeof(mbc) ); - s->append(mbc, mbc_len); + int uLen; + const Unicode *u = word->getChars(j, &uLen); + for (k = 0; k < uLen; k++) { + mbc_len = uMap->mapUnicode(u[k], mbc, sizeof(mbc)); + s->append(mbc, mbc_len); + } } } } @@ -4852,7 +4892,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, for (word = rawWords; word; word = word->next) { s = new GooString(); - dumpFragment(word->text, word->len, uMap, s); + dumpFragment(word->text, word->textLen, uMap, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; if (word->next && @@ -4960,7 +5000,7 @@ 
void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, for (flow = flows; flow; flow = flow->next) { for (blk = flow->blocks; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { - n = line->len; + n = line->textLen; if (line->hyphenated && (line->next || blk->next)) { --n; } @@ -4969,7 +5009,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; // output a newline when a hyphen is not suppressed - if (n == line->len) { + if (n == line->textLen) { (*outputFunc)(outputStream, eol, eolLen); } } @@ -5233,7 +5273,7 @@ void ActualText::end(GfxState *state) { text->addChar(state, actualTextX0, actualTextY0, actualTextX1 - actualTextX0, actualTextY1 - actualTextY0, 0, actualTextNBytes, uni, length); - gfree(uni); // len = 0 ?? XXXXXXXX + gfree(uni); } delete actualText; diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index e31876b..eec7b79 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -122,7 +122,7 @@ public: // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, int charPosA, int charLen, - CharCode c, Unicode u); + CharCode c, Unicode *u, int uLen); // Merge onto the end of . 
void merge(TextWord *word); @@ -149,7 +149,7 @@ public: #if TEXTOUT_WORD_LIST int getLength() { return len; } - const Unicode *getChar(int idx) { return &text[idx]; } + const Unicode *getChars(int idx, int *length); GooString *getText(); GooString *getFontName() { return font->fontName; } void getColor(double *r, double *g, double *b) @@ -178,14 +178,17 @@ private: double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate Unicode *text; // the text + int textLen; // length of text array + int textSize; // size of text array + int *textOffset; // offsets to match each glyph to text CharCode *charcode; // glyph indices double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *charPos; // character position (within content stream) // of each char (plus one extra entry for // the last char) - int len; // length of text/edge/charPos arrays - int size; // size of text/edge/charPos arrays + int len; // length of textOffset/edge/charPos arrays + int size; // size of textOffset/edge/charPos arrays TextFontInfo *font; // font information double fontSize; // font size GBool spaceAfter; // set if there is a space between this @@ -299,10 +302,12 @@ private: TextWord *lastWord; // last word in this line Unicode *text; // Unicode text of the line, including // spaces between words + int *textOffset; // offsets to match each glyph to text + int textLen; // number of text Unicode chars double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *col; // starting column number of each Unicode char - int len; // number of Unicode chars + int len; // number of displayed chars int convertedLen; // total number of converted characters GBool hyphenated; // set if last char is a hyphen TextLine *next; // next line in block -- 1.7.5.4