From 45c8f20074da5fed5bf9aabec4440dbcf2119058 Mon Sep 17 00:00:00 2001 From: Jason Crain Date: Sat, 3 Mar 2012 10:45:33 -0600 Subject: [PATCH] displayed chars can map to any number of text chars Modified TextWord, TextLine, TextLineFrag, TextBlock, and TextPage classes to support displayed characters that can be mapped to any number of text characters. --- poppler/TextOutputDev.cc | 208 +++++++++++++++++++++++++++------------------- poppler/TextOutputDev.h | 15 ++- 2 files changed, 134 insertions(+), 89 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 531617d..2b62d2a 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -328,6 +328,8 @@ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, } } text = NULL; + textLen = textSize = 0; + textOffset = NULL; charcode = NULL; edge = NULL; charPos = NULL; @@ -354,6 +356,7 @@ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, TextWord::~TextWord() { gfree(text); + gfree(textOffset); gfree(charcode); gfree(edge); gfree(charPos); @@ -361,17 +364,46 @@ TextWord::~TextWord() { void TextWord::addChar(GfxState *state, double x, double y, double dx, double dy, int charPosA, int charLen, - CharCode c, Unicode u) { + CharCode c, Unicode *u, int uLen) { int wMode; if (len == size) { size += 16; - text = (Unicode *)greallocn(text, size, sizeof(Unicode)); + textOffset = (int *)greallocn(textOffset, size + 1, sizeof(int)); + if (len == 0) + textOffset[0] = 0; charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); } - text[len] = u; + if (textLen + uLen > textSize) { + textSize = (textSize + 16 > textLen + uLen) ? 
textSize + 16 : textLen + uLen; + text = (Unicode *)greallocn(text, textSize, sizeof(Unicode)); + } + + // decode surrogates and add Unicode chars to text + for (int i = 0; i < uLen; ++i) { + if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */ + if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) { + /* next code is a low surrogate */ + text[textLen] = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000; + ++i; + } else { + /* missing low surrogate + replace it with REPLACEMENT CHARACTER (U+FFFD) */ + text[textLen] = 0xfffd; + } + } else if (u[i] >= 0xdc00 && u[i] < 0xe000) { + /* invalid low surrogate + replace it with REPLACEMENT CHARACTER (U+FFFD) */ + text[textLen] = 0xfffd; + } else { + text[textLen] = u[i]; + } + textLen++; + } + + textOffset[len + 1] = textLen; charcode[len] = c; charPos[len] = charPosA; charPos[len + 1] = charPosA + charLen; @@ -461,17 +493,27 @@ void TextWord::merge(TextWord *word) { } if (len + word->len > size) { size = len + word->len; - text = (Unicode *)greallocn(text, size, sizeof(Unicode)); + textOffset = (int *)greallocn(textOffset, size + 1, sizeof(int)); + if (len == 0) + textOffset[0] = 0; charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); } + if (textLen + word->textLen > textSize) { + textSize = textLen + word->textLen; + text = (Unicode *)greallocn(text, textSize, sizeof(Unicode)); + } for (i = 0; i < word->len; ++i) { - text[len + i] = word->text[i]; + textOffset[len + i] = textLen + word->textOffset[i]; charcode[len + i] = word->charcode[i]; edge[len + i] = word->edge[i]; charPos[len + i] = word->charPos[i]; } + for (i = 0; i < word->textLen; i++) { + text[textLen++] = word->text[i]; + } + textOffset[len + word->len] = textLen; edge[len + word->len] = word->edge[word->len]; charPos[len + word->len] = word->charPos[word->len]; len += word->len; @@ -533,6 +575,11 @@ 
int TextWord::cmpYX(const void *p1, const void *p2) { #if TEXTOUT_WORD_LIST +const Unicode *TextWord::getChars(int idx, int *length) { + *length = textOffset[idx + 1] - textOffset[idx]; + return &text[textOffset[idx]]; +} + GooString *TextWord::getText() { GooString *s; UnicodeMap *uMap; @@ -543,7 +590,7 @@ GooString *TextWord::getText() { if (!(uMap = globalParams->getTextEncoding())) { return s; } - for (i = 0; i < len; ++i) { + for (i = 0; i < textLen; ++i) { n = uMap->mapUnicode(text[i], buf, sizeof(buf)); s->append(buf, n); } @@ -713,6 +760,7 @@ TextLine::~TextLine() { delete word; } gfree(text); + gfree(textOffset); gfree(edge); gfree(col); if (normalized) { @@ -824,7 +872,8 @@ void TextLine::coalesce(UnicodeMap *uMap) { double space, delta, minSpace; GBool isUnicode; char buf[8]; - int i, j; + int i, j, k; + int textPos; if (words->next) { @@ -881,32 +930,43 @@ void TextLine::coalesce(UnicodeMap *uMap) { // build the line text isUnicode = uMap ? uMap->isUnicode() : gFalse; len = 0; + textLen = 0; for (word1 = words; word1; word1 = word1->next) { len += word1->len; + textLen += word1->textLen; if (word1->spaceAfter) { ++len; + ++textLen; } } - text = (Unicode *)gmallocn(len, sizeof(Unicode)); + text = (Unicode *)gmallocn(textLen, sizeof(Unicode)); edge = (double *)gmallocn(len + 1, sizeof(double)); + textOffset = (int *)gmallocn(len + 1, sizeof(int)); i = 0; + textPos = 0; for (word1 = words; word1; word1 = word1->next) { for (j = 0; j < word1->len; ++j) { - text[i] = word1->text[j]; + textOffset[i] = textPos; edge[i] = word1->edge[j]; ++i; + for (k = word1->textOffset[j]; k < word1->textOffset[j+1]; ++k) { + text[textPos++] = word1->text[k]; + } } + edge[i] = word1->edge[word1->len]; if (word1->spaceAfter) { - text[i] = (Unicode)0x0020; + textOffset[i] = textPos; + text[textPos++] = (Unicode)0x0020; ++i; } } + textOffset[i] = textPos; // compute convertedLen and set up the col array - col = (int *)gmallocn(len + 1, sizeof(int)); + col = (int 
*)gmallocn(textLen + 1, sizeof(int)); convertedLen = 0; - for (i = 0; i < len; ++i) { + for (i = 0; i < textLen; ++i) { col[i] = convertedLen; if (isUnicode) { ++convertedLen; @@ -914,11 +974,12 @@ void TextLine::coalesce(UnicodeMap *uMap) { convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); } } - col[len] = convertedLen; + col[textLen] = convertedLen; // check for hyphen at end of line //~ need to check for other chars used as hyphens - hyphenated = text[len - 1] == (Unicode)'-'; + if (textLen > 0) + hyphenated = text[textLen - 1] == (Unicode)'-'; } //------------------------------------------------------------------------ @@ -948,8 +1009,8 @@ public: void TextLineFrag::init(TextLine *lineA, int startA, int lenA) { line = lineA; - start = startA; - len = lenA; + start = line->textOffset[startA]; + len = line->textOffset[startA + lenA] - start; col = line->col[start]; } @@ -1307,9 +1368,9 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { word2 = pool->getPool(idx1); } for (; word2; word1 = word2, word2 = word2->next) { - if (word2->len == word0->len && + if (word2->textLen == word0->textLen && !memcmp(word2->text, word0->text, - word0->len * sizeof(Unicode))) { + word0->textLen * sizeof(Unicode))) { switch (rot) { case 0: case 2: @@ -1491,11 +1552,11 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5); break; } - for (k = 0; k <= line0->len; ++k) { + for (k = 0; k <= line0->textLen; ++k) { line0->col[k] += col1; } - if (line0->col[line0->len] > nColumns) { - nColumns = line0->col[line0->len]; + if (line0->col[line0->textLen] > nColumns) { + nColumns = line0->col[line0->textLen]; } } } else { @@ -1505,7 +1566,7 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { for (j = 0; j < i; ++j) { line1 = lineArray[j]; if (line1->primaryDelta(line0) >= 0) { - col2 = line1->col[line1->len] + 1; + col2 = line1->col[line1->textLen] + 1; } else { k = 0; // make gcc happy 
switch (rot) { @@ -1540,11 +1601,11 @@ void TextBlock::coalesce(UnicodeMap *uMap, double fixedPitch) { col1 = col2; } } - for (k = 0; k <= line0->len; ++k) { + for (k = 0; k <= line0->textLen; ++k) { line0->col[k] += col1; } - if (line0->col[line0->len] > nColumns) { - nColumns = line0->col[line0->len]; + if (line0->col[line0->textLen] > nColumns) { + nColumns = line0->col[line0->textLen]; } } } @@ -2281,7 +2342,6 @@ void TextPage::addChar(GfxState *state, double x, double y, CharCode c, int nBytes, Unicode *u, int uLen) { double x1, y1, w1, h1, dx2, dy2, base, sp, delta; GBool overlap; - int i; // subtract char and word spacing from the dx,dy values sp = state->getCharSpace(); @@ -2366,51 +2426,27 @@ void TextPage::addChar(GfxState *state, double x, double y, lastCharOverlap = gFalse; } - if (uLen != 0) { - // start a new word if needed - if (!curWord) { - beginWord(state, x, y); - } + // start a new word if needed + if (!curWord) { + beginWord(state, x, y); + } - // page rotation and/or transform matrices can cause text to be - // drawn in reverse order -- in this case, swap the begin/end - // coordinates and break text into individual chars - if ((curWord->rot == 0 && w1 < 0) || - (curWord->rot == 1 && h1 < 0) || - (curWord->rot == 2 && w1 > 0) || - (curWord->rot == 3 && h1 > 0)) { - endWord(); - beginWord(state, x + dx, y + dy); - x1 += w1; - y1 += h1; - w1 = -w1; - h1 = -h1; - } - - // add the characters to the current word - w1 /= uLen; - h1 /= uLen; - for (i = 0; i < uLen; ++i) { - if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */ - if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) { - /* next code is a low surrogate */ - Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000; - i++; - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu); - } else { - /* missing low surrogate - replace it with REPLACEMENT CHARACTER (U+FFFD) */ - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 
0xfffd); - } - } else if (u[i] >= 0xdc00 && u[i] < 0xe000) { - /* invalid low surrogate - replace it with REPLACEMENT CHARACTER (U+FFFD) */ - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd); - } else { - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]); - } - } + // page rotation and/or transform matrices can cause text to be + // drawn in reverse order -- in this case, swap the begin/end + // coordinates and break text into individual chars + if ((curWord->rot == 0 && w1 < 0) || + (curWord->rot == 1 && h1 < 0) || + (curWord->rot == 2 && w1 > 0) || + (curWord->rot == 3 && h1 > 0)) { + endWord(); + beginWord(state, x + dx, y + dy); + x1 += w1; + y1 += h1; + w1 = -w1; + h1 = -h1; } + + curWord->addChar(state, x1, y1, w1, h1, charPos, nBytes, c, u, uLen); charPos += nBytes; } @@ -2500,7 +2536,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, rot*90, word0->link); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -2990,7 +3026,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -3005,7 +3041,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { for (blk = blkList; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for (word0 = line->words; word0; word0 = word0->next) { - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < 
word0->textLen; ++i) { if (unicodeTypeL(word0->text[i])) { ++lrCount; } else if (unicodeTypeR(word0->text[i])) { @@ -3050,7 +3086,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { } blk->col = col1; for (line = blk->lines; line; line = line->next) { - for (j = 0; j <= line->len; ++j) { + for (j = 0; j <= line->textLen; ++j) { line->col[j] += col1; } } @@ -3124,7 +3160,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { } blk0->col = col1; for (line = blk0->lines; line; line = line->next) { - for (j = 0; j <= line->len; ++j) { + for (j = 0; j <= line->textLen; ++j) { line->col[j] += col1; } } @@ -3144,7 +3180,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -3458,7 +3494,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (j = 0; j < word0->len; ++j) { + for (j = 0; j < word0->textLen; ++j) { fputc(word0->text[j] & 0xff, stdout); } printf("'\n"); @@ -3519,7 +3555,7 @@ void TextPage::coalesce(GBool physLayout, double fixedPitch, GBool doHTML) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { + for (i = 0; i < word0->textLen; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); @@ -3631,7 +3667,7 @@ GBool TextPage::findText(Unicode *s, int len, } if (!line->normalized) - 
line->normalized = unicodeNormalizeNFKC(line->text, line->len, + line->normalized = unicodeNormalizeNFKC(line->text, line->textLen, &line->normalized_len, &line->normalized_idx); // convert the line to uppercase @@ -3766,7 +3802,7 @@ GooString *TextPage::getText(double xMin, double yMin, int spaceLen, eolLen; int lastRot; double x, y, delta; - int col, idx0, idx1, i, j; + int col, idx0, idx1, i, j, k; GBool multiLine, oneRot; s = new GooString(); @@ -3787,8 +3823,12 @@ GooString *TextPage::getText(double xMin, double yMin, word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax); if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) { - mbc_len = uMap->mapUnicode( *(word->getChar(j)), mbc, sizeof(mbc) ); - s->append(mbc, mbc_len); + int uLen; + const Unicode *u = word->getChars(j, &uLen); + for (k = 0; k < uLen; k++) { + mbc_len = uMap->mapUnicode(u[k], mbc, sizeof(mbc)); + s->append(mbc, mbc_len); + } } } } @@ -4869,7 +4909,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, for (word = rawWords; word; word = word->next) { s = new GooString(); - dumpFragment(word->text, word->len, uMap, s); + dumpFragment(word->text, word->textLen, uMap, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; if (word->next && @@ -4977,7 +5017,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, for (flow = flows; flow; flow = flow->next) { for (blk = flow->blocks; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { - n = line->len; + n = line->textLen; if (line->hyphenated && (line->next || blk->next)) { --n; } @@ -4986,7 +5026,7 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; // output a newline when a hyphen is not suppressed - if (n == line->len) { + if (n == line->textLen) { (*outputFunc)(outputStream, eol, eolLen); } } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 
e31876b..eec7b79 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -122,7 +122,7 @@ public: // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, int charPosA, int charLen, - CharCode c, Unicode u); + CharCode c, Unicode *u, int uLen); // Merge <word> onto the end of <this>. void merge(TextWord *word); @@ -149,7 +149,7 @@ public: #if TEXTOUT_WORD_LIST int getLength() { return len; } - const Unicode *getChar(int idx) { return &text[idx]; } + const Unicode *getChars(int idx, int *length); GooString *getText(); GooString *getFontName() { return font->fontName; } void getColor(double *r, double *g, double *b) @@ -178,14 +178,17 @@ private: double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate Unicode *text; // the text + int textLen; // length of text array + int textSize; // size of text array + int *textOffset; // offsets to match each glyph to text CharCode *charcode; // glyph indices double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *charPos; // character position (within content stream) // of each char (plus one extra entry for // the last char) - int len; // length of text/edge/charPos arrays - int size; // size of text/edge/charPos arrays + int len; // length of textOffset/edge/charPos arrays + int size; // size of textOffset/edge/charPos arrays TextFontInfo *font; // font information double fontSize; // font size GBool spaceAfter; // set if there is a space between this @@ -299,10 +302,12 @@ private: TextWord *lastWord; // last word in this line Unicode *text; // Unicode text of the line, including // spaces between words + int *textOffset; // offsets to match each glyph to text + int textLen; // number of text Unicode chars double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *col; // starting column number of each Unicode char - int len; // number of Unicode chars 
+ int len; // number of displayed chars int convertedLen; // total number of converted characters GBool hyphenated; // set if last char is a hyphen TextLine *next; // next line in block -- 1.7.5.4