From 7d5cdb202d106513cb78caae410a92a109a76956 Mon Sep 17 00:00:00 2001 From: Adrian Johnson Date: Mon, 27 Dec 2010 10:29:10 +1030 Subject: [PATCH] TextOutput: Don't reverse chars mapping to a single glyph when reversing glyphs https://bugs.freedesktop.org/show_bug.cgi?id=32522 --- poppler/TextOutputDev.cc | 97 ++++++++++++++++++++++++++++++++++------------ poppler/TextOutputDev.h | 14 ++++++- 2 files changed, 85 insertions(+), 26 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 576bcc9..5e5c02a 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -293,8 +293,9 @@ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, } text = NULL; charcode = NULL; + chars_per_glyph = NULL; edge = NULL; - len = size = 0; + len = size = num_glyphs = 0; spaceAfter = gFalse; next = NULL; @@ -318,15 +319,25 @@ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, TextWord::~TextWord() { gfree(text); gfree(charcode); + gfree(chars_per_glyph); gfree(edge); } +void TextWord::beginGlyph() { + char_start = len; +} + +void TextWord::endGlyph() { + chars_per_glyph[num_glyphs++] = len - char_start; +} + void TextWord::addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, Unicode u) { if (len == size) { size += 16; text = (Unicode *)greallocn(text, size, sizeof(Unicode)); charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode)); + chars_per_glyph = (int *)greallocn(chars_per_glyph, size, sizeof(int)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); } text[len] = u; @@ -609,6 +620,7 @@ TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) { base = baseA; words = lastWord = NULL; text = NULL; + chars_per_glyph = NULL; edge = NULL; col = NULL; len = 0; @@ -631,6 +643,7 @@ TextLine::~TextLine() { delete word; } gfree(text); + gfree(chars_per_glyph); gfree(edge); gfree(col); if (normalized) { @@ -806,18 +819,26 @@ void TextLine::coalesce(UnicodeMap *uMap) { } } text 
= (Unicode *)gmallocn(len, sizeof(Unicode)); + chars_per_glyph = (int *)gmallocn(len, sizeof(int)); edge = (double *)gmallocn(len + 1, sizeof(double)); i = 0; + num_glyphs = 0; for (word1 = words; word1; word1 = word1->next) { for (j = 0; j < word1->len; ++j) { text[i] = word1->text[j]; edge[i] = word1->edge[j]; ++i; } + for (j = 0; j < word1->num_glyphs; ++j) { + chars_per_glyph[num_glyphs] = word1->chars_per_glyph[j]; + ++num_glyphs; + } edge[i] = word1->edge[word1->len]; if (word1->spaceAfter) { text[i] = (Unicode)0x0020; ++i; + chars_per_glyph[num_glyphs] = 1; + ++num_glyphs; } } @@ -2270,6 +2291,7 @@ void TextPage::addChar(GfxState *state, double x, double y, // add the characters to the current word w1 /= uLen; h1 /= uLen; + curWord->beginGlyph(); for (i = 0; i < uLen; ++i) { if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */ if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) { @@ -2290,6 +2312,7 @@ void TextPage::addChar(GfxState *state, double x, double y, curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]); } } + curWord->endGlyph(); } if (curWord) { curWord->charLen += nBytes; @@ -3815,7 +3838,8 @@ GooString *TextPage::getText(double xMin, double yMin, } // get the fragment text - col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, + frag->line->chars_per_glyph, frag->line->num_glyphs, s); } if (multiLine) { @@ -3994,7 +4018,9 @@ GooString *TextSelectionDumper::getText (void) actual_line = 0; } - page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line))); + page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, + frag->line->chars_per_glyph, frag->line->num_glyphs, + ((GooString*) strings->get (actual_line))); last_length = frag->len; if (!frag->line->blk->tableEnd) { @@ -4003,7 +4029,8 @@ GooString *TextSelectionDumper::getText (void) } // not a table else { - 
page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s); + page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, + frag->line->chars_per_glyph, frag->line->num_glyphs, s); s->append (eol, eolLen); } } @@ -4710,7 +4737,8 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, for (word = rawWords; word; word = word->next) { s = new GooString(); - dumpFragment(word->text, word->len, uMap, s); + dumpFragment(word->text, word->len, uMap, + word->chars_per_glyph, word->num_glyphs, s); /* text comes from word, so the glyph map must too */ (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; if (word->next && @@ -4782,7 +4810,8 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, // print the line s = new GooString(); - col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, + frag->line->chars_per_glyph, frag->line->num_glyphs, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; @@ -4821,7 +4850,8 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, --n; } s = new GooString(); - dumpFragment(line->text, n, uMap, s); + dumpFragment(line->text, n, uMap, + line->chars_per_glyph, line->num_glyphs, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; // output a newline when a hyphen is not suppressed @@ -4939,12 +4969,18 @@ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) { } int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, + int *chars_per_glyph, int num_glyphs, GooString *s) { char lre[8], rle[8], popdf[8], buf[8]; int lreLen, rleLen, popdfLen, n; - int nCols, i, j, k; + int nCols, i, j, k, m; + int *glyph_offsets; nCols = 0; + glyph_offsets = (int*)gmallocn(num_glyphs + 1, sizeof(int)); /* +1 keeps slot 0 writable when num_glyphs is 0 */ + glyph_offsets[0] = 0; + for (i = 1; i < num_glyphs; i++) + glyph_offsets[i] = glyph_offsets[i-1] + chars_per_glyph[i-1]; if (uMap->isUnicode()) { @@ 
-4955,23 +4991,28 @@ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, if (primaryLR) { i = 0; - while (i < len) { + while (i < num_glyphs) { // output a left-to-right section - for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; + for (j = i; j < num_glyphs && !unicodeTypeR(text[glyph_offsets[j]]); ++j) ; for (k = i; k < j; ++k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; + for (m = 0; m < chars_per_glyph[k]; m++) { + n = uMap->mapUnicode(text[glyph_offsets[k]+m], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } } i = j; // output a right-to-left section - for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ; + for (j = i; j < num_glyphs && !unicodeTypeL(text[glyph_offsets[j]]); ++j) ; if (j > i) { s->append(rle, rleLen); for (k = j - 1; k >= i; --k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; + for (m = 0; m < chars_per_glyph[k]; m++) { + // don't reverse chars mapped to a single glyph + n = uMap->mapUnicode(text[glyph_offsets[k]+m], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } } s->append(popdf, popdfLen); i = j; @@ -4981,14 +5022,17 @@ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, } else { s->append(rle, rleLen); - i = len - 1; + i = num_glyphs - 1; while (i >= 0) { // output a right-to-left section - for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ; + for (j = i; j >= 0 && !unicodeTypeL(text[glyph_offsets[j]]); --j) ; for (k = i; k > j; --k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; + for (m = 0; m < chars_per_glyph[k]; m++) { + // don't reverse chars mapped to a single glyph + n = uMap->mapUnicode(text[glyph_offsets[k]+m], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } } i = j; // output a left-to-right section @@ -4996,9 +5040,11 @@ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, if (j < i) { s->append(lre, lreLen); for (k = j + 1; k <= i; ++k) 
{ - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; + for (m = 0; m < chars_per_glyph[k]; m++) { + n = uMap->mapUnicode(text[glyph_offsets[k]+m], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } } s->append(popdf, popdfLen); i = j; @@ -5015,6 +5061,7 @@ int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, nCols += n; } } + gfree(glyph_offsets); return nCols; } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 438aee4..71fad10 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -115,6 +115,10 @@ public: // Destructor. ~TextWord(); + // define the glyph boundaries for glyphs that map to multiple characters + void beginGlyph(); + void endGlyph(); + // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, Unicode u); @@ -174,10 +178,14 @@ private: double base; // baseline x or y coordinate Unicode *text; // the text CharCode *charcode; // glyph indices + int *chars_per_glyph; // each element contains the number of unicode + // characters from text that maps to one glyph double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int len; // length of text and edge arrays int size; // size of text and edge arrays + int num_glyphs; // num entries in chars_per_glyph array + int char_start; int charPos; // character position (within content stream) int charLen; // number of content stream characters in // this word @@ -294,10 +302,13 @@ private: TextWord *lastWord; // last word in this line Unicode *text; // Unicode text of the line, including // spaces between words + int *chars_per_glyph; // each element contains the number of unicode + // characters from text that maps to one glyph double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *col; // starting column number of each Unicode char int len; // number of Unicode chars + int 
num_glyphs; // num entries in chars_per_glyph array int convertedLen; // total number of converted characters GBool hyphenated; // set if last char is a hyphen TextLine *next; // next line in block @@ -580,7 +591,8 @@ private: void clear(); void assignColumns(TextLineFrag *frags, int nFrags, GBool rot); - int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s); + int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, + int *chars_per_glyph, int num_glyphs, GooString *s); GBool rawOrder; // keep text in content stream order -- 1.5.6.5