From f0c036c6cb79ba2009956ceaffd16a75b93faf67 Mon Sep 17 00:00:00 2001 From: Adrian Johnson Date: Sun, 4 Mar 2012 14:49:59 +1030 Subject: [PATCH] TextOutput: output all charcodes in an ActualText span Text within an ActualText span is not selectable unless all charcodes are output. The unicode values of the charcodes are replaced with the replacement text evenly distributed across the charcodes. --- poppler/TextOutputDev.cc | 115 +++++++++++++++++++++++++++------------------- poppler/TextOutputDev.h | 18 +++++-- 2 files changed, 79 insertions(+), 54 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 4a12e69..0cd461c 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -5245,12 +5245,16 @@ ActualText::ActualText(TextPage *out) { out->incRefCnt(); text = out; actualText = NULL; - actualTextNBytes = 0; + lenGlyphs = 0; + sizeGlyphs = 0; + glyphs = NULL; } ActualText::~ActualText() { if (actualText) delete actualText; + if (glyphs) + gfree(glyphs); text->decRefCnt(); } @@ -5263,67 +5267,82 @@ void ActualText::addChar(GfxState *state, double x, double y, } // Inside ActualText span. - if (!actualTextNBytes) { - actualTextX0 = x; - actualTextY0 = y; - } - actualTextX1 = x + dx; - actualTextY1 = y + dy; - actualTextNBytes += nBytes; + + if (lenGlyphs == sizeGlyphs) { + sizeGlyphs += 16; + glyphs = (Glyph *)greallocn(glyphs, sizeGlyphs, sizeof(Glyph)); + } + glyphs[lenGlyphs].x = x; + glyphs[lenGlyphs].y = y; + glyphs[lenGlyphs].dx = dx; + glyphs[lenGlyphs].dy = dy; + glyphs[lenGlyphs].c = c; + glyphs[lenGlyphs].nBytes = nBytes; + lenGlyphs++; } void ActualText::begin(GfxState *state, GooString *text) { if (actualText) delete actualText; actualText = new GooString(text); - actualTextNBytes = 0; + lenGlyphs = 0; } void ActualText::end(GfxState *state) { - // ActualText span closed. Output the span text and the - // extents of all the glyphs inside the span - - if (actualTextNBytes) { - char *uniString = NULL; - Unicode *uni; - int length, i; - - if (!actualText->hasUnicodeMarker()) { - if (actualText->getLength() > 0) { - //non-unicode string -- assume pdfDocEncoding and - //try to convert to UTF16BE - uniString = pdfDocEncodingToUTF16(actualText, &length); - } else { - length = 0; - } + // ActualText span closed. Output the glyphs that were output + // inside the span with the span text distributed across the glyphs. + // Note: if there are no glyphs in the span the replacement text will + // not be output. This is the same as acroread behavior. + + char *uniString = NULL; + Unicode *uni; + int length, i, first, count; + double pos; + + if (!actualText->hasUnicodeMarker()) { + if (actualText->getLength() > 0) { + //non-unicode string -- assume pdfDocEncoding and + //try to convert to UTF16BE + uniString = pdfDocEncodingToUTF16(actualText, &length); } else { - uniString = actualText->getCString(); - length = actualText->getLength(); - } - - if (length < 3) length = 0; - else - length = length/2 - 1; - uni = new Unicode[length]; - for (i = 0 ; i < length; i++) - uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff); - - // now that we have the position info for all of the text inside - // the marked content span, we feed the "ActualText" back through - // text->addChar() - text->addChar(state, actualTextX0, actualTextY0, - actualTextX1 - actualTextX0, actualTextY1 - actualTextY0, - 0, actualTextNBytes, uni, length); - - delete [] uni; - if (!actualText->hasUnicodeMarker()) - delete [] uniString; - } + } + } else { + uniString = actualText->getCString(); + length = actualText->getLength(); + } + + if (length < 3) + length = 0; + else + length = length/2 - 1; + uni = new Unicode[length]; + for (i = 0 ; i < length; i++) + uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff); + + // Output each glyph replacing the unicode values of glyphs with the + // replacement text. The replacement text is evenly distributed + // across the glyphs. + first = 0; + pos = 0.0; + for (i = 0; i < lenGlyphs; i++) { + pos += (double)length/lenGlyphs; + count = (int)pos - first; + // If this is the last glyph ensure all remaining text is included + // as pos may be < length due to rounding errors. + if (i == lenGlyphs - 1) + count = length - first; + text->addChar(state, glyphs[i].x, glyphs[i].y, glyphs[i].dx, glyphs[i].dy, + glyphs[i].c, glyphs[i].nBytes, uni + first, count); + first += count; + } + + delete [] uni; + if (!actualText->hasUnicodeMarker()) + delete [] uniString; delete actualText; actualText = NULL; - actualTextNBytes = 0; } //------------------------------------------------------------------------ diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index e017fbd..057509c 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -645,6 +645,7 @@ private: // ActualText //------------------------------------------------------------------------ + class ActualText { public: // Create an ActualText @@ -661,13 +662,18 @@ private: TextPage *text; GooString *actualText; // replacement text for the span - double actualTextX0; - double actualTextY0; - double actualTextX1; - double actualTextY1; - int actualTextNBytes; + int lenGlyphs; + int sizeGlyphs; + struct Glyph { + double x; + double y; + double dx; + double dy; + CharCode c; + int nBytes; + } *glyphs; }; - + //------------------------------------------------------------------------ // TextOutputDev -- 1.7.5.4