From c52a23f1c64f0dd243f73f68fa08c270529b4003 Mon Sep 17 00:00:00 2001 From: Peter Waller Date: Wed, 27 May 2015 22:02:28 +0100 Subject: [PATCH] Drop chars in TextOutputDev with no unicode CMap If the font has no unicode cmap, it's not possible to output text for that encoding, so rather than potentially corrupting the textual output stream, the characters are dropped. It may be possible to keep the characters if they happen to lie in the printable spectrum, but my first priority is to fix crashes where the glib API returns an inconsistent number of glyphs via poppler_page_get_text and poppler_page_get_text_layout. --- poppler/Gfx.cc | 6 ++++++ poppler/OutputDev.h | 4 ++++ poppler/TextOutputDev.h | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc index 07d95b3..130363d 100644 --- a/poppler/Gfx.cc +++ b/poppler/Gfx.cc @@ -3934,6 +3934,12 @@ void Gfx::doShowText(GooString *s) { int len, n, uLen, nChars, nSpaces, i; font = state->getFont(); + + if (out->needUnicodeText() && !font->hasToUnicodeCMap()) { + // No conversion to unicode available, drop characters. + return; + } + wMode = font->getWMode(); if (out->useDrawChar()) { diff --git a/poppler/OutputDev.h b/poppler/OutputDev.h index e8a7a47..7e63739 100644 --- a/poppler/OutputDev.h +++ b/poppler/OutputDev.h @@ -116,6 +116,10 @@ public: // Does this device need non-text content? virtual GBool needNonText() { return gTrue; } + // Does this device expect valid UTF-8 text? (i.e., discard characters for + // which we cannot determine UTF-8 equivalents due to a missing unicode mapping) + virtual GBool needUnicodeText() { return gFalse; } + // Does this device require incCharCount to be called for text on // non-shown layers? 
virtual GBool needCharCount() { return gFalse; } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index a0aa6f8..8bbd018 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -762,6 +762,10 @@ public: // Does this device need non-text content? virtual GBool needNonText() { return gFalse; } + // Does this device expect valid UTF-8 text? (i.e., discard characters for + // which we cannot determine UTF-8 equivalents due to a missing unicode mapping) + virtual GBool needUnicodeText() { return gTrue; } + // Does this device require incCharCount to be called for text on // non-shown layers? virtual GBool needCharCount() { return gTrue; } -- 1.9.1