From 5a86cff1508065ba72a15651fe38de68fb2b17a6 Mon Sep 17 00:00:00 2001
From: Adrian Johnson
Date: Thu, 8 Mar 2012 21:21:07 +1030
Subject: [PATCH 2/4] Move text string to unicode conversion into a separate function

This also ensures UTF-16 ActualText strings are converted to UCS-4
before calling addChar.
---
Review notes (reviewer edits on top of the submitted patch):
 * TextStringToUCS4() now stores NULL in *ucs4 when the string is empty.
   Before, the early "return 0" left the caller's pointer uninitialised
   and ActualText::end() passed it to gfree().
 * "delete utf16" became "delete[] utf16" to match "new Unicode[len]".
 * pdfDocEncoding is indexed with (s[i] & 0xff), as the pdfinfo code
   being removed did, so bytes >= 0x80 cannot give a negative index
   where char is signed.
 * printInfoString() now releases the converted buffer with gfree().
   pdfinfo.cc gains an include of goo/gmem.h for that.
   NOTE(review): the file may already include it; a duplicate is harmless.
 * NOTE(review): indentation of context/removed lines was re-created
   with spaces and the "index" hashes were not regenerated. Apply with
   --ignore-whitespace or regenerate against the tree. The From: line
   carries no e-mail address.

 goo/GooString.cc         |    2 +-
 poppler/TextOutputDev.cc |   32 ++++----------------------------
 poppler/UTF.cc           |   39 +++++++++++++++++++++++++++++++++++++++
 poppler/UTF.h            |    9 +++++++++
 utils/pdfinfo.cc         |   39 +++++++--------------------------------
 5 files changed, 61 insertions(+), 60 deletions(-)

diff --git a/goo/GooString.cc b/goo/GooString.cc
index fc78d90..2bc85e7 100644
--- a/goo/GooString.cc
+++ b/goo/GooString.cc
@@ -854,7 +854,7 @@ int GooString::cmpN(const char *sA, int n) const {
 
 GBool GooString::hasUnicodeMarker(void)
 {
-  return (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
+  return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
 }
 
 GooString *GooString::sanitizedName(GBool psmode)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 79e4ae4..332d7ee 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -62,7 +62,7 @@
 #include "TextOutputDev.h"
 #include "Page.h"
 #include "Annot.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -5223,41 +5223,17 @@ void ActualText::end(GfxState *state) {
   // extents of all the glyphs inside the span
 
   if (actualTextNBytes) {
-    char *uniString = NULL;
     Unicode *uni;
-    int length, i;
-
-    if (!actualText->hasUnicodeMarker()) {
-      if (actualText->getLength() > 0) {
-        //non-unicode string -- assume pdfDocEncoding and
-        //try to convert to UTF16BE
-        uniString = pdfDocEncodingToUTF16(actualText, &length);
-      } else {
-        length = 0;
-      }
-    } else {
-      uniString = actualText->getCString();
-      length = actualText->getLength();
-    }
-
-    if (length < 3)
-      length = 0;
-    else
-      length = length/2 - 1;
-    uni = new Unicode[length];
-    for (i = 0 ; i < length; i++)
-      uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff);
+    int length;
 
     // now that we have the position info for all of the text inside
     // the marked content span, we feed the "ActualText" back through
     // text->addChar()
+    length = TextStringToUCS4(actualText, &uni);
     text->addChar(state, actualTextX0, actualTextY0,
                   actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
                   0, actualTextNBytes, uni, length);
-
-    delete [] uni;
-    if (!actualText->hasUnicodeMarker())
-      delete [] uniString;
+    gfree(uni); // uni is NULL when the ActualText string is empty
   }
 
   delete actualText;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index b5f7d9f..0642d04 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,4 +1,5 @@
 #include "goo/gmem.h"
+#include "PDFDocEncoding.h"
 #include "UTF.h"
 
 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
@@ -45,3 +46,41 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
   return len;
 }
 
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
+{
+  int i, len;
+  const char *s;
+  Unicode *u;
+
+  len = textStr->getLength();
+  s = textStr->getCString();
+  if (len == 0) {
+    // callers gfree() the result unconditionally, so never leave it unset
+    *ucs4 = NULL;
+    return 0;
+  }
+
+  if (textStr->hasUnicodeMarker()) {
+    Unicode *utf16;
+    // drop the 2-byte marker; an odd trailing byte is ignored
+    len = len/2 - 1;
+    if (len > 0) {
+      utf16 = new Unicode[len];
+      for (i = 0 ; i < len; i++) {
+        utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
+      }
+      len = UTF16toUCS4(utf16, len, &u);
+      delete[] utf16;
+    } else {
+      u = NULL;
+    }
+  } else {
+    u = (Unicode*)gmallocn(len, sizeof(Unicode));
+    for (i = 0 ; i < len; i++) {
+      // char may be signed: index the table by byte value
+      u[i] = pdfDocEncoding[s[i] & 0xff];
+    }
+  }
+  *ucs4 = u;
+  return len;
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index d0ef5bc..ec51e5a 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -27,6 +27,7 @@
 #pragma implementation
 #endif
 
+#include "goo/GooString.h"
 #include "CharTypes.h"
 
 // Convert a UTF-16 string to a UCS-4
@@ -36,6 +37,14 @@
 //   returns number of UCS-4 characters
 int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
 
+// Convert a PDF Text String to UCS-4
+//   textStr    - PDF text string (UTF-16BE if it starts with the FE FF
+//                marker, otherwise PDFDocEncoding)
+//   ucs4       - always set: an allocated UCS-4 string, or NULL when the
+//                text string holds no characters. Free with gfree.
+//   returns number of UCS-4 characters
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
+
 static int mapUTF8(Unicode u, char *buf, int bufSize)
 {
   if (u <= 0x0000007f) {
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index cdc5375..d1c077b 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -48,7 +48,8 @@
 #include "PDFDocFactory.h"
 #include "CharTypes.h"
 #include "UnicodeMap.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
+#include "goo/gmem.h"
 #include "Error.h"
 #include "DateInfo.h"
 
@@ -379,41 +380,17 @@ static void printInfoString(Dict *infoDict, const char *key, const char *text,
                             UnicodeMap *uMap) {
   Object obj;
   GooString *s1;
-  GBool isUnicode;
-  Unicode u, u2;
+  Unicode *u;
   char buf[8];
-  int i, n;
+  int i, n, len;
 
   if (infoDict->lookup(key, &obj)->isString()) {
     fputs(text, stdout);
     s1 = obj.getString();
-    if ((s1->getChar(0) & 0xff) == 0xfe &&
-        (s1->getChar(1) & 0xff) == 0xff) {
-      isUnicode = gTrue;
-      i = 2;
-    } else {
-      isUnicode = gFalse;
-      i = 0;
-    }
-    while (i < obj.getString()->getLength()) {
-      if (isUnicode) {
-        u = ((s1->getChar(i) & 0xff) << 8) |
-            (s1->getChar(i+1) & 0xff);
-        i += 2;
-        if (u >= 0xd800 && u <= 0xdbff && i < obj.getString()->getLength()) {
-          // surrogate pair
-          u2 = ((s1->getChar(i) & 0xff) << 8) |
-            (s1->getChar(i+1) & 0xff);
-          i += 2;
-          if (u2 >= 0xdc00 && u2 <= 0xdfff) {
-            u = 0x10000 + ((u - 0xd800) << 10) + (u2 - 0xdc00);
-          }
-        }
-      } else {
-        u = pdfDocEncoding[s1->getChar(i) & 0xff];
-        ++i;
-      }
-      n = uMap->mapUnicode(u, buf, sizeof(buf));
+    len = TextStringToUCS4(s1, &u);
+    for (i = 0; i < len; i++) {
+      n = uMap->mapUnicode(u[i], buf, sizeof(buf));
       fwrite(buf, 1, n, stdout);
     }
+    gfree(u);
     fputc('\n', stdout);
-- 
1.7.5.4