From 759c1d641f6a9b73b3f43a1dff19c53660bf389a Mon Sep 17 00:00:00 2001
From: Adrian Johnson
Date: Thu, 8 Mar 2012 20:52:28 +1030
Subject: [PATCH 1/4] Convert UTF-16 to UCS-4 when reading toUnicode cmap to
 ensure only UCS-4 values are used with the "Unicode" type.

---
 CMakeLists.txt               |    3 +-
 poppler/CairoOutputDev.cc    |    2 +-
 poppler/CharCodeToUnicode.cc |   14 ++++--
 poppler/GlobalParams.cc      |    2 +-
 poppler/Makefile.am          |    3 +-
 poppler/TextOutputDev.cc     |   19 +-------
 poppler/UTF.cc               |   47 +++++++++++++++++++
 poppler/UTF.h                |  103 ++++++++++++++++++++++++++++++++++++++++++
 poppler/UTF8.h               |   84 ----------------------------------
 utils/HtmlOutputDev.cc       |   14 +-----
 10 files changed, 167 insertions(+), 124 deletions(-)
 create mode 100644 poppler/UTF.cc
 create mode 100644 poppler/UTF.h
 delete mode 100644 poppler/UTF8.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 684b67a..8a32f3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -290,6 +290,7 @@ set(poppler_SRCS
   poppler/strtok_r.cpp
   poppler/UnicodeMap.cc
   poppler/UnicodeTypeTable.cc
+  poppler/UTF.cc
   poppler/XRef.cc
   poppler/PSOutputDev.cc
   poppler/TextOutputDev.cc
@@ -460,7 +461,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/SecurityHandler.h
     poppler/StdinCachedFile.h
     poppler/StdinPDFDocBuilder.h
-    poppler/UTF8.h
+    poppler/UTF.h
     poppler/XpdfPluginAPI.h
     poppler/Sound.h
     ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index 6652a35..2369890 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -61,7 +61,7 @@
 #include "CairoOutputDev.h"
 #include "CairoFontEngine.h"
 #include "CairoRescaleBox.h"
-#include "UTF8.h"
+#include "UTF.h"
 
 //------------------------------------------------------------------------
 // #define LOG_CAIRO
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index 076f5ba..bedb325 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -42,6 +42,7 @@
 #include "GlobalParams.h"
 #include "PSTokenizer.h"
 #include "CharCodeToUnicode.h"
+#include "UTF.h"
 
 //------------------------------------------------------------------------
 
@@ -452,15 +453,18 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
     }
     map[code] = 0;
     sMap[sMapLen].c = code;
-    sMap[sMapLen].len = n / 4;
-    sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode));
-    for (j = 0; j < sMap[sMapLen].len; ++j) {
-      if (!parseHex(uStr + j*4, 4, &sMap[sMapLen].u[j])) {
+    int utf16Len = n / 4;
+    Unicode *utf16 = (Unicode*)gmallocn(utf16Len, sizeof(Unicode));
+    for (j = 0; j < utf16Len; ++j) {
+      if (!parseHex(uStr + j*4, 4, &utf16[j])) {
+        gfree(utf16); // temporary buffer must not leak on the error return
         error(errSyntaxWarning, -1, "Illegal entry in ToUnicode CMap");
         return;
       }
     }
-    sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
+    utf16[utf16Len - 1] += offset;
+    sMap[sMapLen].len = UTF16toUCS4(utf16, utf16Len, &sMap[sMapLen].u);
+    gfree(utf16); // UTF16toUCS4 allocated its own output; release the UTF-16 scratch copy
     ++sMapLen;
   }
 }
diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc
index 73a9855..2f071b8 100644
--- a/poppler/GlobalParams.cc
+++ b/poppler/GlobalParams.cc
@@ -107,7 +107,7 @@
 
 #include "NameToUnicodeTable.h"
 #include "UnicodeMapTables.h"
-#include "UTF8.h"
+#include "UTF.h"
 
 #ifdef ENABLE_PLUGINS
 # ifdef _WIN32
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 767c518..ea72d35 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -245,7 +245,7 @@ poppler_include_HEADERS = \
         PSOutputDev.h \
         TextOutputDev.h \
         SecurityHandler.h \
-        UTF8.h \
+        UTF.h \
         XpdfPluginAPI.h \
         Sound.h
 nodist_poppler_include_HEADERS = poppler-config.h
@@ -311,6 +311,7 @@ libpoppler_la_SOURCES = \
         strtok_r.cpp \
         UnicodeMap.cc \
         UnicodeTypeTable.cc \
+        UTF.cc \
         ViewerPreferences.cc \
         XRef.cc \
         PSOutputDev.cc \
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 531617d..79e4ae4 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2391,24 +2391,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
       w1 /= uLen;
       h1 /= uLen;
       for (i = 0; i < uLen; ++i) {
-        if
(u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
-          if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
-            /* next code is a low surrogate */
-            Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
-            i++;
-            curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
-          } else {
-            /* missing low surrogate
-             replace it with REPLACEMENT CHARACTER (U+FFFD) */
-            curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
-          }
-        } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
-          /* invalid low surrogate
-           replace it with REPLACEMENT CHARACTER (U+FFFD) */
-          curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
-        } else {
-          curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
-        }
+        curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
       }
     }
     charPos += nBytes;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
new file mode 100644
index 0000000..b5f7d9f
--- /dev/null
+++ b/poppler/UTF.cc
@@ -0,0 +1,47 @@
+#include "goo/gmem.h"
+#include "UTF.h"
+
+int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
+{
+  int i, n, len;
+  Unicode *u;
+
+  // first pass: count output characters (a valid surrogate pair yields one)
+  len = 0;
+  for (i = 0; i < utf16Len; i++) {
+    if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
+        utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+      i++; /* surrogate pair */
+    }
+    len++;
+  }
+  if (ucs4 == NULL)
+    return len;
+
+  u = (Unicode*)gmallocn(len, sizeof(Unicode));
+  n = 0;
+  // second pass: decode, replacing unpaired surrogates with U+FFFD
+  for (i = 0; i < utf16Len; i++) {
+    if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
+      if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+        /* next code is a low surrogate */
+        u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
+        ++i;
+      } else {
+        /* missing low surrogate
+         replace it with REPLACEMENT CHARACTER (U+FFFD) */
+        u[n] = 0xfffd;
+      }
+    } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
+      /* invalid low surrogate
+       replace it with REPLACEMENT CHARACTER (U+FFFD) */
+      u[n] = 0xfffd;
+    } else {
+      u[n] = utf16[i];
+    }
+    n++;
+  }
+  *ucs4 = u;
+  return len;
+}
+
diff --git a/poppler/UTF.h b/poppler/UTF.h
new file mode 100644
index 0000000..d0ef5bc
--- /dev/null
+++ b/poppler/UTF.h
@@ -0,0 +1,103 @@
+//========================================================================
+//
+// UTF.h
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#ifndef UTF_H
+#define UTF_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "CharTypes.h"
+
+// Convert a UTF-16 string to UCS-4; unpaired surrogates become U+FFFD.
+//   utf16      - UTF-16 code units, one per array element
+//   utf16_len  - number of UTF-16 code units in utf16
+//   ucs4_out   - if not NULL, allocates and returns UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters
+int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
+
+// Encode u as UTF-8 into buf; returns bytes written, or 0 if buf is too small or u > 0x10ffff.
+static inline int mapUTF8(Unicode u, char *buf, int bufSize) {
+  if (u <= 0x0000007f) {
+    if (bufSize < 1) {
+      return 0;
+    }
+    buf[0] = (char)u;
+    return 1;
+  } else if (u <= 0x000007ff) {
+    if (bufSize < 2) {
+      return 0;
+    }
+    buf[0] = (char)(0xc0 + (u >> 6));
+    buf[1] = (char)(0x80 + (u & 0x3f));
+    return 2;
+  } else if (u <= 0x0000ffff) {
+    if (bufSize < 3) {
+      return 0;
+    }
+    buf[0] = (char)(0xe0 + (u >> 12));
+    buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
+    buf[2] = (char)(0x80 + (u & 0x3f));
+    return 3;
+  } else if (u <= 0x0010ffff) {
+    if (bufSize < 4) {
+      return 0;
+    }
+    buf[0] = (char)(0xf0 + (u >> 18));
+    buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
+    buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
+    buf[3] = (char)(0x80 + (u & 0x3f));
+    return 4;
+  } else {
+    return 0;
+  }
+}
+
+// Encode u as big-endian UTF-16 into buf (2 bytes, or 4 for a surrogate pair); returns bytes written, 0 on failure.
+static inline int mapUCS2(Unicode u, char *buf, int bufSize) {
+  if (u <= 0xffff) {
+    if (bufSize < 2) {
+      return 0;
+    }
+    buf[0] = (char)((u >> 8) & 0xff);
+    buf[1] = (char)(u & 0xff);
+    return 2;
+  } else if (u < 0x110000) {
+    Unicode uu;
+    /* using surrogate pair */
+    if (bufSize < 4) {
+      return 0;
+    }
+    uu = ((u - 0x10000) >> 10) + 0xd800;
+    buf[0] = (char)((uu >> 8) & 0xff);
+    buf[1] = (char)(uu & 0xff);
+    uu = (u & 0x3ff)+0xdc00;
+    buf[2] = (char)((uu >> 8) & 0xff);
+    buf[3] = (char)(uu & 0xff);
+    return 4;
+  } else {
+    return 0;
+  }
+}
+
+#endif
diff --git a/poppler/UTF8.h b/poppler/UTF8.h
deleted file mode 100644
index 34a07d4..0000000
--- a/poppler/UTF8.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//========================================================================
-//
-// UTF8.h
-//
-// Copyright 2001-2003 Glyph & Cog, LLC
-//
-//========================================================================
-
-//========================================================================
-//
-// Modified under the Poppler project - http://poppler.freedesktop.org
-//
-// All changes made
under the Poppler project to this file are licensed -// under GPL version 2 or later -// -// Copyright (C) 2008 Koji Otani -// -// To see a description of the changes please see the Changelog file that -// came with your tarball or type make ChangeLog if you are building from git -// -//======================================================================== - -static int mapUTF8(Unicode u, char *buf, int bufSize) { - if (u <= 0x0000007f) { - if (bufSize < 1) { - return 0; - } - buf[0] = (char)u; - return 1; - } else if (u <= 0x000007ff) { - if (bufSize < 2) { - return 0; - } - buf[0] = (char)(0xc0 + (u >> 6)); - buf[1] = (char)(0x80 + (u & 0x3f)); - return 2; - } else if (u <= 0x0000ffff) { - if (bufSize < 3) { - return 0; - } - buf[0] = (char)(0xe0 + (u >> 12)); - buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[2] = (char)(0x80 + (u & 0x3f)); - return 3; - } else if (u <= 0x0010ffff) { - if (bufSize < 4) { - return 0; - } - buf[0] = (char)(0xf0 + (u >> 18)); - buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); - buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[3] = (char)(0x80 + (u & 0x3f)); - return 4; - } else { - return 0; - } -} - -static int mapUCS2(Unicode u, char *buf, int bufSize) { - if (u <= 0xffff) { - if (bufSize < 2) { - return 0; - } - buf[0] = (char)((u >> 8) & 0xff); - buf[1] = (char)(u & 0xff); - return 2; - } else if (u < 0x110000) { - Unicode uu; - - /* using surrogate pair */ - if (bufSize < 4) { - return 0; - } - uu = ((u - 0x10000) >> 10) + 0xd800; - buf[0] = (char)((uu >> 8) & 0xff); - buf[1] = (char)(uu & 0xff); - uu = (u & 0x3ff)+0xdc00; - buf[2] = (char)((uu >> 8) & 0xff); - buf[3] = (char)(uu & 0xff); - return 4; - } else { - return 0; - } -} diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 17541a2..9e113eb 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -398,19 +398,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y, h1 /= uLen; } for (i = 0; i < uLen; ++i) { - Unicode u1 = u[i]; - if (u1 >= 
0xd800 && u1 <= 0xdbff && i < uLen) { - // surrogate pair - const Unicode u2 = u[i + 1]; - if (u2 >= 0xdc00 && u2 <= 0xdfff) { - u1 = 0x10000 + ((u1 - 0xd800) << 10) + (u2 - 0xdc00); - - curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1); - } - ++i; - } else { - curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1); - } + curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); } } -- 1.7.5.4