From dfb016a63cb2ef8ae7330a55247aecbf5aaadf4f Mon Sep 17 00:00:00 2001 From: Adrian Johnson Date: Sun, 3 Dec 2017 20:27:08 +1030 Subject: [PATCH 05/13] Move UTF functions to goo/UTF.cc to allow UTF functions to be used in goo/gfile. --- CMakeLists.txt | 6 ++-- {poppler => goo}/UTF.cc | 84 ++++++++++++++++-------------------------------- {poppler => goo}/UTF.h | 17 +--------- goo/gtypes.h | 3 ++ poppler/CharTypes.h | 3 +- poppler/Outline.cc | 2 +- poppler/TextOutputDev.cc | 2 +- poppler/Unicode.cc | 79 +++++++++++++++++++++++++++++++++++++++++++++ poppler/Unicode.h | 32 ++++++++++++++++++ utils/JSInfo.cc | 2 +- utils/pdfinfo.cc | 2 +- 11 files changed, 152 insertions(+), 80 deletions(-) rename {poppler => goo}/UTF.cc (88%) rename {poppler => goo}/UTF.h (84%) create mode 100644 poppler/Unicode.cc create mode 100644 poppler/Unicode.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ed1398a..cf431978 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -323,6 +323,7 @@ set(poppler_SRCS goo/grandom.cc goo/glibc.cc goo/glibc_strtok_r.cc + goo/UTF.cc fofi/FoFiBase.cc fofi/FoFiEncodings.cc fofi/FoFiTrueType.cc @@ -379,7 +380,7 @@ set(poppler_SRCS poppler/StructElement.cc poppler/UnicodeMap.cc poppler/UnicodeTypeTable.cc - poppler/UTF.cc + poppler/Unicode.cc poppler/XRef.cc poppler/PSOutputDev.cc poppler/TextOutputDev.cc @@ -565,7 +566,7 @@ if(ENABLE_XPDF_HEADERS) poppler/SecurityHandler.h poppler/StdinCachedFile.h poppler/StdinPDFDocBuilder.h - poppler/UTF.h + poppler/Unicode.h poppler/XpdfPluginAPI.h poppler/Sound.h ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h @@ -584,6 +585,7 @@ if(ENABLE_XPDF_HEADERS) goo/GooLikely.h goo/gstrtod.h goo/grandom.h + goo/UTF.h DESTINATION include/poppler/goo) if(PNG_FOUND) install(FILES diff --git a/poppler/UTF.cc b/goo/UTF.cc similarity index 88% rename from poppler/UTF.cc rename to goo/UTF.cc index 90771943..2685bd06 100644 --- a/poppler/UTF.cc +++ b/goo/UTF.cc @@ -24,11 +24,8 @@ // 
//======================================================================== -#include "goo/gmem.h" -#include "PDFDocEncoding.h" +#include "gmem.h" #include "UTF.h" -#include "UnicodeMapFuncs.h" -#include <algorithm> bool UnicodeIsValid(Unicode ucs4) { @@ -85,53 +82,6 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4) return len; } -int TextStringToUCS4(GooString *textStr, Unicode **ucs4) -{ - int i, len; - const char *s; - Unicode *u; - - len = textStr->getLength(); - s = textStr->getCString(); - if (len == 0) { - *ucs4 = 0; - return 0; - } - - if (textStr->hasUnicodeMarker()) { - Unicode *utf16; - len = len/2 - 1; - if (len > 0) { - utf16 = new Unicode[len]; - for (i = 0 ; i < len; i++) { - utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff); - } - len = UTF16toUCS4(utf16, len, &u); - delete[] utf16; - } else { - u = NULL; - } - } else { - u = (Unicode*)gmallocn(len, sizeof(Unicode)); - for (i = 0 ; i < len; i++) { - u[i] = pdfDocEncoding[s[i] & 0xff]; - } - } - *ucs4 = u; - return len; -} - -bool UnicodeIsWhitespace(Unicode ucs4) -{ - static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, - 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, - 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, - 0x3000 }; - Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); - Unicode const *i = std::lower_bound(spaces, end, ucs4); - return (i != end && *i == ucs4); -} - // // decodeUtf8() and decodeUtf8Table are: // @@ -331,7 +281,7 @@ inline uint32_t decodeUtf16(uint32_t* state, uint32_t* codePoint, uint16_t codeU // UTF-8 (excluding terminating NULL). 
int utf16CountUtf8Bytes(const uint16_t *utf16) { - uint32_t codepoint; + uint32_t codepoint = 0; uint32_t state = 0; int count = 0; @@ -360,6 +310,30 @@ int utf16CountUtf8Bytes(const uint16_t *utf16) return count; } +static int Ucs4toUtf8(uint32_t codepoint, char *p) +{ + if (codepoint < 0x80) { + *p = (char)codepoint; + return 1; + } else if (codepoint < 0x800) { + *p++ = (char)(0xc0 + (codepoint >> 6)); + *p = (char)(0x80 + (codepoint & 0x3f)); + return 2; + } else if (codepoint < 0x10000) { + *p++ = (char)(0xe0 + (codepoint >> 12)); + *p++ = (char)(0x80 + ((codepoint >> 6) & 0x3f)); + *p = (char)(0x80 + (codepoint & 0x3f)); + return 3; + } else if (codepoint <= UCS4_MAX) { + *p++ = (char)(0xf0 + (codepoint >> 18)); + *p++ = (char)(0x80 + ((codepoint >> 12) & 0x3f)); + *p++ = (char)(0x80 + ((codepoint >> 6) & 0x3f)); + *p = (char)(0x80 + (codepoint & 0x3f)); + return 4; + } + return 0; +} + // Convert UTF-16 to UTF-8 // utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num // code units to convert @@ -383,8 +357,7 @@ int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16) state = 0; } - int bufSize = maxUtf8 - nOut; - int count = mapUTF8(codepoint, p, bufSize); + int count = Ucs4toUtf8(codepoint, p); p += count; nOut += count; } @@ -393,8 +366,7 @@ int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16) } // replace any trailing bytes too short for a valid UTF-8 with a replacement char if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) { - int bufSize = maxUtf8 - nOut; - int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize); + int count = Ucs4toUtf8(REPLACEMENT_CHAR, p); p += count; nOut += count; nOut++; diff --git a/poppler/UTF.h b/goo/UTF.h similarity index 84% rename from poppler/UTF.h rename to goo/UTF.h index bddb926d..58865a8d 100644 --- a/poppler/UTF.h +++ b/goo/UTF.h @@ -12,15 +12,10 @@ #ifndef UTF_H #define UTF_H -#ifdef USE_GCC_PRAGMAS -#pragma implementation -#endif 
- #include #include -#include "goo/GooString.h" -#include "CharTypes.h" +#include "gtypes.h" // Convert a UTF-16 string to a UCS-4 // utf16 - utf16 bytes @@ -29,19 +24,9 @@ // returns number of UCS-4 characters int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out); -// Convert a PDF Text String to UCS-4 -// s - PDF text string -// ucs4 - if the number of UCS-4 characters is > 0, allocates and -// returns UCS-4 string. Free with gfree. -// returns number of UCS-4 characters -int TextStringToUCS4(GooString *textStr, Unicode **ucs4); - // check if UCS-4 character is valid bool UnicodeIsValid(Unicode ucs4); -// is a unicode whitespace character -bool UnicodeIsWhitespace(Unicode ucs4); - // Count number of UTF-16 code units required to convert a UTF-8 string // (excluding terminating NULL). Each invalid byte is counted as a // code point since the UTF-8 conversion functions will replace it with diff --git a/goo/gtypes.h b/goo/gtypes.h index a8d45194..7bb2f904 100644 --- a/goo/gtypes.h +++ b/goo/gtypes.h @@ -49,4 +49,7 @@ typedef unsigned int Guint; typedef unsigned long Gulong; typedef long long Goffset; +// Unicode character. +typedef unsigned int Unicode; + #endif diff --git a/poppler/CharTypes.h b/poppler/CharTypes.h index d0df630d..d79ea345 100644 --- a/poppler/CharTypes.h +++ b/poppler/CharTypes.h @@ -9,8 +9,7 @@ #ifndef CHARTYPES_H #define CHARTYPES_H -// Unicode character. -typedef unsigned int Unicode; +#include "goo/gtypes.h" // Character ID for CID character collections. 
typedef unsigned int CID; diff --git a/poppler/Outline.cc b/poppler/Outline.cc index 82f28663..b262e6cf 100644 --- a/poppler/Outline.cc +++ b/poppler/Outline.cc @@ -37,7 +37,7 @@ #include "Link.h" #include "PDFDocEncoding.h" #include "Outline.h" -#include "UTF.h" +#include "Unicode.h" //------------------------------------------------------------------------ diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 9a77d050..d350c223 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -71,7 +71,7 @@ #include "TextOutputDev.h" #include "Page.h" #include "Annot.h" -#include "UTF.h" +#include "Unicode.h" #ifdef MACOS // needed for setting type/creator of MacOS files diff --git a/poppler/Unicode.cc b/poppler/Unicode.cc new file mode 100644 index 00000000..bd30daa0 --- /dev/null +++ b/poppler/Unicode.cc @@ -0,0 +1,79 @@ +//======================================================================== +// +// Unicode.cc +// +// Copyright 2001-2003 Glyph & Cog, LLC +// +//======================================================================== + +//======================================================================== +// +// Modified under the Poppler project - http://poppler.freedesktop.org +// +// All changes made under the Poppler project to this file are licensed +// under GPL version 2 or later +// +// Copyright (C) 2008 Koji Otani +// Copyright (C) 2012 Adrian Johnson +// Copyright (C) 2012 Hib Eris +// Copyright (C) 2016 Albert Astals Cid +// Copyright (C) 2016 Jason Crain +// +// To see a description of the changes please see the Changelog file that +// came with your tarball or type make ChangeLog if you are building from git +// +//======================================================================== + +#include <algorithm> + +#include "goo/gmem.h" +#include "PDFDocEncoding.h" +#include "Unicode.h" + + +int TextStringToUCS4(GooString *textStr, Unicode **ucs4) +{ + int i, len; + const char *s; + Unicode *u; + + len = textStr->getLength(); + s 
= textStr->getCString(); + if (len == 0) { + *ucs4 = 0; + return 0; + } + + if (textStr->hasUnicodeMarker()) { + Unicode *utf16; + len = len/2 - 1; + if (len > 0) { + utf16 = new Unicode[len]; + for (i = 0 ; i < len; i++) { + utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff); + } + len = UTF16toUCS4(utf16, len, &u); + delete[] utf16; + } else { + u = NULL; + } + } else { + u = (Unicode*)gmallocn(len, sizeof(Unicode)); + for (i = 0 ; i < len; i++) { + u[i] = pdfDocEncoding[s[i] & 0xff]; + } + } + *ucs4 = u; + return len; +} + +bool UnicodeIsWhitespace(Unicode ucs4) +{ + static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, + 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, + 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, + 0x3000 }; + Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); + Unicode const *i = std::lower_bound(spaces, end, ucs4); + return (i != end && *i == ucs4); +} diff --git a/poppler/Unicode.h b/poppler/Unicode.h new file mode 100644 index 00000000..29c4de7d --- /dev/null +++ b/poppler/Unicode.h @@ -0,0 +1,32 @@ +//======================================================================== +// +// Unicode.h +// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2012 Adrian Johnson +// Copyright (C) 2016 Jason Crain +// +//======================================================================== + +#ifndef UNICODE_H +#define UNICODE_H + +#include +#include + +#include "goo/GooString.h" +#include "goo/UTF.h" +#include "CharTypes.h" + +// Convert a PDF Text String to UCS-4 +// textStr - PDF text string +// ucs4 - if the number of UCS-4 characters is > 0, allocates and +// returns UCS-4 string. Free with gfree. 
+// returns number of UCS-4 characters +int TextStringToUCS4(GooString *textStr, Unicode **ucs4); + +// is a unicode whitespace character +bool UnicodeIsWhitespace(Unicode ucs4); + +#endif diff --git a/utils/JSInfo.cc b/utils/JSInfo.cc index 34f31d58..3eaa3955 100644 --- a/utils/JSInfo.cc +++ b/utils/JSInfo.cc @@ -23,7 +23,7 @@ #include "Link.h" #include "Form.h" #include "UnicodeMap.h" -#include "UTF.h" +#include "Unicode.h" #include "Win32Console.h" JSInfo::JSInfo(PDFDoc *docA, int firstPage) { diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index a3099cf6..bb715573 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -55,7 +55,7 @@ #include "PDFDocFactory.h" #include "CharTypes.h" #include "UnicodeMap.h" -#include "UTF.h" +#include "Unicode.h" #include "Error.h" #include "DateInfo.h" #include "JSInfo.h" -- 2.11.0