From dfb016a63cb2ef8ae7330a55247aecbf5aaadf4f Mon Sep 17 00:00:00 2001
From: Adrian Johnson <ajohnson@redneon.com>
Date: Sun, 3 Dec 2017 20:27:08 +1030
Subject: [PATCH 05/13] Move UTF functions to goo/UTF.cc

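Move the generic UTF conversion functions from poppler/UTF.{cc,h} to
goo/UTF.{cc,h} so that they can be used from goo/gfile.

TextStringToUCS4() and UnicodeIsWhitespace() stay in poppler/, in the new
poppler/Unicode.{cc,h}. The Unicode typedef moves from poppler/CharTypes.h
to goo/gtypes.h so that goo/UTF.h does not depend on poppler/.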
---
 CMakeLists.txt           |  6 ++--
 {poppler => goo}/UTF.cc  | 84 ++++++++++++++++--------------------------------
 {poppler => goo}/UTF.h   | 17 +---------
 goo/gtypes.h             |  3 ++
 poppler/CharTypes.h      |  3 +-
 poppler/Outline.cc       |  2 +-
 poppler/TextOutputDev.cc |  2 +-
 poppler/Unicode.cc       | 79 +++++++++++++++++++++++++++++++++++++++++++++
 poppler/Unicode.h        | 32 ++++++++++++++++++
 utils/JSInfo.cc          |  2 +-
 utils/pdfinfo.cc         |  2 +-
 11 files changed, 152 insertions(+), 80 deletions(-)
 rename {poppler => goo}/UTF.cc (88%)
 rename {poppler => goo}/UTF.h (84%)
 create mode 100644 poppler/Unicode.cc
 create mode 100644 poppler/Unicode.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ed1398a..cf431978 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -323,6 +323,7 @@ set(poppler_SRCS
   goo/grandom.cc
   goo/glibc.cc
   goo/glibc_strtok_r.cc
+  goo/UTF.cc
   fofi/FoFiBase.cc
   fofi/FoFiEncodings.cc
   fofi/FoFiTrueType.cc
@@ -379,7 +380,7 @@ set(poppler_SRCS
   poppler/StructElement.cc
   poppler/UnicodeMap.cc
   poppler/UnicodeTypeTable.cc
-  poppler/UTF.cc
+  poppler/Unicode.cc
   poppler/XRef.cc
   poppler/PSOutputDev.cc
   poppler/TextOutputDev.cc
@@ -565,7 +566,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/SecurityHandler.h
     poppler/StdinCachedFile.h
     poppler/StdinPDFDocBuilder.h
-    poppler/UTF.h
+    poppler/Unicode.h
     poppler/XpdfPluginAPI.h
     poppler/Sound.h
     ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
@@ -584,6 +585,7 @@ if(ENABLE_XPDF_HEADERS)
     goo/GooLikely.h
     goo/gstrtod.h
     goo/grandom.h
+    goo/UTF.h
     DESTINATION include/poppler/goo)
   if(PNG_FOUND)
     install(FILES
diff --git a/poppler/UTF.cc b/goo/UTF.cc
similarity index 88%
rename from poppler/UTF.cc
rename to goo/UTF.cc
index 90771943..2685bd06 100644
--- a/poppler/UTF.cc
+++ b/goo/UTF.cc
@@ -24,11 +24,8 @@
 //
 //========================================================================
 
-#include "goo/gmem.h"
-#include "PDFDocEncoding.h"
+#include "gmem.h"
 #include "UTF.h"
-#include "UnicodeMapFuncs.h"
-#include <algorithm>
 
 bool UnicodeIsValid(Unicode ucs4)
 {
@@ -85,53 +82,6 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
   return len;
 }
 
-int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
-{
-  int i, len;
-  const char *s;
-  Unicode *u;
-
-  len = textStr->getLength();
-  s = textStr->getCString();
-  if (len == 0) {
-    *ucs4 = 0;
-    return 0;
-  }
-
-  if (textStr->hasUnicodeMarker()) {
-    Unicode *utf16;
-    len = len/2 - 1;
-    if (len > 0) {
-      utf16 = new Unicode[len];
-      for (i = 0 ; i < len; i++) {
-        utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
-      }
-      len = UTF16toUCS4(utf16, len, &u);
-      delete[] utf16;
-    } else {
-      u = NULL;
-    }
-  } else {
-    u = (Unicode*)gmallocn(len, sizeof(Unicode));
-    for (i = 0 ; i < len; i++) {
-      u[i] = pdfDocEncoding[s[i] & 0xff];
-    }
-  }
-  *ucs4 = u;
-  return len;
-}
-
-bool UnicodeIsWhitespace(Unicode ucs4)
-{
-  static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D,
-    0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
-    0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F,
-    0x3000 };
-  Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]);
-  Unicode const *i = std::lower_bound(spaces, end, ucs4);
-  return (i != end && *i == ucs4);
-}
-
 //
 // decodeUtf8() and decodeUtf8Table are:
 //
@@ -331,7 +281,7 @@ inline uint32_t decodeUtf16(uint32_t* state, uint32_t* codePoint, uint16_t codeU
 // UTF-8 (excluding terminating NULL).
 int utf16CountUtf8Bytes(const uint16_t *utf16)
 {
-    uint32_t codepoint;
+    uint32_t codepoint = 0;
     uint32_t state = 0;
     int count = 0;
 
@@ -360,6 +310,44 @@ int utf16CountUtf8Bytes(const uint16_t *utf16)
     return count;
 }
 
+// Encode one UCS-4 code point as UTF-8.
+//  codepoint - code point to encode
+//  p         - destination; no terminating NULL is written
+//  bufSize   - number of bytes available at p
+//  returns number of bytes written, or 0 (nothing written) if the
+//  encoding does not fit in bufSize or codepoint is above UCS4_MAX
+static int Ucs4toUtf8(uint32_t codepoint, char *p, int bufSize)
+{
+  if (codepoint < 0x80) {
+    if (bufSize < 1)
+      return 0;
+    *p = (char)codepoint;
+    return 1;
+  } else if (codepoint < 0x800) {
+    if (bufSize < 2)
+      return 0;
+    *p++ = (char)(0xc0 + (codepoint >> 6));
+    *p =   (char)(0x80 + (codepoint & 0x3f));
+    return 2;
+  } else if (codepoint < 0x10000) {
+    if (bufSize < 3)
+      return 0;
+    *p++ = (char)(0xe0 + (codepoint >> 12));
+    *p++ = (char)(0x80 + ((codepoint >> 6) & 0x3f));
+    *p =   (char)(0x80 + (codepoint & 0x3f));
+    return 3;
+  } else if (codepoint <= UCS4_MAX) {
+    if (bufSize < 4)
+      return 0;
+    *p++ = (char)(0xf0 + (codepoint >> 18));
+    *p++ = (char)(0x80 + ((codepoint >> 12) & 0x3f));
+    *p++ = (char)(0x80 + ((codepoint >> 6) & 0x3f));
+    *p =   (char)(0x80 + (codepoint & 0x3f));
+    return 4;
+  }
+  return 0;
+}
+
 // Convert UTF-16 to UTF-8
 //  utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
 //        code units to convert
@@ -383,8 +371,8 @@ int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
 	  state = 0;
       }
 
       int bufSize = maxUtf8 - nOut;
-      int count = mapUTF8(codepoint, p, bufSize);
+      int count = Ucs4toUtf8(codepoint, p, bufSize);
       p += count;
       nOut += count;
     }
@@ -393,8 +381,8 @@ int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
   }
   // replace any trailing bytes too short for a valid UTF-8 with a replacement char
   if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) {
     int bufSize = maxUtf8 - nOut;
-    int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize);
+    int count = Ucs4toUtf8(REPLACEMENT_CHAR, p, bufSize);
     p += count;
     nOut += count;
     nOut++;
diff --git a/poppler/UTF.h b/goo/UTF.h
similarity index 84%
rename from poppler/UTF.h
rename to goo/UTF.h
index bddb926d..58865a8d 100644
--- a/poppler/UTF.h
+++ b/goo/UTF.h
@@ -12,15 +12,10 @@
 #ifndef UTF_H
 #define UTF_H
 
-#ifdef USE_GCC_PRAGMAS
-#pragma implementation
-#endif
-
 #include <cstdint>
 #include <climits>
 
-#include "goo/GooString.h"
-#include "CharTypes.h"
+#include "gtypes.h"
 
 // Convert a UTF-16 string to a UCS-4
 //   utf16      - utf16 bytes
@@ -29,19 +24,9 @@
 //   returns number of UCS-4 characters
 int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
 
-// Convert a PDF Text String to UCS-4
-//   s          - PDF text string
-//   ucs4       - if the number of UCS-4 characters is > 0, allocates and
-//                returns UCS-4 string. Free with gfree.
-//   returns number of UCS-4 characters
-int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
-
 // check if UCS-4 character is valid
 bool UnicodeIsValid(Unicode ucs4);
 
-// is a unicode whitespace character
-bool UnicodeIsWhitespace(Unicode ucs4);
-
 // Count number of UTF-16 code units required to convert a UTF-8 string
 // (excluding terminating NULL). Each invalid byte is counted as a
 // code point since the UTF-8 conversion functions will replace it with
diff --git a/goo/gtypes.h b/goo/gtypes.h
index a8d45194..7bb2f904 100644
--- a/goo/gtypes.h
+++ b/goo/gtypes.h
@@ -49,4 +49,7 @@ typedef unsigned int Guint;
 typedef unsigned long Gulong;
 typedef long long Goffset;
 
+// Unicode character.
+typedef unsigned int Unicode;
+
 #endif
diff --git a/poppler/CharTypes.h b/poppler/CharTypes.h
index d0df630d..d79ea345 100644
--- a/poppler/CharTypes.h
+++ b/poppler/CharTypes.h
@@ -9,8 +9,7 @@
 #ifndef CHARTYPES_H
 #define CHARTYPES_H
 
-// Unicode character.
-typedef unsigned int Unicode;
+#include "goo/gtypes.h"
 
 // Character ID for CID character collections.
 typedef unsigned int CID;
diff --git a/poppler/Outline.cc b/poppler/Outline.cc
index 82f28663..b262e6cf 100644
--- a/poppler/Outline.cc
+++ b/poppler/Outline.cc
@@ -37,7 +37,7 @@
 #include "Link.h"
 #include "PDFDocEncoding.h"
 #include "Outline.h"
-#include "UTF.h"
+#include "Unicode.h"
 
 //------------------------------------------------------------------------
 
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 9a77d050..d350c223 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -71,7 +71,7 @@
 #include "TextOutputDev.h"
 #include "Page.h"
 #include "Annot.h"
-#include "UTF.h"
+#include "Unicode.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
diff --git a/poppler/Unicode.cc b/poppler/Unicode.cc
new file mode 100644
index 00000000..bd30daa0
--- /dev/null
+++ b/poppler/Unicode.cc
@@ -0,0 +1,79 @@
+//========================================================================
+//
+// Unicode.cc
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
+// Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
+// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
+// Copyright (C) 2016 Albert Astals Cid <aacid@kde.org>
+// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#include <algorithm>
+
+#include "goo/gmem.h"
+#include "PDFDocEncoding.h"
+#include "Unicode.h"
+
+// Convert a PDF text string to UCS-4; see Unicode.h for the ownership contract.
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
+{
+  int i, len;
+  const char *s;
+  Unicode *u;
+
+  len = textStr->getLength();
+  s = textStr->getCString();
+  if (len == 0) {
+    *ucs4 = 0; // empty input: no buffer is allocated
+    return 0;
+  }
+
+  if (textStr->hasUnicodeMarker()) {
+    Unicode *utf16;
+    len = len/2 - 1; // 16-bit units after the 2-byte marker; an odd trailing byte is dropped
+    if (len > 0) {
+      utf16 = new Unicode[len];
+      for (i = 0 ; i < len; i++) {
+        utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff); // high byte first (big-endian)
+      }
+      len = UTF16toUCS4(utf16, len, &u); // len becomes the UCS-4 count returned by UTF16toUCS4
+      delete[] utf16;
+    } else {
+      u = NULL; // no complete 16-bit unit follows the marker
+    }
+  } else {
+    u = (Unicode*)gmallocn(len, sizeof(Unicode));
+    for (i = 0 ; i < len; i++) {
+      u[i] = pdfDocEncoding[s[i] & 0xff]; // & 0xff keeps the table index non-negative when char is signed
+    }
+  }
+  *ucs4 = u;
+  return len;
+}
+
+bool UnicodeIsWhitespace(Unicode ucs4)
+{
+  static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, // keep ascending: lower_bound below needs a sorted table
+    0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
+    0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F,
+    0x3000 };
+  Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); // one past the last table entry
+  Unicode const *i = std::lower_bound(spaces, end, ucs4); // binary search: first entry >= ucs4
+  return (i != end && *i == ucs4); // whitespace only on an exact table match
+}
diff --git a/poppler/Unicode.h b/poppler/Unicode.h
new file mode 100644
index 00000000..29c4de7d
--- /dev/null
+++ b/poppler/Unicode.h
@@ -0,0 +1,32 @@
+//========================================================================
+//
+// Unicode.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright (C) 2012 Adrian Johnson <ajohnson@redneon.com>
+// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
+//
+//========================================================================
+
+#ifndef UNICODE_H
+#define UNICODE_H
+
+#include <cstdint>
+#include <climits>
+
+#include "goo/GooString.h"
+#include "goo/UTF.h"
+#include "CharTypes.h"
+
+// Convert a PDF Text String to UCS-4
+//   textStr    - PDF text string
+//   ucs4       - if the number of UCS-4 characters is > 0, allocates and
+//                returns UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
+
+// is a unicode whitespace character
+bool UnicodeIsWhitespace(Unicode ucs4);
+
+#endif
diff --git a/utils/JSInfo.cc b/utils/JSInfo.cc
index 34f31d58..3eaa3955 100644
--- a/utils/JSInfo.cc
+++ b/utils/JSInfo.cc
@@ -23,7 +23,7 @@
 #include "Link.h"
 #include "Form.h"
 #include "UnicodeMap.h"
-#include "UTF.h"
+#include "Unicode.h"
 #include "Win32Console.h"
 
 JSInfo::JSInfo(PDFDoc *docA, int firstPage) {
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index a3099cf6..bb715573 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -55,7 +55,7 @@
 #include "PDFDocFactory.h"
 #include "CharTypes.h"
 #include "UnicodeMap.h"
-#include "UTF.h"
+#include "Unicode.h"
 #include "Error.h"
 #include "DateInfo.h"
 #include "JSInfo.h"
-- 
2.11.0