From 759c1d641f6a9b73b3f43a1dff19c53660bf389a Mon Sep 17 00:00:00 2001
From: Adrian Johnson <ajohnson@redneon.com>
Date: Thu, 8 Mar 2012 20:52:28 +1030
Subject: [PATCH 1/4] Convert UTF-16 to UCS-4 when reading toUnicode cmap

to ensure only UCS-4 values are used with the "Unicode" type.
---
 CMakeLists.txt               |    3 +-
 poppler/CairoOutputDev.cc    |    2 +-
 poppler/CharCodeToUnicode.cc |   12 +++--
 poppler/GlobalParams.cc      |    2 +-
 poppler/Makefile.am          |    3 +-
 poppler/TextOutputDev.cc     |   19 +-------
 poppler/UTF.cc               |   47 +++++++++++++++++++
 poppler/UTF.h                |  103 ++++++++++++++++++++++++++++++++++++++++++
 poppler/UTF8.h               |   84 ----------------------------------
 utils/HtmlOutputDev.cc       |   14 +-----
 10 files changed, 165 insertions(+), 124 deletions(-)
 create mode 100644 poppler/UTF.cc
 create mode 100644 poppler/UTF.h
 delete mode 100644 poppler/UTF8.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 684b67a..8a32f3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -290,6 +290,7 @@ set(poppler_SRCS
   poppler/strtok_r.cpp
   poppler/UnicodeMap.cc
   poppler/UnicodeTypeTable.cc
+  poppler/UTF.cc
   poppler/XRef.cc
   poppler/PSOutputDev.cc
   poppler/TextOutputDev.cc
@@ -460,7 +461,7 @@ if(ENABLE_XPDF_HEADERS)
     poppler/SecurityHandler.h
     poppler/StdinCachedFile.h
     poppler/StdinPDFDocBuilder.h
-    poppler/UTF8.h
+    poppler/UTF.h
     poppler/XpdfPluginAPI.h
     poppler/Sound.h
     ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h
diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc
index 6652a35..2369890 100644
--- a/poppler/CairoOutputDev.cc
+++ b/poppler/CairoOutputDev.cc
@@ -61,7 +61,7 @@
 #include "CairoOutputDev.h"
 #include "CairoFontEngine.h"
 #include "CairoRescaleBox.h"
-#include "UTF8.h"
+#include "UTF.h"
 //------------------------------------------------------------------------
 
 // #define LOG_CAIRO
diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc
index 076f5ba..bedb325 100644
--- a/poppler/CharCodeToUnicode.cc
+++ b/poppler/CharCodeToUnicode.cc
@@ -42,6 +42,7 @@
 #include "GlobalParams.h"
 #include "PSTokenizer.h"
 #include "CharCodeToUnicode.h"
+#include "UTF.h"
 
 //------------------------------------------------------------------------
 
@@ -452,15 +453,20 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n,
     }
     map[code] = 0;
     sMap[sMapLen].c = code;
-    sMap[sMapLen].len = n / 4;
-    sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode));
-    for (j = 0; j < sMap[sMapLen].len; ++j) {
-      if (!parseHex(uStr + j*4, 4, &sMap[sMapLen].u[j])) {
+    // parse uStr four hex digits at a time into a scratch UTF-16 buffer
+    int utf16Len = n / 4;
+    Unicode *utf16 = (Unicode*)gmallocn(utf16Len, sizeof(Unicode));
+    for (j = 0; j < utf16Len; ++j) {
+      if (!parseHex(uStr + j*4, 4, &utf16[j])) {
+	gfree(utf16);
 	error(errSyntaxWarning, -1, "Illegal entry in ToUnicode CMap");
 	return;
       }
     }
-    sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset;
+    utf16[utf16Len - 1] += offset;
+    sMap[sMapLen].len = UTF16toUCS4(utf16, utf16Len, &sMap[sMapLen].u);
+    // UTF16toUCS4 allocates its own output, so the scratch buffer is ours to free
+    gfree(utf16);
     ++sMapLen;
   }
 }
diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc
index 73a9855..2f071b8 100644
--- a/poppler/GlobalParams.cc
+++ b/poppler/GlobalParams.cc
@@ -107,7 +107,7 @@
 
 #include "NameToUnicodeTable.h"
 #include "UnicodeMapTables.h"
-#include "UTF8.h"
+#include "UTF.h"
 
 #ifdef ENABLE_PLUGINS
 #  ifdef _WIN32
diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 767c518..ea72d35 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -245,7 +245,7 @@ poppler_include_HEADERS =	\
 	PSOutputDev.h		\
 	TextOutputDev.h		\
 	SecurityHandler.h	\
-	UTF8.h			\
+	UTF.h			\
 	XpdfPluginAPI.h		\
 	Sound.h
 nodist_poppler_include_HEADERS = poppler-config.h
@@ -311,6 +311,7 @@ libpoppler_la_SOURCES =		\
 	strtok_r.cpp		\
 	UnicodeMap.cc		\
 	UnicodeTypeTable.cc	\
+	UTF.cc                  \
 	ViewerPreferences.cc	\
 	XRef.cc			\
 	PSOutputDev.cc		\
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 531617d..79e4ae4 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2391,24 +2391,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
     w1 /= uLen;
     h1 /= uLen;
     for (i = 0; i < uLen; ++i) {
-      if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
-	if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
-	  /* next code is a low surrogate */
-	  Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
-	  i++;
-	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
-	} else {
-	    /* missing low surrogate
-	     replace it with REPLACEMENT CHARACTER (U+FFFD) */
-	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
-	}
-      } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
-	  /* invalid low surrogate
-	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
-	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
-      } else {
-	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
-      }
+      curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
     }
   }
   charPos += nBytes;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
new file mode 100644
index 0000000..b5f7d9f
--- /dev/null
+++ b/poppler/UTF.cc
@@ -0,0 +1,47 @@
+#include "goo/gmem.h"
+#include "UTF.h"
+
+int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
+{ // returns the UCS-4 length of utf16[0..utf16Len); fills *ucs4 when non-NULL
+  int i, n, len;
+  Unicode *u;
+
+  // pass 1: count output characters (a high+low surrogate pair counts once)
+  len = 0;
+  for (i = 0; i < utf16Len; i++) {
+    if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len &&
+        utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+      i++; /* valid surrogate pair: skip its low half */
+    }
+    len++;
+  }
+  if (ucs4 == NULL)
+    return len; // length-only query, nothing is allocated
+
+  u = (Unicode*)gmallocn(len, sizeof(Unicode)); // handed to the caller via *ucs4
+  n = 0;
+  // pass 2: convert, pairing surrogates exactly as pass 1 did so n ends at len
+  for (i = 0; i < utf16Len; i++) {
+    if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* high surrogate */
+      if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) {
+	/* next code is a low surrogate: combine the pair into one code point */
+	u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000;
+	++i;
+      } else {
+	/* missing low surrogate
+	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
+	u[n] = 0xfffd;
+      }
+    } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
+      /* unpaired low surrogate (no high surrogate before it)
+	 replace it with REPLACEMENT CHARACTER (U+FFFD) */
+      u[n] = 0xfffd;
+    } else {
+      u[n] = utf16[i];
+    }
+    n++;
+  }
+  *ucs4 = u;
+  return len;
+}
+
diff --git a/poppler/UTF.h b/poppler/UTF.h
new file mode 100644
index 0000000..d0ef5bc
--- /dev/null
+++ b/poppler/UTF.h
@@ -0,0 +1,103 @@
+//========================================================================
+//
+// UTF.h
+//
+// Copyright 2001-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+//========================================================================
+//
+// Modified under the Poppler project - http://poppler.freedesktop.org
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#ifndef UTF_H
+#define UTF_H
+
+#ifdef USE_GCC_PRAGMAS // a header declares the interface; "implementation" is for the .cc
+#pragma interface
+#endif
+
+#include "CharTypes.h"
+
+// Convert a UTF-16 string to a UCS-4 string; unpaired surrogates become U+FFFD.
+//   utf16      - array of UTF-16 code units, one per Unicode element
+//   utf16_len  - number of UTF-16 code units in utf16
+//   ucs4_out   - if not NULL, allocates and returns UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters (also when ucs4_out is NULL)
+int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
+
+// Encode u as UTF-8 into buf; returns bytes written, or 0 if buf is too small or u > 0x10ffff.
+static int mapUTF8(Unicode u, char *buf, int bufSize) {
+  if        (u <= 0x0000007f) { // 1 byte:  0xxxxxxx
+    if (bufSize < 1) {
+      return 0;
+    }
+    buf[0] = (char)u;
+    return 1;
+  } else if (u <= 0x000007ff) { // 2 bytes: 110xxxxx 10xxxxxx
+    if (bufSize < 2) {
+      return 0;
+    }
+    buf[0] = (char)(0xc0 + (u >> 6));
+    buf[1] = (char)(0x80 + (u & 0x3f));
+    return 2;
+  } else if (u <= 0x0000ffff) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if (bufSize < 3) {
+      return 0;
+    }
+    buf[0] = (char)(0xe0 + (u >> 12));
+    buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
+    buf[2] = (char)(0x80 + (u & 0x3f));
+    return 3;
+  } else if (u <= 0x0010ffff) { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    if (bufSize < 4) {
+      return 0;
+    }
+    buf[0] = (char)(0xf0 + (u >> 18));
+    buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
+    buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
+    buf[3] = (char)(0x80 + (u & 0x3f));
+    return 4;
+  } else { // above U+10FFFF: nothing is written
+    return 0;
+  }
+}
+// Encode u as big-endian UTF-16 into buf; returns bytes written (2 or 4), or 0 on failure.
+static int mapUCS2(Unicode u, char *buf, int bufSize) {
+  if (u <= 0xffff) { // fits in one 16-bit unit, written high byte first
+    if (bufSize < 2) {
+      return 0;
+     }
+    buf[0] = (char)((u >> 8) & 0xff);
+    buf[1] = (char)(u & 0xff);
+    return 2;
+  } else if (u < 0x110000) {
+    Unicode uu;
+
+    /* using surrogate pair */
+    if (bufSize < 4) {
+      return 0;
+    }
+    uu = ((u - 0x10000) >> 10) + 0xd800; // high surrogate: upper 10 bits of u - 0x10000
+    buf[0] = (char)((uu >> 8) & 0xff);
+    buf[1] = (char)(uu & 0xff);
+    uu = (u & 0x3ff)+0xdc00; // low surrogate: lower 10 bits
+    buf[2] = (char)((uu >> 8) & 0xff);
+    buf[3] = (char)(uu & 0xff);
+    return 4;
+  } else { // 0x110000 and above: nothing is written
+    return 0;
+  }
+}
+
+#endif
diff --git a/poppler/UTF8.h b/poppler/UTF8.h
deleted file mode 100644
index 34a07d4..0000000
--- a/poppler/UTF8.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//========================================================================
-//
-// UTF8.h
-//
-// Copyright 2001-2003 Glyph & Cog, LLC
-//
-//========================================================================
-
-//========================================================================
-//
-// Modified under the Poppler project - http://poppler.freedesktop.org
-//
-// All changes made under the Poppler project to this file are licensed
-// under GPL version 2 or later
-//
-// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
-//
-// To see a description of the changes please see the Changelog file that
-// came with your tarball or type make ChangeLog if you are building from git
-//
-//========================================================================
-
-static int mapUTF8(Unicode u, char *buf, int bufSize) {
-  if        (u <= 0x0000007f) {
-    if (bufSize < 1) {
-      return 0;
-    }
-    buf[0] = (char)u;
-    return 1;
-  } else if (u <= 0x000007ff) {
-    if (bufSize < 2) {
-      return 0;
-    }
-    buf[0] = (char)(0xc0 + (u >> 6));
-    buf[1] = (char)(0x80 + (u & 0x3f));
-    return 2;
-  } else if (u <= 0x0000ffff) {
-    if (bufSize < 3) {
-      return 0;
-    }
-    buf[0] = (char)(0xe0 + (u >> 12));
-    buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
-    buf[2] = (char)(0x80 + (u & 0x3f));
-    return 3;
-  } else if (u <= 0x0010ffff) {
-    if (bufSize < 4) {
-      return 0;
-    }
-    buf[0] = (char)(0xf0 + (u >> 18));
-    buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
-    buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
-    buf[3] = (char)(0x80 + (u & 0x3f));
-    return 4;
-  } else {
-    return 0;
-  }
-}
-
-static int mapUCS2(Unicode u, char *buf, int bufSize) {
-  if (u <= 0xffff) {
-    if (bufSize < 2) {
-      return 0;
-    }
-    buf[0] = (char)((u >> 8) & 0xff);
-    buf[1] = (char)(u & 0xff);
-    return 2;
-  } else if (u < 0x110000) {
-    Unicode uu;
-
-    /* using surrogate pair */
-    if (bufSize < 4) {
-      return 0;
-    }
-    uu = ((u - 0x10000) >> 10) + 0xd800;
-    buf[0] = (char)((uu >> 8) & 0xff);
-    buf[1] = (char)(uu & 0xff);
-    uu = (u & 0x3ff)+0xdc00;
-    buf[2] = (char)((uu >> 8) & 0xff);
-    buf[3] = (char)(uu & 0xff);
-    return 4;
-  } else {
-    return 0;
-  }
-}
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 17541a2..9e113eb 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -398,19 +398,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
     h1 /= uLen;
   }
   for (i = 0; i < uLen; ++i) {
-    Unicode u1 = u[i];
-    if (u1 >= 0xd800 && u1 <= 0xdbff && i < uLen) {
-      // surrogate pair
-      const Unicode u2 = u[i + 1];
-      if (u2 >= 0xdc00 && u2 <= 0xdfff) {
-	u1 = 0x10000 + ((u1 - 0xd800) << 10) + (u2 - 0xdc00);
-	
-	curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
-      }
-      ++i;
-    } else {
-      curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1);
-    }
+    curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
   }
 }
 
-- 
1.7.5.4