From 5a86cff1508065ba72a15651fe38de68fb2b17a6 Mon Sep 17 00:00:00 2001
From: Adrian Johnson <ajohnson@redneon.com>
Date: Thu, 8 Mar 2012 21:21:07 +1030
Subject: [PATCH 2/4] Move text string to unicode conversion into a separate
 function

This also ensures UTF-16 ActualText strings are converted to UCS-4
before calling addChar.
---
 goo/GooString.cc         |    2 +-
 poppler/TextOutputDev.cc |   32 ++++----------------------------
 poppler/UTF.cc           |   34 ++++++++++++++++++++++++++++++++++
 poppler/UTF.h            |    8 ++++++++
 utils/pdfinfo.cc         |   37 ++++++-------------------------------
 5 files changed, 53 insertions(+), 60 deletions(-)

diff --git a/goo/GooString.cc b/goo/GooString.cc
index fc78d90..2bc85e7 100644
--- a/goo/GooString.cc
+++ b/goo/GooString.cc
@@ -854,7 +854,7 @@ int GooString::cmpN(const char *sA, int n) const {
 
 GBool GooString::hasUnicodeMarker(void)
 {
-    return (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff;
+  return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff; // length guard: test s[0]/s[1] only when 2+ bytes
 }
 
 GooString *GooString::sanitizedName(GBool psmode)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 79e4ae4..332d7ee 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -62,7 +62,7 @@
 #include "TextOutputDev.h"
 #include "Page.h"
 #include "Annot.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -5223,41 +5223,17 @@ void ActualText::end(GfxState *state) {
   // extents of all the glyphs inside the span
 
   if (actualTextNBytes) {
-    char *uniString = NULL;
     Unicode *uni;
-    int length, i;
-
-    if (!actualText->hasUnicodeMarker()) {
-      if (actualText->getLength() > 0) {
-        //non-unicode string -- assume pdfDocEncoding and
-        //try to convert to UTF16BE
-        uniString = pdfDocEncodingToUTF16(actualText, &length);
-      } else {
-        length = 0;
-      }
-    } else {
-      uniString = actualText->getCString();
-      length = actualText->getLength();
-    }
-
-    if (length < 3)
-      length = 0;
-    else
-      length = length/2 - 1;
-    uni = new Unicode[length];
-    for (i = 0 ; i < length; i++)
-      uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff);
+    int length;
 
     // now that we have the position info for all of the text inside
     // the marked content span, we feed the "ActualText" back through
     // text->addChar()
+    length = TextStringToUCS4(actualText, &uni);
     text->addChar(state, actualTextX0, actualTextY0,
                   actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
                   0, actualTextNBytes, uni, length);
-
-    delete [] uni;
-    if (!actualText->hasUnicodeMarker())
-      delete [] uniString;
+    gfree(uni); // len = 0 ?? XXXXXXXX
   }
 
   delete actualText;
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index b5f7d9f..0642d04 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -1,4 +1,5 @@
 #include "goo/gmem.h"
+#include "PDFDocEncoding.h"
 #include "UTF.h"
 
 int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
@@ -45,3 +46,36 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4)
   return len;
 }
 
+// Convert a PDF text string to UCS-4.  A string that starts with the
+// 0xFE 0xFF marker is decoded as big-endian UTF-16 (marker skipped);
+// any other string is mapped byte by byte through pdfDocEncoding.
+//   textStr - string to convert
+//   ucs4    - always written: NULL when 0 is returned, otherwise a
+//             buffer that the caller releases with gfree
+//   returns number of UCS-4 characters stored in *ucs4
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4)
+{
+  int i, len = textStr->getLength();
+  const char *s = textStr->getCString();
+  Unicode *u = NULL;
+
+  if (len > 0 && textStr->hasUnicodeMarker()) {
+    len = len/2 - 1; // UTF-16 code units after the 2-byte marker
+    if (len > 0) {
+      Unicode *utf16 = new Unicode[len];
+      for (i = 0 ; i < len; i++) {
+        utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff);
+      }
+      len = UTF16toUCS4(utf16, len, &u);
+      delete [] utf16; // array form, to match new[]
+    }
+  } else if (len > 0) {
+    u = (Unicode*)gmallocn(len, sizeof(Unicode));
+    for (i = 0 ; i < len; i++) {
+      // & 0xff: char may be signed; keep the table index in 0..255
+      u[i] = pdfDocEncoding[s[i] & 0xff];
+    }
+  }
+  *ucs4 = u;
+  return len;
+}
diff --git a/poppler/UTF.h b/poppler/UTF.h
index d0ef5bc..ec51e5a 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -27,6 +27,7 @@
 #pragma implementation
 #endif
 
+#include "goo/GooString.h"
 #include "CharTypes.h"
 
 // Convert a UTF-16 string to a UCS-4
@@ -36,6 +37,13 @@
 //   returns number of UCS-4 characters
 int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out);
 
+// Convert a PDF Text String to UCS-4
+//   textStr    - PDF text string
+//   ucs4       - if the number of UCS-4 characters is > 0, allocates and
+//                returns UCS-4 string. Free with gfree.
+//   returns number of UCS-4 characters
+int TextStringToUCS4(GooString *textStr, Unicode **ucs4);
+
 
 static int mapUTF8(Unicode u, char *buf, int bufSize) {
   if        (u <= 0x0000007f) {
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index cdc5375..d1c077b 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -48,7 +48,7 @@
 #include "PDFDocFactory.h"
 #include "CharTypes.h"
 #include "UnicodeMap.h"
-#include "PDFDocEncoding.h"
+#include "UTF.h"
 #include "Error.h"
 #include "DateInfo.h"
 
@@ -379,41 +379,16 @@ static void printInfoString(Dict *infoDict, const char *key, const char *text,
 			    UnicodeMap *uMap) {
   Object obj;
   GooString *s1;
-  GBool isUnicode;
-  Unicode u, u2;
+  Unicode *u;
   char buf[8];
-  int i, n;
+  int i, n, len;
 
   if (infoDict->lookup(key, &obj)->isString()) {
     fputs(text, stdout);
     s1 = obj.getString();
-    if ((s1->getChar(0) & 0xff) == 0xfe &&
-	(s1->getChar(1) & 0xff) == 0xff) {
-      isUnicode = gTrue;
-      i = 2;
-    } else {
-      isUnicode = gFalse;
-      i = 0;
-    }
-    while (i < obj.getString()->getLength()) {
-      if (isUnicode) {
-	u = ((s1->getChar(i) & 0xff) << 8) |
-	    (s1->getChar(i+1) & 0xff);
-	i += 2;
-	if (u >= 0xd800 && u <= 0xdbff && i < obj.getString()->getLength()) {
-	  // surrogate pair
-	  u2 = ((s1->getChar(i) & 0xff) << 8) |
-	    (s1->getChar(i+1) & 0xff);
-	  i += 2;
-	  if (u2 >= 0xdc00 && u2 <= 0xdfff) {
-	    u = 0x10000 + ((u - 0xd800) << 10) + (u2 - 0xdc00);
-	  }
-	}
-      } else {
-	u = pdfDocEncoding[s1->getChar(i) & 0xff];
-	++i;
-      }
-      n = uMap->mapUnicode(u, buf, sizeof(buf));
+    len = TextStringToUCS4(s1, &u);
+    for (i = 0; i < len; i++) {
+      n = uMap->mapUnicode(u[i], buf, sizeof(buf));
       fwrite(buf, 1, n, stdout);
     }
     fputc('\n', stdout);
-- 
1.7.5.4