From ae1109aa2c2c62f35da5cd2972185fca1943ec1b Mon Sep 17 00:00:00 2001
From: Jason Crain <jason@aquaticape.us>
Date: Mon, 12 Aug 2013 22:32:41 -0500
Subject: [PATCH] Normalize more characters in font Unicode map

Move the normalization pass so that it runs after the ToUnicode CMap is
read.  Additionally normalize some symbols that resemble Greek letters
and the Arabic presentation form blocks.
---
 poppler/GfxFont.cc          |   65 +++++++++++++++++++++++++++++++++++--------
 poppler/UnicodeTypeTable.cc |    8 ------
 2 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/poppler/GfxFont.cc b/poppler/GfxFont.cc
index ea22af8..28ab641 100644
--- a/poppler/GfxFont.cc
+++ b/poppler/GfxFont.cc
@@ -167,6 +167,7 @@ static const char *base14SubstFonts[14] = {
 static int parseCharName(char *charName, Unicode *uBuf, int uLen,
 			 GBool names, GBool ligatures, 
 			 GBool numeric, GBool hex, GBool variants);
+static void normalizeCTU(CharCodeToUnicode *ctu);
 
 //------------------------------------------------------------------------
 
@@ -1253,17 +1254,6 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA
   // construct the char code -> Unicode mapping object
   ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
 
-  // pass 1a: Expand ligatures in the Alphabetic Presentation Form
-  // block (eg "fi", "ffi") to normal form
-  for (code = 0; code < 256; ++code) {
-    if (unicodeIsAlphabeticPresentationForm(toUnicode[code])) {
-      Unicode *normalized = unicodeNormalizeNFKC(&toUnicode[code], 1, &len, NULL);
-      if (len > 1)
-        ctu->setMapping((CharCode)code, normalized, len);
-      gfree(normalized);
-    }
-  }
-
   // pass 2: try to fill in the missing chars, looking for ligatures, numeric
   // references and variants
   if (missing) {
@@ -1300,6 +1290,10 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA
   // holes
   readToUnicodeCMap(fontDict, 8, ctu);
 
+  // expand presentation form blocks and convert math symbols to greek
+  // letters
+  normalizeCTU(ctu);
+
   // look for a Unicode-to-Unicode mapping
   if (name && (utu = globalParams->getUnicodeToUnicode(name))) {
     Unicode *uAux;
@@ -1424,6 +1418,55 @@ Gfx8BitFont::~Gfx8BitFont() {
   }
 }
 
+// Rewrites the Unicode mapping of each 8-bit char code in ctu, replacing
+// selected compatibility characters (listed below) with their NFKC
+// normalization.  Codes whose mapping is empty or unaffected are left alone.
+static void normalizeCTU(CharCodeToUnicode *ctu) {
+  int normalizedSize = 16;
+  Unicode *normalized = (Unicode *)gmallocn(normalizedSize, sizeof(Unicode));
+
+  for (int code = 0; code < 256; ++code) {
+    Unicode *u;
+    int uLen = ctu->mapToUnicode(code, &u);
+    int normalizedLen = 0;
+    GBool changed = gFalse;
+    for (int i = 0; i < uLen; ++i) {
+      Unicode *uNorm = &u[i]; // by default the character is copied as is
+      int uNormLen = 1;
+      if (u[i] == 0x00B5 // µ
+	  || u[i] == 0x0152 // Œ
+	  || u[i] == 0x0153 // œ
+	  || u[i] == 0x2126 // Ω
+	  || u[i] == 0x2206 // ∆
+	  || u[i] == 0x220A // ∊
+	  || u[i] == 0x220F // ∏
+	  || u[i] == 0x2211 // ∑
+	  || (u[i] >= 0xFB00 && u[i] <= 0xFB4F) // Alphabetic Presentation Forms
+	  || (u[i] >= 0xFB50 && u[i] <= 0xFDFF) // Arabic Presentation Forms-A
+	  || (u[i] >= 0xFE70 && u[i] <= 0xFEFF)) { // Arabic Presentation Forms-B
+	uNorm = unicodeNormalizeNFKC(&u[i], 1, &uNormLen, NULL);
+	// a result identical to the input does not count as a change
+	changed = changed || uNormLen != 1 || uNorm[0] != u[i];
+      }
+      if (normalizedSize < normalizedLen + uNormLen) {
+	// grow by 16 entries, or straight to the required size if larger
+	normalizedSize += 16;
+	if (normalizedSize < normalizedLen + uNormLen)
+	  normalizedSize = normalizedLen + uNormLen;
+	normalized = (Unicode*)greallocn(normalized, normalizedSize, sizeof(Unicode));
+      }
+      memcpy(normalized + normalizedLen, uNorm, uNormLen * sizeof(Unicode));
+      normalizedLen += uNormLen;
+      if (uNorm != &u[i]) // only buffers returned by the normalizer are freed
+	gfree(uNorm);
+    }
+    // avoid storing empty or identical mappings back into the table
+    if (changed)
+      ctu->setMapping((CharCode)code, normalized, normalizedLen);
+  }
+  gfree(normalized);
+}
+
 // This function is in part a derived work of the Adobe Glyph Mapping
 // Convention: http://www.adobe.com/devnet/opentype/archives/glyph.html
 // Algorithmic comments are excerpted from that document to aid
diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc
index 721af9d..66823c4 100644
--- a/poppler/UnicodeTypeTable.cc
+++ b/poppler/UnicodeTypeTable.cc
@@ -963,14 +963,6 @@ GBool unicodeTypeAlphaNum(Unicode c) {
   return t == 'L' || t == 'R' || t == '#';
 }
 
-#define UNICODE_ALPHABETIC_PRESENTATION_BLOCK_BEGIN 0xFB00
-#define UNICODE_ALPHABETIC_PRESENTATION_BLOCK_END   0xFB4F
-
-GBool unicodeIsAlphabeticPresentationForm(Unicode c) {
-  return c >= UNICODE_ALPHABETIC_PRESENTATION_BLOCK_BEGIN
-    && c <= UNICODE_ALPHABETIC_PRESENTATION_BLOCK_END;
-}
-
 Unicode unicodeToUpper(Unicode c) {
   int i;
 
-- 
1.7.10.4