From ae1109aa2c2c62f35da5cd2972185fca1943ec1b Mon Sep 17 00:00:00 2001 From: Jason Crain Date: Mon, 12 Aug 2013 22:32:41 -0500 Subject: [PATCH] Normalize more characters in font Unicode map Move normalization pass to run after the CMap is read. Additionally normalize some greek letters and presentation forms blocks. --- poppler/GfxFont.cc | 65 +++++++++++++++++++++++++++++++++++-------- poppler/UnicodeTypeTable.cc | 8 ------ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/poppler/GfxFont.cc b/poppler/GfxFont.cc index ea22af8..28ab641 100644 --- a/poppler/GfxFont.cc +++ b/poppler/GfxFont.cc @@ -167,6 +167,7 @@ static const char *base14SubstFonts[14] = { static int parseCharName(char *charName, Unicode *uBuf, int uLen, GBool names, GBool ligatures, GBool numeric, GBool hex, GBool variants); +static void normalizeCTU(CharCodeToUnicode *ctu); //------------------------------------------------------------------------ @@ -1253,17 +1254,6 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA // construct the char code -> Unicode mapping object ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode); - // pass 1a: Expand ligatures in the Alphabetic Presentation Form - // block (eg "fi", "ffi") to normal form - for (code = 0; code < 256; ++code) { - if (unicodeIsAlphabeticPresentationForm(toUnicode[code])) { - Unicode *normalized = unicodeNormalizeNFKC(&toUnicode[code], 1, &len, NULL); - if (len > 1) - ctu->setMapping((CharCode)code, normalized, len); - gfree(normalized); - } - } - // pass 2: try to fill in the missing chars, looking for ligatures, numeric // references and variants if (missing) { @@ -1300,6 +1290,10 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA // holes readToUnicodeCMap(fontDict, 8, ctu); + // expand presentation form blocks and convert math symbols to greek + // letters + normalizeCTU(ctu); + // look for a Unicode-to-Unicode mapping if (name && (utu = 
globalParams->getUnicodeToUnicode(name))) {
     Unicode *uAux;
@@ -1424,6 +1418,55 @@ Gfx8BitFont::~Gfx8BitFont() {
   }
 }
 
+// Apply NFKC normalization to selected characters (presentation forms and
+// a few compatibility symbols) in the mappings of char codes 0..255 of ctu.
+// A mapping is rewritten only when normalization actually changed it;
+// unchanged and unmapped codes are left untouched.
+static void normalizeCTU(CharCodeToUnicode *ctu) {
+  int normalizedSize = 16;
+  Unicode *normalized = (Unicode *)gmallocn(normalizedSize, sizeof(Unicode));
+
+  for (int code = 0; code < 256; ++code) {
+    Unicode *u;
+    int uLen = ctu->mapToUnicode(code, &u);
+    int normalizedLen = 0;
+    bool changed = false;
+
+    for (int i = 0; i < uLen; ++i) {
+      // NOTE(review): U+0152, U+0153, U+2206, U+220A, U+220F and U+2211 seem
+      // to have no compatibility decomposition in UnicodeData -- confirm.
+      bool normalize = u[i] == 0x00B5 // µ
+          || u[i] == 0x0152 // Œ
+          || u[i] == 0x0153 // œ
+          || u[i] == 0x2126 // Ω
+          || u[i] == 0x2206 // ∆
+          || u[i] == 0x220A // ∊
+          || u[i] == 0x220F // ∏
+          || u[i] == 0x2211 // ∑
+          || (u[i] >= 0xFB00 && u[i] <= 0xFB4F) // Alphabetic Presentation Forms
+          || (u[i] >= 0xFB50 && u[i] <= 0xFDFF) // Arabic Presentation Forms-A
+          || (u[i] >= 0xFE70 && u[i] <= 0xFEFF); // Arabic Presentation Forms-B
+      int uNormLen = 1;
+      Unicode *uNorm = normalize ? unicodeNormalizeNFKC(&u[i], 1, &uNormLen, NULL) : &u[i];
+      changed = changed || uNormLen != 1 || uNorm[0] != u[i];
+      if (normalizedSize < normalizedLen + uNormLen) {
+        // grow in steps of 16, or straight to the required size if larger
+        normalizedSize += 16;
+        if (normalizedSize < normalizedLen + uNormLen)
+          normalizedSize = normalizedLen + uNormLen;
+        normalized = (Unicode*)greallocn(normalized, normalizedSize, sizeof(Unicode));
+      }
+      memcpy(normalized + normalizedLen, uNorm, uNormLen * sizeof(Unicode));
+      normalizedLen += uNormLen;
+      if (normalize)
+        gfree(uNorm); // only the NFKC result is owned by us; u belongs to ctu
+    }
+    if (changed)
+      ctu->setMapping((CharCode)code, normalized, normalizedLen);
+  }
+  gfree(normalized);
+}
+
 // This function is in part a derived work of the Adobe Glyph Mapping
 // Convention: http://www.adobe.com/devnet/opentype/archives/glyph.html
 // Algorithmic comments are excerpted from that document to aid
diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc
index 721af9d..66823c4 100644
--- a/poppler/UnicodeTypeTable.cc
+++ 
b/poppler/UnicodeTypeTable.cc @@ -963,14 +963,6 @@ GBool unicodeTypeAlphaNum(Unicode c) { return t == 'L' || t == 'R' || t == '#'; } -#define UNICODE_ALPHABETIC_PRESENTATION_BLOCK_BEGIN 0xFB00 -#define UNICODE_ALPHABETIC_PRESENTATION_BLOCK_END 0xFB4F - -GBool unicodeIsAlphabeticPresentationForm(Unicode c) { - return c >= UNICODE_ALPHABETIC_PRESENTATION_BLOCK_BEGIN - && c <= UNICODE_ALPHABETIC_PRESENTATION_BLOCK_END; -} - Unicode unicodeToUpper(Unicode c) { int i; -- 1.7.10.4