From 79c536130d211a6f25212c3526bc8e377a0722c4 Mon Sep 17 00:00:00 2001 From: Jason Crain Date: Fri, 28 Feb 2014 23:52:30 -0600 Subject: [PATCH] Limit numeric parsing of character names When setting up the CharCodeToUnicode map, parse character names like xx or Axx for hex or decimal codes only if the font's Differences array meets the following criteria: * All sequences must start between character codes 0 and 5. * All names must successfully parse numerically. If all other methods fail, use the character code as a fallback. --- poppler/GfxFont.cc | 143 +++++++++++++++++++++++++++++++++++------------- poppler/GlobalParams.cc | 2 +- 2 files changed, 106 insertions(+), 39 deletions(-) diff --git a/poppler/GfxFont.cc b/poppler/GfxFont.cc index c54181b..3160d8a 100644 --- a/poppler/GfxFont.cc +++ b/poppler/GfxFont.cc @@ -916,6 +916,94 @@ char *GfxFont::readEmbFontFile(XRef *xref, int *len) { // Gfx8BitFont //------------------------------------------------------------------------ +// Parse character names of the form 'Axx', 'xx', 'Ann', 'ABnn', or +// 'nn', where 'A' and 'B' are any letters, 'xx' is two hex digits, +// and 'nn' is decimal digits. +static GBool parseNumericName(char *s, GBool hex, unsigned int *u) { + char *endptr; + + // Strip leading alpha characters. + if (hex) { + int n = 0; + + // Get string length while ignoring junk at end. + while (isalnum(s[n])) + ++n; + + // Only 2 hex characters with optional leading alpha is allowed. + if (n == 3 && isalpha(*s)) { + ++s; + } else if (n != 2) { + return gFalse; + } + } else { + // Strip up to two alpha characters. + for (int i = 0; i < 2 && isalpha(*s); ++i) + ++s; + } + + int v = strtol(s, &endptr, hex ? 16 : 10); + + if (endptr == s) + return gFalse; + + // Skip trailing junk characters. 
+ while (*endptr != '\0' && !isalnum(*endptr)) + ++endptr; + + if (*endptr == '\0') { + if (u) + *u = v; + return gTrue; + } + return gFalse; +} + +// Returns gTrue if the font has character names like xx or Axx which +// should be parsed for hex or decimal values. +static GBool testForNumericNames(Dict *fontDict, GBool hex) { + Object enc, diff, obj; + GBool numeric = gTrue; + + fontDict->lookup("Encoding", &enc); + if (!enc.isDict()) { + enc.free(); + return gFalse; + } + + enc.dictLookup("Differences", &diff); + enc.free(); + if (!diff.isArray()) { + diff.free(); + return gFalse; + } + + for (int i = 0; i < diff.arrayGetLength(); ++i) { + diff.arrayGet(i, &obj); + if (obj.isInt()) { + // All sequences must start between character codes 0 and 5. + if (obj.getInt() > 5) { + numeric = gFalse; + break; + } + } else if (obj.isName()) { + // All character names must successfully parse. + if (!parseNumericName(obj.getName(), hex, NULL)) { + numeric = gFalse; + break; + } + } else { + numeric = gFalse; + break; + } + obj.free(); + } + + diff.free(); + obj.free(); + return numeric; +} + Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA, GfxFontType typeA, Ref embFontIDA, Dict *fontDict): GfxFont(tagA, idA, nameA, typeA, embFontIDA) { @@ -930,6 +1018,7 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA int code; char *charName; GBool missing, hex; + GBool numeric; Unicode toUnicode[256]; CharCodeToUnicode *utu, *ctu2; Unicode uBuf[8]; @@ -1244,9 +1333,9 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA (charName[2] >= 'A' && charName[2] <= 'F'))) || (strlen(charName) == 2 && isxdigit(charName[0]) && isxdigit(charName[1]) && - ((charName[0] >= 'a' && charName[0] <= 'f') || - (charName[0] >= 'A' && charName[0] <= 'F') || - (charName[1] >= 'a' && charName[1] <= 'f') || + // Only check idx 1 to avoid misidentifying a decimal + // number like a0 + ((charName[1] >= 'a' && 
charName[1] <= 'f') || (charName[1] >= 'A' && charName[1] <= 'F')))) { hex = gTrue; } @@ -1257,6 +1346,8 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA } } + numeric = testForNumericNames(fontDict, hex); + // construct the char code -> Unicode mapping object ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode); @@ -1280,22 +1371,18 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA && (n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf), gFalse, // don't check simple names (pass 1) gTrue, // do check ligatures - globalParams->getMapNumericCharNames(), + numeric, hex, gTrue))) { // do check variants ctu->setMapping((CharCode)code, uBuf, n); - } else if (globalParams->getMapUnknownCharNames()) { - // if the 'mapUnknownCharNames' flag is set, do a simple pass-through - // mapping for unknown character names - if (charName && charName[0]) { - for (n = 0; n < (int)(sizeof(uBuf)/sizeof(*uBuf)); ++n) - if (!(uBuf[n] = charName[n])) - break; - ctu->setMapping((CharCode)code, uBuf, n); - } else { + continue; + } + + // if the 'mapUnknownCharNames' flag is set, do a simple pass-through + // mapping for unknown character names + if (globalParams->getMapUnknownCharNames()) { uBuf[0] = code; ctu->setMapping((CharCode)code, uBuf, 1); - } } } } @@ -1495,7 +1582,7 @@ static int parseCharName(char *charName, Unicode *uBuf, int uLen, if (names && (uBuf[0] = globalParams->mapNameToUnicodeText(charName))) { return 1; } - if (numeric) { + if (globalParams->getMapNumericCharNames()) { unsigned int n = strlen(charName); // 3.3. otherwise, if the component is of the form "uni" (U+0075 U+006E // U+0069) followed by a sequence of uppercase hexadecimal digits (0 .. 
9, @@ -1538,30 +1625,10 @@ static int parseCharName(char *charName, Unicode *uBuf, int uLen, return 1; } } - // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx', - // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is - // two hex digits, and 'nn' is 2-4 decimal digits - if (hex && n == 3 && isalpha(charName[0]) && - isxdigit(charName[1]) && isxdigit(charName[2])) { - sscanf(charName+1, "%x", (unsigned int *)uBuf); - return 1; - } else if (hex && n == 2 && - isxdigit(charName[0]) && isxdigit(charName[1])) { - sscanf(charName, "%x", (unsigned int *)uBuf); + // Not in Adobe Glyph Mapping convention: look for names like xx + // or Axx and parse for hex or decimal values. + if (numeric && parseNumericName(charName, hex, uBuf)) return 1; - } else if (!hex && n >= 2 && n <= 4 && - isdigit(charName[0]) && isdigit(charName[1])) { - uBuf[0] = (Unicode)atoi(charName); - return 1; - } else if (n >= 3 && n <= 5 && - isdigit(charName[1]) && isdigit(charName[2])) { - uBuf[0] = (Unicode)atoi(charName+1); - return 1; - } else if (n >= 4 && n <= 6 && - isdigit(charName[2]) && isdigit(charName[3])) { - uBuf[0] = (Unicode)atoi(charName+2); - return 1; - } } // 3.5. otherwise, map the component to the empty string return 0; diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc index 01a3157..e92889c 100644 --- a/poppler/GlobalParams.cc +++ b/poppler/GlobalParams.cc @@ -633,7 +633,7 @@ GlobalParams::GlobalParams(const char *customPopplerDataDir) minLineWidth = 0.0; overprintPreview = gFalse; mapNumericCharNames = gTrue; - mapUnknownCharNames = gFalse; + mapUnknownCharNames = gTrue; printCommands = gFalse; profileCommands = gFalse; errQuiet = gFalse; -- 1.9.0