--- poppler/GfxFont.cc 2007/12/17 02:05:54 1.1
+++ poppler/GfxFont.cc 2007/12/17 22:06:06
@@ -96,6 +96,10 @@ static StdFontMapEntry stdFontMap[] = {
   { "TimesNewRomanPSMT,Italic", "Times-Italic" }
 };
 
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+                         GBool names, GBool ligatures,
+                         GBool numeric, GBool hex, GBool variants);
+
 //------------------------------------------------------------------------
 // GfxFont
 //------------------------------------------------------------------------
@@ -787,35 +791,24 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, cha
     }
   }
 
-  // pass 2: try to fill in the missing chars, looking for names of
-  // the form 'Axx', 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B'
-  // are any letters, 'xx' is two hex digits, and 'nn' is 2-4
-  // decimal digits
-  if (missing && globalParams->getMapNumericCharNames()) {
+  // construct the char code -> Unicode mapping object
+  ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
+
+  // pass 2: try to fill in the missing chars, looking for ligatures, numeric
+  // references and variants
+  if (missing) {
     for (code = 0; code < 256; ++code) {
       if ((charName = enc[code]) && !toUnicode[code] &&
           strcmp(charName, ".notdef")) {
-        n = strlen(charName);
-        code2 = -1;
-        if (hex && n == 3 && isalpha(charName[0]) &&
-            isxdigit(charName[1]) && isxdigit(charName[2])) {
-          sscanf(charName+1, "%x", &code2);
-        } else if (hex && n == 2 &&
-                   isxdigit(charName[0]) && isxdigit(charName[1])) {
-          sscanf(charName, "%x", &code2);
-        } else if (!hex && n >= 2 && n <= 4 &&
-                   isdigit(charName[0]) && isdigit(charName[1])) {
-          code2 = atoi(charName);
-        } else if (n >= 3 && n <= 5 &&
-                   isdigit(charName[1]) && isdigit(charName[2])) {
-          code2 = atoi(charName+1);
-        } else if (n >= 4 && n <= 6 &&
-                   isdigit(charName[2]) && isdigit(charName[3])) {
-          code2 = atoi(charName+2);
-        }
-        if (code2 >= 0 && code2 <= 0xff) {
-          toUnicode[code] = (Unicode)code2;
-        }
+        if ((n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf),
+                               gFalse, // don't check simple names (pass 1)
+                               gTrue, // do check ligatures
+                               globalParams->getMapNumericCharNames(),
+                               hex,
+                               gTrue))) // do check variants
+          ctu->setMapping((CharCode)code, uBuf, n);
+        else
+          error(-1, "Could not parse charref for nameToUnicode: %s", charName);
       }
     }
 
@@ -829,9 +822,6 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, cha
     }
   }
 
-  // construct the char code -> Unicode mapping object
-  ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
-
   // merge in a ToUnicode CMap, if there is one -- this overwrites
   // existing entries in ctu, i.e., the ToUnicode CMap takes
   // precedence, but the other encoding info is allowed to fill in any
@@ -961,6 +951,164 @@ Gfx8BitFont::~Gfx8BitFont() {
   }
 }
 
+// This function is in part a derived work of the Adobe Glyph Mapping
+// Convention: http://www.adobe.com/devnet/opentype/archives/glyph.html
+// Algorithmic comments are excerpted from that document to aid
+// maintainability.
+//
+// Maps the glyph name <charName> to Unicode values, which are written to
+// <uBuf> (room for <uLen> values).  Returns the number of values written,
+// or 0 if the name could not be mapped.  The flags select which
+// interpretations are tried:
+//   names     - look the name up with globalParams->mapNameToUnicode()
+//   ligatures - split the name at underscores and map each component
+//   numeric   - accept the 'Axx', 'xx', 'Ann', 'ABnn' and 'nn' forms
+//   hex       - with <numeric>: if set, 'Axx' and 'xx' are read as hex;
+//               if clear, 'nn' is read as decimal instead
+//   variants  - ignore everything from the first period onwards
+// The "uniXXXX" and "uXXXX" forms belong to the glyph mapping convention,
+// so they are recognized regardless of <numeric> and are tried before the
+// numeric forms; otherwise a name such as "u20AC" would be taken for the
+// 'Ann' form and read as the decimal number 20.
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+                         GBool names, GBool ligatures,
+                         GBool numeric, GBool hex, GBool variants)
+{
+  if (uLen <= 0) {
+    error(-1, "Zero-length output buffer (recursion overflow?) in "
+          "nameToUnicode: %s", charName);
+    return 0;
+  }
+  // Step 1: drop all the characters from the glyph name starting with the
+  // first occurrence of a period (U+002E FULL STOP), if any.
+  if (variants) {
+    char *var_part = strchr(charName, '.');
+    if (var_part == charName) {
+      return 0; // .notdef or similar
+    } else if (var_part != NULL) {
+      // parse names of the form 7.oldstyle, P.swash, s.sc, etc.
+      // Copy with copyString and cut the copy at the period: strndup is
+      // not available on every platform, and copyString is what the
+      // ligature case below already pairs with gfree.
+      char *main_part = copyString(charName);
+      main_part[var_part - charName] = '\0';
+      GBool namesRecurse = gTrue, variantsRecurse = gFalse;
+      int n = parseCharName(main_part, uBuf, uLen, namesRecurse, ligatures,
+                            numeric, hex, variantsRecurse);
+      gfree(main_part);
+      return n;
+    }
+  }
+  // Step 2: split the remaining string into a sequence of components, using
+  // underscore (U+005F LOW LINE) as the delimiter.
+  if (ligatures && strchr(charName, '_')) {
+    // parse names of the form A_a (e.g. f_i, T_h, l_quotesingle)
+    char *lig_part, *lig_end, *lig_copy;
+    int n = 0, m;
+    lig_part = lig_copy = copyString(charName);
+    do {
+      if ((lig_end = strchr(lig_part, '_')))
+        *lig_end = '\0';
+      if (lig_part[0] != '\0') {
+        GBool namesRecurse = gTrue, ligaturesRecurse = gFalse;
+        if ((m = parseCharName(lig_part, uBuf + n, uLen - n, namesRecurse,
+                               ligaturesRecurse, numeric, hex, variants)))
+          n += m;
+        else
+          error(-1, "Could not parse ligature component in charref for "
+                "nameToUnicode: %s", charName);
+      }
+      // lig_end is NULL after the last component; only step past a
+      // delimiter that was actually found
+      if (lig_end)
+        lig_part = lig_end + 1;
+    } while (lig_end && n < uLen);
+    gfree(lig_copy);
+    return n;
+  }
+  // Step 3: map each component to a character string according to the
+  // procedure below, and concatenate those strings; the result is the
+  // character string to which the glyph name is mapped.
+  // 3.1. if the font is Zapf Dingbats (PostScript FontName ZapfDingbats), and
+  // the component is in the ZapfDingbats list, then map it to the
+  // corresponding character in that list.
+  // 3.2. otherwise, if the component is in the Adobe Glyph List, then map it
+  // to the corresponding character in that list.
+  if (names && (uBuf[0] = globalParams->mapNameToUnicode(charName))) {
+    return 1;
+  }
+  unsigned int n = strlen(charName);
+  // 3.3. otherwise, if the component is of the form "uni" (U+0075 U+006E
+  // U+0069) followed by a sequence of uppercase hexadecimal digits (0 .. 9,
+  // A .. F, i.e. U+0030 .. U+0039, U+0041 .. U+0046), the length of that
+  // sequence is a multiple of four, and each group of four digits represents
+  // a number in the set {0x0000 .. 0xD7FF, 0xE000 .. 0xFFFF}, then interpret
+  // each such number as a Unicode scalar value and map the component to the
+  // string made of those scalar values. Note that the range and digit length
+  // restrictions mean that the "uni" prefix can be used only with Unicode
+  // values from the Basic Multilingual Plane (BMP).
+  if (n >= 7 && (n % 4) == 3 && !strncmp(charName, "uni", 3)) {
+    int i; // signed, like uLen, so the bound check compares like types
+    unsigned int m;
+    for (i = 0, m = 3; i < uLen && m < n; m += 4) {
+      if (isxdigit(charName[m]) && isxdigit(charName[m + 1]) &&
+          isxdigit(charName[m + 2]) && isxdigit(charName[m + 3])) {
+        unsigned int u;
+        sscanf(charName + m, "%4x", &u);
+        if (u <= 0xD7FF || (0xE000 <= u && u <= 0xFFFF)) {
+          uBuf[i++] = u;
+        }
+      }
+    }
+    return i;
+  }
+  // 3.4. otherwise, if the component is of the form "u" (U+0075) followed by
+  // a sequence of four to six uppercase hexadecimal digits {0 .. 9, A .. F}
+  // (U+0030 .. U+0039, U+0041 .. U+0046), and those digits represent a
+  // number in {0x0000 .. 0xD7FF, 0xE000 .. 0x10FFFF}, then interpret this
+  // number as a Unicode scalar value and map the component to the string
+  // made of this scalar value.
+  if (n >= 5 && n <= 7 && charName[0] == 'u' && isxdigit(charName[1]) &&
+      isxdigit(charName[2]) && isxdigit(charName[3]) && isxdigit(charName[4])
+      && (n <= 5 || isxdigit(charName[5]))
+      && (n <= 6 || isxdigit(charName[6]))) {
+    unsigned int u;
+    sscanf(charName + 1, "%x", &u);
+    if (u <= 0xD7FF || (0xE000 <= u && u <= 0x10FFFF)) {
+      uBuf[0] = u;
+      return 1;
+    }
+  }
+  if (numeric) {
+    // Not in Adobe Glyph Mapping convention: look for names of the form 'Axx',
+    // 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B' are any letters, 'xx' is
+    // two hex digits, and 'nn' is 2-4 decimal digits
+    if (hex && n == 3 && isalpha(charName[0]) &&
+        isxdigit(charName[1]) && isxdigit(charName[2])) {
+      sscanf(charName+1, "%x", (unsigned int *)uBuf);
+      return 1;
+    } else if (hex && n == 2 &&
+               isxdigit(charName[0]) && isxdigit(charName[1])) {
+      sscanf(charName, "%x", (unsigned int *)uBuf);
+      return 1;
+    } else if (!hex && n >= 2 && n <= 4 &&
+               isdigit(charName[0]) && isdigit(charName[1])) {
+      uBuf[0] = (Unicode)atoi(charName);
+      return 1;
+    } else if (n >= 3 && n <= 5 &&
+               isdigit(charName[1]) && isdigit(charName[2])) {
+      uBuf[0] = (Unicode)atoi(charName+1);
+      return 1;
+    } else if (n >= 4 && n <= 6 &&
+               isdigit(charName[2]) && isdigit(charName[3])) {
+      uBuf[0] = (Unicode)atoi(charName+2);
+      return 1;
+    }
+  }
+  // 3.5. otherwise, map the component to the empty string
+  return 0;
+}
+
 int Gfx8BitFont::getNextChar(char *s, int len, CharCode *code,
                              Unicode *u, int uSize, int *uLen,
                              double *dx, double *dy, double *ox, double *oy) {