--- poppler/GfxFont.cc 2006/11/14 22:07:55 1.1
+++ poppler/GfxFont.cc 2006/12/09 19:48:20
@@ -97,6 +97,23 @@ static StdFontMapEntry stdFontMap[] = {
   { "TimesNewRomanPSMT,Italic", "Times-Italic" }
 };
 
+// Glyph-name suffixes (the part after the last '.') that
+// isVariantGlyphForm() accepts as variant forms of a base glyph.
+static const char *variantGlyphForms[] = {
+  "alt1",      // alternative forms
+  "alt2",
+  "alt3",
+  "oldstyle",  // text figures
+  "sc",        // small caps
+  "swash",     // swash caps
+  "superior"   // special form for superscript
+};
+
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+                         GBool names, GBool ligatures,
+                         GBool numeric, GBool hex, GBool variants);
+static GBool isVariantGlyphForm(char *var_part);
+
 //------------------------------------------------------------------------
 // GfxFont
 //------------------------------------------------------------------------
@@ -773,42 +790,28 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, cha
     }
   }
 
-  // pass 2: try to fill in the missing chars, looking for names of
-  // the form 'Axx', 'xx', 'Ann', 'ABnn', or 'nn', where 'A' and 'B'
-  // are any letters, 'xx' is two hex digits, and 'nn' is 2-4
-  // decimal digits
-  if (missing && globalParams->getMapNumericCharNames()) {
+  // construct the char code -> Unicode mapping object
+  ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
+
+  // pass 2: try to fill in the missing chars, looking for ligatures, numeric
+  // references and variants
+  if (missing) {
     for (code = 0; code < 256; ++code) {
       if ((charName = enc[code]) && !toUnicode[code] &&
           strcmp(charName, ".notdef")) {
-        n = strlen(charName);
-        code2 = -1;
-        if (hex && n == 3 && isalpha(charName[0]) &&
-            isxdigit(charName[1]) && isxdigit(charName[2])) {
-          sscanf(charName+1, "%x", &code2);
-        } else if (hex && n == 2 &&
-                   isxdigit(charName[0]) && isxdigit(charName[1])) {
-          sscanf(charName, "%x", &code2);
-        } else if (!hex && n >= 2 && n <= 4 &&
-                   isdigit(charName[0]) && isdigit(charName[1])) {
-          code2 = atoi(charName);
-        } else if (n >= 3 && n <= 5 &&
-                   isdigit(charName[1]) && isdigit(charName[2])) {
-          code2 = atoi(charName+1);
-        } else if (n >= 4 && n <= 6 &&
-                   isdigit(charName[2]) && isdigit(charName[3])) {
-          code2 = atoi(charName+2);
-        }
-        if (code2 >= 0 && code2 <= 0xff) {
-          toUnicode[code] = (Unicode)code2;
-        }
+        if ((n = parseCharName(charName, uBuf, sizeof(uBuf)/sizeof(*uBuf),
+                               gFalse, // don't check simple names (pass 1)
+                               gTrue, // do check ligatures
+                               globalParams->getMapNumericCharNames(),
+                               hex,
+                               gTrue))) // do check variants
+          ctu->setMapping((CharCode)code, uBuf, n);
+        else
+          error(-1, "Could not parse charref for nameToUnicode: %s", charName);
       }
     }
   }
 
-  // construct the char code -> Unicode mapping object
-  ctu = CharCodeToUnicode::make8BitToUnicode(toUnicode);
-
   // merge in a ToUnicode CMap, if there is one -- this overwrites
   // existing entries in ctu, i.e., the ToUnicode CMap takes
   // precedence, but the other encoding info is allowed to fill in any
@@ -938,6 +941,114 @@ Gfx8BitFont::~Gfx8BitFont() {
   }
 }
 
+// Translate the glyph name <charName> into Unicode values, writing at
+// most <uLen> entries to <uBuf>.  Returns the number of entries written,
+// or 0 if the name could not be parsed.  Each flag enables one kind of
+// lookup; they are tried in this order:
+//   names     - look the whole name up with mapNameToUnicode()
+//   ligatures - split the name at '_' and translate each component
+//   numeric   - 'Axx', 'xx', 'Ann', 'ABnn', 'nn' and 'uniXXXX' names;
+//               <hex> enables 'Axx' and 'xx' and disables plain 'nn'
+//   variants  - drop a suffix accepted by isVariantGlyphForm()
+static int parseCharName(char *charName, Unicode *uBuf, int uLen,
+                         GBool names, GBool ligatures,
+                         GBool numeric, GBool hex, GBool variants)
+{
+  int n, m;
+  char *lig_part, *lig_end, *lig_copy, *main_part, *var_part;
+  const unsigned char *uc;
+
+  // every branch below stores at least one value in uBuf
+  if (uLen <= 0)
+    return 0;
+  if (names && (uBuf[0] = globalParams->mapNameToUnicode(charName)))
+    return 1;
+  if (ligatures && strchr(charName, '_')) {
+    // look for names of the form A_a (e.g. f_i, T_h, l_quotesingle)
+    lig_part = lig_copy = copyString(charName);
+    n = 0;
+    do {
+      if ((lig_end = strchr(lig_part, '_')))
+        *lig_end = 0;
+      if (strcmp(lig_part, ".notdef")) {
+        if ((m = parseCharName(lig_part, uBuf + n, uLen - n, gTrue, gFalse,
+                               numeric, hex, variants)))
+          n += m;
+        else
+          error(-1, "Could not parse ligature component in charref for "
+                "nameToUnicode: %s", charName);
+      }
+      // only step past a separator that was actually found
+      if (lig_end)
+        lig_part = lig_end + 1;
+    } while (lig_end && n < uLen);
+    gfree(lig_copy);
+    if (n > 0)
+      return n;
+  }
+  if (numeric) {
+    // look for names of the form 'Axx', 'xx', 'Ann', 'ABnn', or 'nn', where
+    // 'A' and 'B' are any letters, 'xx' is two hex digits, and 'nn' is 2-4
+    // decimal digits
+    // (the <ctype.h> classifiers are only defined for unsigned char values)
+    uc = (const unsigned char *)charName;
+    n = strlen(charName);
+    if (hex && n == 3 && isalpha(uc[0]) &&
+        isxdigit(uc[1]) && isxdigit(uc[2])) {
+      sscanf(charName+1, "%x", (unsigned int *)uBuf);
+      return 1;
+    } else if (hex && n == 2 &&
+               isxdigit(uc[0]) && isxdigit(uc[1])) {
+      sscanf(charName, "%x", (unsigned int *)uBuf);
+      return 1;
+    } else if (!hex && n >= 2 && n <= 4 &&
+               isdigit(uc[0]) && isdigit(uc[1])) {
+      uBuf[0] = (Unicode)atoi(charName);
+      return 1;
+    } else if (n >= 3 && n <= 5 &&
+               isdigit(uc[1]) && isdigit(uc[2])) {
+      uBuf[0] = (Unicode)atoi(charName+1);
+      return 1;
+    } else if (n >= 4 && n <= 6 &&
+               isdigit(uc[2]) && isdigit(uc[3])) {
+      uBuf[0] = (Unicode)atoi(charName+2);
+      return 1;
+    }
+    // look for names of the form 'uniXXXX' where 'XXXX' is 4 hex digits
+    if (n == 7 && !strncmp(charName, "uni", 3) &&
+        isxdigit(uc[3]) && isxdigit(uc[4]) &&
+        isxdigit(uc[5]) && isxdigit(uc[6])) {
+      sscanf(charName + 3, "%x", (unsigned int *)uBuf);
+      return 1;
+    }
+  }
+  if (variants && (var_part = strrchr(charName, '.'))) {
+    // look for names of the form 7.oldstyle, P.swash, s.sc, etc.
+    if (isVariantGlyphForm(var_part + 1)) {
+      // copy the name and cut it at the '.', so that the buffer given to
+      // gfree() comes from copyString() just as in the ligature case
+      main_part = copyString(charName);
+      main_part[var_part - charName] = 0;
+      n = parseCharName(main_part, uBuf, uLen, gTrue, ligatures, numeric,
+                        hex, gFalse);
+      gfree(main_part);
+      if (n > 0)
+        return n;
+    }
+  }
+  return 0;
+}
+
+// Return gTrue if <var_part> equals one of the entries of
+// variantGlyphForms, gFalse otherwise.
+static GBool isVariantGlyphForm(char *var_part) {
+  unsigned int i;
+  for (i = 0; i < sizeof(variantGlyphForms)/sizeof(*variantGlyphForms); ++i)
+    if (!strcmp(var_part, variantGlyphForms[i]))
+      return gTrue;
+  return gFalse;
+}
+
 int Gfx8BitFont::getNextChar(char *s, int len,
                              CharCode *code, Unicode *u, int uSize, int *uLen,
                              double *dx, double *dy, double *ox, double *oy) {