From a40bb1316a32e5a56590a436877c8ed6c3b2406a Mon Sep 17 00:00:00 2001 From: Jason Crain Date: Sun, 4 Aug 2013 22:30:23 -0500 Subject: [PATCH] Use ZapfDingbats names to locate glyphs only Some PDFs use names from ZapfDingbats (a1-a206) without intending for them to be used for text extraction. This modifies NameToUnicodeTable.h so ZapfDingbats names are only used to locate glyphs. --- poppler/GfxFont.cc | 6 +- poppler/GlobalParams.cc | 32 +++- poppler/GlobalParams.h | 9 +- poppler/NameToUnicodeTable.h | 414 +++++++++++++++++++++--------------------- 4 files changed, 243 insertions(+), 218 deletions(-) diff --git a/poppler/GfxFont.cc b/poppler/GfxFont.cc index ea22af8..b2b19ad 100644 --- a/poppler/GfxFont.cc +++ b/poppler/GfxFont.cc @@ -1223,7 +1223,7 @@ Gfx8BitFont::Gfx8BitFont(XRef *xref, const char *tagA, Ref idA, GooString *nameA missing = hex = gFalse; for (code = 0; code < 256; ++code) { if ((charName = enc[code])) { - if (!(toUnicode[code] = globalParams->mapNameToUnicode(charName)) && + if (!(toUnicode[code] = globalParams->mapNameToUnicodeText(charName)) && strcmp(charName, ".notdef")) { // if it wasn't in the name-to-Unicode table, check for a // name that looks like 'Axx' or 'xx', where 'A' is any letter @@ -1485,7 +1485,7 @@ static int parseCharName(char *charName, Unicode *uBuf, int uLen, // corresponding character in that list. // 3.2. otherwise, if the component is in the Adobe Glyph List, then map it // to the corresponding character in that list. 
- if (names && (uBuf[0] = globalParams->mapNameToUnicode(charName))) { + if (names && (uBuf[0] = globalParams->mapNameToUnicodeText(charName))) { return 1; } if (numeric) { @@ -1674,7 +1674,7 @@ int *Gfx8BitFont::getCodeToGIDMap(FoFiTrueType *ff) { } else if (useUnicode) { Unicode *uAux; for (i = 0; i < 256; ++i) { - if (((charName = enc[i]) && (u = globalParams->mapNameToUnicode(charName)))) + if (((charName = enc[i]) && (u = globalParams->mapNameToUnicodeGlyph(charName)))) map[i] = ff->mapCodeToGID(cmap, u); else { diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc index 6efdd0c..06e56fd 100644 --- a/poppler/GlobalParams.cc +++ b/poppler/GlobalParams.cc @@ -576,7 +576,8 @@ GlobalParams::GlobalParams(const char *customPopplerDataDir) #ifdef _WIN32 substFiles = new GooHash(gTrue); #endif - nameToUnicode = new NameToCharCode(); + nameToUnicodeGlyph = new NameToCharCode(); + nameToUnicodeText = new NameToCharCode(); cidToUnicodes = new GooHash(gTrue); unicodeToUnicodes = new GooHash(gTrue); residentUnicodeMaps = new GooHash(); @@ -648,9 +649,13 @@ GlobalParams::GlobalParams(const char *customPopplerDataDir) securityHandlers = new GooList(); #endif - // set up the initial nameToUnicode table - for (i = 0; nameToUnicodeTab[i].name; ++i) { - nameToUnicode->add(nameToUnicodeTab[i].name, nameToUnicodeTab[i].u); + // set up the initial nameToUnicode tables + for (i = 0; nameToUnicodeGlyphTab[i].name; ++i) { + nameToUnicodeGlyph->add(nameToUnicodeGlyphTab[i].name, nameToUnicodeGlyphTab[i].u); + } + + for (i = 0; nameToUnicodeTextTab[i].name; ++i) { + nameToUnicodeText->add(nameToUnicodeTextTab[i].name, nameToUnicodeTextTab[i].u); } // set up the residentUnicodeMaps table @@ -740,7 +745,7 @@ void GlobalParams::parseNameToUnicode(GooString *name) { tok2 = strtok_r(NULL, " \t\r\n", &tokptr); if (tok1 && tok2) { sscanf(tok1, "%x", &u); - nameToUnicode->add(tok2, u); + nameToUnicodeText->add(tok2, u); } else { error(errConfig, -1, "Bad line in 'nameToUnicode' file 
({0:t}:{1:d})", name, line); @@ -796,7 +801,8 @@ GlobalParams::~GlobalParams() { delete macRomanReverseMap; - delete nameToUnicode; + delete nameToUnicodeGlyph; + delete nameToUnicodeText; deleteGooHash(cidToUnicodes, GooString); deleteGooHash(unicodeToUnicodes, GooString); deleteGooHash(residentUnicodeMaps, UnicodeMap); @@ -853,9 +859,17 @@ CharCode GlobalParams::getMacRomanCharCode(char *charName) { return macRomanReverseMap->lookup(charName); } -Unicode GlobalParams::mapNameToUnicode(const char *charName) { - // no need to lock - nameToUnicode is constant - return nameToUnicode->lookup(charName); +Unicode GlobalParams::mapNameToUnicodeGlyph(const char *charName) { + // no need to lock - nameToUnicodeGlyph and nameToUnicodeText are constant + Unicode u = nameToUnicodeGlyph->lookup(charName); + if (!u) + u = nameToUnicodeText->lookup(charName); + return u; +} + +Unicode GlobalParams::mapNameToUnicodeText(const char *charName) { + // no need to lock - nameToUnicodeText is constant + return nameToUnicodeText->lookup(charName); } UnicodeMap *GlobalParams::getResidentUnicodeMap(GooString *encodingName) { diff --git a/poppler/GlobalParams.h b/poppler/GlobalParams.h index bc11684..68ba715 100644 --- a/poppler/GlobalParams.h +++ b/poppler/GlobalParams.h @@ -139,7 +139,8 @@ public: CharCode getMacRomanCharCode(char *charName); - Unicode mapNameToUnicode(const char *charName); + Unicode mapNameToUnicodeGlyph(const char *charName); + Unicode mapNameToUnicodeText(const char *charName); UnicodeMap *getResidentUnicodeMap(GooString *encodingName); FILE *getUnicodeMapFile(GooString *encodingName); FILE *findCMapFile(GooString *collection, GooString *cMapName); @@ -271,8 +272,10 @@ private: //----- user-modifiable settings - NameToCharCode * // mapping from char name to Unicode - nameToUnicode; + NameToCharCode * // mapping from char name to Unicode for glyphs + nameToUnicodeGlyph; + NameToCharCode * // mapping from char name to Unicode for text + nameToUnicodeText; // extraction 
GooHash *cidToUnicodes; // files for mappings from char collections // to Unicode, indexed by collection name // [GooString] diff --git a/poppler/NameToUnicodeTable.h b/poppler/NameToUnicodeTable.h index 74645ab..63a2baa 100644 --- a/poppler/NameToUnicodeTable.h +++ b/poppler/NameToUnicodeTable.h @@ -24,10 +24,13 @@ #include -static struct { +struct NameToUnicodeTab { Unicode u; const char *name; -} nameToUnicodeTab[] = { +}; + +// used to map character names to Unicode and Glyph IDs +static struct NameToUnicodeTab nameToUnicodeTextTab[] = { {0x0021, "!"}, {0x0023, "#"}, {0x0024, "$"}, @@ -800,207 +803,6 @@ static struct { {0x005f, "_"}, {0x0060, "`"}, {0x0061, "a"}, - {0x275e, "a100"}, - {0x2761, "a101"}, - {0x2762, "a102"}, - {0x2763, "a103"}, - {0x2764, "a104"}, - {0x2710, "a105"}, - {0x2765, "a106"}, - {0x2766, "a107"}, - {0x2767, "a108"}, - {0x2660, "a109"}, - {0x2721, "a10"}, - {0x2665, "a110"}, - {0x2666, "a111"}, - {0x2663, "a112"}, - {0x2709, "a117"}, - {0x2708, "a118"}, - {0x2707, "a119"}, - {0x261b, "a11"}, - {0x2460, "a120"}, - {0x2461, "a121"}, - {0x2462, "a122"}, - {0x2463, "a123"}, - {0x2464, "a124"}, - {0x2465, "a125"}, - {0x2466, "a126"}, - {0x2467, "a127"}, - {0x2468, "a128"}, - {0x2469, "a129"}, - {0x261e, "a12"}, - {0x2776, "a130"}, - {0x2777, "a131"}, - {0x2778, "a132"}, - {0x2779, "a133"}, - {0x277a, "a134"}, - {0x277b, "a135"}, - {0x277c, "a136"}, - {0x277d, "a137"}, - {0x277e, "a138"}, - {0x277f, "a139"}, - {0x270c, "a13"}, - {0x2780, "a140"}, - {0x2781, "a141"}, - {0x2782, "a142"}, - {0x2783, "a143"}, - {0x2784, "a144"}, - {0x2785, "a145"}, - {0x2786, "a146"}, - {0x2787, "a147"}, - {0x2788, "a148"}, - {0x2789, "a149"}, - {0x270d, "a14"}, - {0x278a, "a150"}, - {0x278b, "a151"}, - {0x278c, "a152"}, - {0x278d, "a153"}, - {0x278e, "a154"}, - {0x278f, "a155"}, - {0x2790, "a156"}, - {0x2791, "a157"}, - {0x2792, "a158"}, - {0x2793, "a159"}, - {0x270e, "a15"}, - {0x2794, "a160"}, - {0x2192, "a161"}, - {0x27a3, "a162"}, - {0x2194, "a163"}, - 
{0x2195, "a164"}, - {0x2799, "a165"}, - {0x279b, "a166"}, - {0x279c, "a167"}, - {0x279d, "a168"}, - {0x279e, "a169"}, - {0x270f, "a16"}, - {0x279f, "a170"}, - {0x27a0, "a171"}, - {0x27a1, "a172"}, - {0x27a2, "a173"}, - {0x27a4, "a174"}, - {0x27a5, "a175"}, - {0x27a6, "a176"}, - {0x27a7, "a177"}, - {0x27a8, "a178"}, - {0x27a9, "a179"}, - {0x2711, "a17"}, - {0x27ab, "a180"}, - {0x27ad, "a181"}, - {0x27af, "a182"}, - {0x27b2, "a183"}, - {0x27b3, "a184"}, - {0x27b5, "a185"}, - {0x27b8, "a186"}, - {0x27ba, "a187"}, - {0x27bb, "a188"}, - {0x27bc, "a189"}, - {0x2712, "a18"}, - {0x27bd, "a190"}, - {0x27be, "a191"}, - {0x279a, "a192"}, - {0x27aa, "a193"}, - {0x27b6, "a194"}, - {0x27b9, "a195"}, - {0x2798, "a196"}, - {0x27b4, "a197"}, - {0x27b7, "a198"}, - {0x27ac, "a199"}, - {0x2713, "a19"}, - {0x2701, "a1"}, - {0x27ae, "a200"}, - {0x27b1, "a201"}, - {0x2703, "a202"}, - {0x2750, "a203"}, - {0x2752, "a204"}, - {0x276e, "a205"}, - {0x2770, "a206"}, - {0x2714, "a20"}, - {0x2715, "a21"}, - {0x2716, "a22"}, - {0x2717, "a23"}, - {0x2718, "a24"}, - {0x2719, "a25"}, - {0x271a, "a26"}, - {0x271b, "a27"}, - {0x271c, "a28"}, - {0x2722, "a29"}, - {0x2702, "a2"}, - {0x2723, "a30"}, - {0x2724, "a31"}, - {0x2725, "a32"}, - {0x2726, "a33"}, - {0x2727, "a34"}, - {0x2605, "a35"}, - {0x2729, "a36"}, - {0x272a, "a37"}, - {0x272b, "a38"}, - {0x272c, "a39"}, - {0x2704, "a3"}, - {0x272d, "a40"}, - {0x272e, "a41"}, - {0x272f, "a42"}, - {0x2730, "a43"}, - {0x2731, "a44"}, - {0x2732, "a45"}, - {0x2733, "a46"}, - {0x2734, "a47"}, - {0x2735, "a48"}, - {0x2736, "a49"}, - {0x260e, "a4"}, - {0x2737, "a50"}, - {0x2738, "a51"}, - {0x2739, "a52"}, - {0x273a, "a53"}, - {0x273b, "a54"}, - {0x273c, "a55"}, - {0x273d, "a56"}, - {0x273e, "a57"}, - {0x273f, "a58"}, - {0x2740, "a59"}, - {0x2706, "a5"}, - {0x2741, "a60"}, - {0x2742, "a61"}, - {0x2743, "a62"}, - {0x2744, "a63"}, - {0x2745, "a64"}, - {0x2746, "a65"}, - {0x2747, "a66"}, - {0x2748, "a67"}, - {0x2749, "a68"}, - {0x274a, "a69"}, - {0x271d, "a6"}, - 
{0x274b, "a70"}, - {0x25cf, "a71"}, - {0x274d, "a72"}, - {0x25a0, "a73"}, - {0x274f, "a74"}, - {0x2751, "a75"}, - {0x25b2, "a76"}, - {0x25bc, "a77"}, - {0x25c6, "a78"}, - {0x2756, "a79"}, - {0x271e, "a7"}, - {0x25d7, "a81"}, - {0x2758, "a82"}, - {0x2759, "a83"}, - {0x275a, "a84"}, - {0x276f, "a85"}, - {0x2771, "a86"}, - {0x2772, "a87"}, - {0x2773, "a88"}, - {0x2768, "a89"}, - {0x271f, "a8"}, - {0x2769, "a90"}, - {0x276c, "a91"}, - {0x276d, "a92"}, - {0x276a, "a93"}, - {0x276b, "a94"}, - {0x2774, "a95"}, - {0x2775, "a96"}, - {0x275b, "a97"}, - {0x275c, "a98"}, - {0x275d, "a99"}, - {0x2720, "a9"}, {0x0986, "aabengali"}, {0x00e1, "aacute"}, {0x0906, "aadeva"}, @@ -4473,3 +4275,209 @@ static struct { {0x007e, "~"}, { 0, NULL } }; + +// used only to map character names to Glyph IDs +static struct NameToUnicodeTab nameToUnicodeGlyphTab[] = { + {0x275e, "a100"}, + {0x2761, "a101"}, + {0x2762, "a102"}, + {0x2763, "a103"}, + {0x2764, "a104"}, + {0x2710, "a105"}, + {0x2765, "a106"}, + {0x2766, "a107"}, + {0x2767, "a108"}, + {0x2660, "a109"}, + {0x2721, "a10"}, + {0x2665, "a110"}, + {0x2666, "a111"}, + {0x2663, "a112"}, + {0x2709, "a117"}, + {0x2708, "a118"}, + {0x2707, "a119"}, + {0x261b, "a11"}, + {0x2460, "a120"}, + {0x2461, "a121"}, + {0x2462, "a122"}, + {0x2463, "a123"}, + {0x2464, "a124"}, + {0x2465, "a125"}, + {0x2466, "a126"}, + {0x2467, "a127"}, + {0x2468, "a128"}, + {0x2469, "a129"}, + {0x261e, "a12"}, + {0x2776, "a130"}, + {0x2777, "a131"}, + {0x2778, "a132"}, + {0x2779, "a133"}, + {0x277a, "a134"}, + {0x277b, "a135"}, + {0x277c, "a136"}, + {0x277d, "a137"}, + {0x277e, "a138"}, + {0x277f, "a139"}, + {0x270c, "a13"}, + {0x2780, "a140"}, + {0x2781, "a141"}, + {0x2782, "a142"}, + {0x2783, "a143"}, + {0x2784, "a144"}, + {0x2785, "a145"}, + {0x2786, "a146"}, + {0x2787, "a147"}, + {0x2788, "a148"}, + {0x2789, "a149"}, + {0x270d, "a14"}, + {0x278a, "a150"}, + {0x278b, "a151"}, + {0x278c, "a152"}, + {0x278d, "a153"}, + {0x278e, "a154"}, + {0x278f, "a155"}, + {0x2790, 
"a156"}, + {0x2791, "a157"}, + {0x2792, "a158"}, + {0x2793, "a159"}, + {0x270e, "a15"}, + {0x2794, "a160"}, + {0x2192, "a161"}, + {0x27a3, "a162"}, + {0x2194, "a163"}, + {0x2195, "a164"}, + {0x2799, "a165"}, + {0x279b, "a166"}, + {0x279c, "a167"}, + {0x279d, "a168"}, + {0x279e, "a169"}, + {0x270f, "a16"}, + {0x279f, "a170"}, + {0x27a0, "a171"}, + {0x27a1, "a172"}, + {0x27a2, "a173"}, + {0x27a4, "a174"}, + {0x27a5, "a175"}, + {0x27a6, "a176"}, + {0x27a7, "a177"}, + {0x27a8, "a178"}, + {0x27a9, "a179"}, + {0x2711, "a17"}, + {0x27ab, "a180"}, + {0x27ad, "a181"}, + {0x27af, "a182"}, + {0x27b2, "a183"}, + {0x27b3, "a184"}, + {0x27b5, "a185"}, + {0x27b8, "a186"}, + {0x27ba, "a187"}, + {0x27bb, "a188"}, + {0x27bc, "a189"}, + {0x2712, "a18"}, + {0x27bd, "a190"}, + {0x27be, "a191"}, + {0x279a, "a192"}, + {0x27aa, "a193"}, + {0x27b6, "a194"}, + {0x27b9, "a195"}, + {0x2798, "a196"}, + {0x27b4, "a197"}, + {0x27b7, "a198"}, + {0x27ac, "a199"}, + {0x2713, "a19"}, + {0x2701, "a1"}, + {0x27ae, "a200"}, + {0x27b1, "a201"}, + {0x2703, "a202"}, + {0x2750, "a203"}, + {0x2752, "a204"}, + {0x276e, "a205"}, + {0x2770, "a206"}, + {0x2714, "a20"}, + {0x2715, "a21"}, + {0x2716, "a22"}, + {0x2717, "a23"}, + {0x2718, "a24"}, + {0x2719, "a25"}, + {0x271a, "a26"}, + {0x271b, "a27"}, + {0x271c, "a28"}, + {0x2722, "a29"}, + {0x2702, "a2"}, + {0x2723, "a30"}, + {0x2724, "a31"}, + {0x2725, "a32"}, + {0x2726, "a33"}, + {0x2727, "a34"}, + {0x2605, "a35"}, + {0x2729, "a36"}, + {0x272a, "a37"}, + {0x272b, "a38"}, + {0x272c, "a39"}, + {0x2704, "a3"}, + {0x272d, "a40"}, + {0x272e, "a41"}, + {0x272f, "a42"}, + {0x2730, "a43"}, + {0x2731, "a44"}, + {0x2732, "a45"}, + {0x2733, "a46"}, + {0x2734, "a47"}, + {0x2735, "a48"}, + {0x2736, "a49"}, + {0x260e, "a4"}, + {0x2737, "a50"}, + {0x2738, "a51"}, + {0x2739, "a52"}, + {0x273a, "a53"}, + {0x273b, "a54"}, + {0x273c, "a55"}, + {0x273d, "a56"}, + {0x273e, "a57"}, + {0x273f, "a58"}, + {0x2740, "a59"}, + {0x2706, "a5"}, + {0x2741, "a60"}, + {0x2742, "a61"}, + 
{0x2743, "a62"}, + {0x2744, "a63"}, + {0x2745, "a64"}, + {0x2746, "a65"}, + {0x2747, "a66"}, + {0x2748, "a67"}, + {0x2749, "a68"}, + {0x274a, "a69"}, + {0x271d, "a6"}, + {0x274b, "a70"}, + {0x25cf, "a71"}, + {0x274d, "a72"}, + {0x25a0, "a73"}, + {0x274f, "a74"}, + {0x2751, "a75"}, + {0x25b2, "a76"}, + {0x25bc, "a77"}, + {0x25c6, "a78"}, + {0x2756, "a79"}, + {0x271e, "a7"}, + {0x25d7, "a81"}, + {0x2758, "a82"}, + {0x2759, "a83"}, + {0x275a, "a84"}, + {0x276f, "a85"}, + {0x2771, "a86"}, + {0x2772, "a87"}, + {0x2773, "a88"}, + {0x2768, "a89"}, + {0x271f, "a8"}, + {0x2769, "a90"}, + {0x276c, "a91"}, + {0x276d, "a92"}, + {0x276a, "a93"}, + {0x276b, "a94"}, + {0x2774, "a95"}, + {0x2775, "a96"}, + {0x275b, "a97"}, + {0x275c, "a98"}, + {0x275d, "a99"}, + {0x2720, "a9"}, + { 0, NULL } +}; -- 1.7.10.4