From e3ee72ec9995a244cd89b234a96f6cd5bcd17ad3 Mon Sep 17 00:00:00 2001
From: Adrian Johnson
Date: Sun, 26 Nov 2017 20:43:15 +1030
Subject: [PATCH 1/8] Fix some bugs in StructTreeRoot parsing of parent tree

- Add support for parsing child nodes in the number tree
- Number tree keys do not have to be consecutive numbers. Use map instead of vector for parentTree.
- Due to performance impact of iterating a map instead of vector in parentTreeAdd, add a reverse mapping from Ref to parentTree.
- Add mcid parameter to findParentElement() to enable finding the parent when there are multiple MCIDs on the same page.
- Move RefCompare from pdfinfo.cc to Object.h so it can be used by other files.

Bug #103912
---
NOTE(review): this copy was recovered from a whitespace-collapsed extract in
which text inside angle brackets had been stripped. Line breaks and indentation
(spaces only) are reconstructed, so apply with whitespace-tolerant options.
Template arguments and the <map>/<vector> header names were restored from how
the names are used elsewhere in this patch; verify them against the tree. The
context line "std::set seenElements;" and the author's e-mail address could not
be recovered and are left as found.
NOTE(review): memset(vec.data(), 0, vec.size()*sizeof(Parent*)) is sized by a
pointer, while vec's elements are accessed as objects (.ref, ->element);
please confirm this is intended.

 poppler/Object.h          |   9 ++++
 poppler/StructElement.cc  |   2 +-
 poppler/StructTreeRoot.cc | 121 +++++++++++++++++++++++++++-------------------
 poppler/StructTreeRoot.h  |  14 ++++--
 utils/pdfinfo.cc          |   6 ---
 5 files changed, 92 insertions(+), 60 deletions(-)

diff --git a/poppler/Object.h b/poppler/Object.h
index f2ca20d1..811ce55e 100644
--- a/poppler/Object.h
+++ b/poppler/Object.h
@@ -86,6 +86,15 @@ struct Ref {
   int gen; // generation number
 };
 
+struct RefCompare {
+  bool operator() (const Ref& lhs, const Ref& rhs) const {
+    if (lhs.num != rhs.num)
+      return lhs.num < rhs.num;
+    else
+      return lhs.gen < rhs.gen;
+  }
+};
+
 //------------------------------------------------------------------------
 // object types
 //------------------------------------------------------------------------
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 62925474..e46fafea 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -1198,7 +1198,7 @@ StructElement *StructElement::parseChild(Object *ref,
        * page will be always scanned when calling StructElement::getText().
        */
       Object mcidObj = childObj->dictLookup("MCID");
-      if (mcidObj.isInt()) {
+      if (!mcidObj.isInt()) {
         error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
         return NULL;
       }
diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
index 5f561115..efcb6fb4 100644
--- a/poppler/StructTreeRoot.cc
+++ b/poppler/StructTreeRoot.cc
@@ -55,50 +55,7 @@ void StructTreeRoot::parse(Dict *root)
   // be filled-in later when parsing them.
   Object obj = root->lookup("ParentTree");
   if (obj.isDict()) {
-    Object nums = obj.dictLookup("Nums");
-    if (nums.isArray()) {
-      if (nums.arrayGetLength() % 2 == 0) {
-        parentTree.resize(nums.arrayGetLength() / 2);
-        // Index numbers in even positions, references in odd ones
-        for (int i = 0; i < nums.arrayGetLength(); i += 2) {
-          Object index = nums.arrayGet(i);
-
-          if (!index.isInt()) {
-            error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i, index.getTypeName());
-            continue;
-          }
-          const int idx = index.getInt();
-          if (idx < 0 || idx >= (int)parentTree.size()) {
-            error(errSyntaxError, -1, "Nums item at position {0:d} is invalid value ({1:d}): [0..{2:d}]", i, idx, parentTree.size() - 1);
-            continue;
-          }
-
-          Object value = nums.arrayGetNF(i + 1);
-          if (value.isRef()) {
-            parentTree[idx].resize(1);
-            parentTree[idx][0].ref = value.getRef();
-          } else {
-            value = nums.arrayGet(i + 1);
-            if (value.isArray()) {
-              parentTree[idx].resize(value.arrayGetLength());
-              for (int j = 0; j < value.arrayGetLength(); j++) {
-                Object itemvalue = value.arrayGetNF(j);
-                if (itemvalue.isRef())
-                  parentTree[idx][j].ref = itemvalue.getRef();
-                else
-                  error(errSyntaxError, -1, "Nums array item at position {0:d}/{1:d} is invalid type ({2:s})", i, j, itemvalue.getTypeName());
-              }
-            } else {
-              error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i + 1, value.getTypeName());
-            }
-          }
-        }
-      } else {
-        error(errSyntaxError, -1, "Nums array length is not a even ({0:d})", nums.arrayGetLength());
-      }
-    } else {
-      error(errSyntaxError, -1, "Nums object is wrong type ({0:s})", nums.getTypeName());
-    }
+    parseNumberTreeNode(obj.getDict());
   }
 
   std::set seenElements;
@@ -154,14 +111,80 @@ void StructTreeRoot::parse(Dict *root)
   } else if (!kids.isNull()) {
     error(errSyntaxWarning, -1, "K in StructTreeRoot is wrong type ({0:s})", kids.getTypeName());
   }
+
+  // refToParentMap is only used during parsing. Ensure all memory used by it is freed.
+  std::multimap<Ref, Parent*, RefCompare>().swap(refToParentMap);
 }
 
-void StructTreeRoot::parentTreeAdd(const Ref &objectRef, StructElement *element)
+void StructTreeRoot::parseNumberTreeNode(Dict *node)
 {
-  for (std::vector< std::vector<Parent> >::iterator i = parentTree.begin(); i != parentTree.end(); ++i) {
-    for (std::vector<Parent>::iterator j = i->begin(); j != i->end(); ++j) {
-      if (j->ref.num == objectRef.num && j->ref.gen == objectRef.gen)
-        j->element = element;
+  Object kids = node->lookup("Kids");
+  if (kids.isArray()) {
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj = kids.arrayGet(i);
+      if (obj.isDict()) {
+        parseNumberTreeNode(obj.getDict());
+      } else {
+        error(errSyntaxError, -1, "Kids item at position {0:d} is wrong type ({1:s})", i, obj.getTypeName());
+      }
     }
+    return;
+  } else if (!kids.isNull()) {
+    error(errSyntaxError, -1, "Kids object is wrong type ({0:s})", kids.getTypeName());
   }
+
+  Object nums = node->lookup("Nums");
+  if (nums.isArray()) {
+    if (nums.arrayGetLength() % 2 == 0) {
+      // keys in even positions, references in odd ones
+      for (int i = 0; i < nums.arrayGetLength(); i += 2) {
+        Object key = nums.arrayGet(i);
+
+        if (!key.isInt()) {
+          error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i, key.getTypeName());
+          continue;
+        }
+        int keyVal = key.getInt();
+        std::vector<Parent>& vec = parentTree[keyVal];
+
+        Object value = nums.arrayGet(i + 1);
+        if (value.isArray()) {
+          vec.resize(value.arrayGetLength());
+          memset(vec.data(), 0, vec.size()*sizeof(Parent*));
+          for (int j = 0; j < value.arrayGetLength(); j++) {
+            Object itemvalue = value.arrayGetNF(j);
+            if (itemvalue.isRef()) {
+              Ref ref = itemvalue.getRef();
+              vec[j].ref = ref;
+              refToParentMap.insert(std::pair<Ref, Parent*>(ref, &vec[j]));
+            } else if (!itemvalue.isNull()) {
+              error(errSyntaxError, -1, "Nums array item at position {0:d}/{1:d} is invalid type ({2:s})", i, j, itemvalue.getTypeName());
+            }
+          }
+        } else {
+          value = nums.arrayGetNF(i + 1);
+          if (value.isRef()) {
+            Ref ref = value.getRef();
+            vec.resize(1);
+            vec[0].ref = ref;
+            refToParentMap.insert(std::pair<Ref, Parent*>(ref, &vec[0]));
+          } else {
+            error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i + 1, value.getTypeName());
+          }
+        }
+      }
+    } else {
+      error(errSyntaxError, -1, "Nums array length is not a even ({0:d})", nums.arrayGetLength());
+    }
+  } else {
+    error(errSyntaxError, -1, "Nums object is wrong type ({0:s})", nums.getTypeName());
+  }
+}
+
+
+void StructTreeRoot::parentTreeAdd(const Ref &objectRef, StructElement *element)
+{
+  auto range = refToParentMap.equal_range(objectRef);
+  for (auto it = range.first; it !=range.second; ++it)
+    it->second->element = element;
 }
diff --git a/poppler/StructTreeRoot.h b/poppler/StructTreeRoot.h
index 3b1f3c84..ca688499 100644
--- a/poppler/StructTreeRoot.h
+++ b/poppler/StructTreeRoot.h
@@ -18,6 +18,7 @@
 #include "goo/gtypes.h"
 #include "Object.h"
 #include "StructElement.h"
+#include <map>
 #include <vector>
 
 class Dict;
@@ -43,9 +44,12 @@ public:
     }
   }
 
-  const StructElement *findParentElement(unsigned index) const {
-    if (index < parentTree.size() && parentTree[index].size() == 1) {
-      return parentTree[index][0].element;
+  const StructElement *findParentElement(int key, unsigned mcid = 0) const {
+    auto it = parentTree.find(key);
+    if (it != parentTree.end()) {
+      if (mcid < it->second.size()) {
+        return it->second[mcid].element;
+      }
     }
     return NULL;
   }
@@ -71,9 +75,11 @@ private:
   Object roleMap;
   Object classMap;
   ElemPtrArray elements;
-  std::vector< std::vector<Parent> > parentTree;
+  std::map<int, std::vector<Parent> > parentTree;
+  std::multimap<Ref, Parent*, RefCompare> refToParentMap;
 
   void parse(Dict *rootDict);
+  void parseNumberTreeNode(Dict *node);
   void parentTreeAdd(const Ref &objectRef, StructElement *element);
 
   friend class StructElement;
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index a3099cf6..4cb569c5 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -291,12 +291,6 @@ static void printStruct(const StructElement *element, unsigned indent) {
   }
 }
 
-struct RefCompare {
-  bool operator() (const Ref& lhs, const Ref& rhs) const {
-    return lhs.num < rhs.num;
-  }
-};
-
 struct GooStringCompare {
   bool operator() (GooString* lhs, GooString* rhs) const {
     return lhs->cmp(const_cast<GooString*>(rhs)) < 0;
-- 
2.11.0