From 5a9452e549f322c8cf2cf4dad2843939053e5b70 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Mon, 17 Jun 2013 17:00:27 +0300 Subject: [PATCH v6 02/10] Tagged-PDF: Implement parsing of StructTreeRoot Implement parsing of the StructTreeRoot entry of the Catalog. Also, the Catalog::getStructTreeRoot() and PDFDoc::getStructTreeRoot() methods are modified to return an instance of StructTreeRoot instead of an Object. All elements from the StructTreeRoot are parsed except for: - IDTree: it is a lookup tree to locate items by their ID, which would be barely useful because the whole structure tree is to be kept in memory, which should be fast enough to traverse. - ParentTreeNextKey: This is needed only when the ParentTree object is to be modified. For the moment the implementation deals only with reading, so this has been deliberately left out. Also, pdfinfo now prints tagging info from Catalog::getMarkInfo() instead of assuming that the presence of the StructTreeRoot implies that the file is tagged.
--- poppler/Catalog.cc | 36 ++++++---- poppler/Catalog.h | 5 +- poppler/Makefile.am | 3 + poppler/PDFDoc.h | 3 +- poppler/StructElement.h | 56 +++++++++++++++ poppler/StructTreeRoot.cc | 169 ++++++++++++++++++++++++++++++++++++++++++++++ poppler/StructTreeRoot.h | 83 +++++++++++++++++++++++ utils/pdfinfo.cc | 8 ++- 8 files changed, 343 insertions(+), 20 deletions(-) create mode 100644 poppler/StructElement.h create mode 100644 poppler/StructTreeRoot.cc create mode 100644 poppler/StructTreeRoot.h diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc index 25a8997..a08a5df 100644 --- a/poppler/Catalog.cc +++ b/poppler/Catalog.cc @@ -56,6 +56,7 @@ #include "OptionalContent.h" #include "ViewerPreferences.h" #include "FileSpec.h" +#include "StructTreeRoot.h" #if MULTITHREADED # define catalogLocker() MutexLocker locker(&mutex) @@ -91,6 +92,7 @@ Catalog::Catalog(PDFDoc *docA) { embeddedFileNameTree = NULL; jsNameTree = NULL; viewerPrefs = NULL; + structTreeRoot = NULL; pagesList = NULL; pagesRefList = NULL; @@ -176,8 +178,8 @@ Catalog::~Catalog() { delete form; delete optContent; delete viewerPrefs; + delete structTreeRoot; metadata.free(); - structTreeRoot.free(); outline.free(); acroForm.free(); viewerPreferences.free(); @@ -838,24 +840,28 @@ PageLabelInfo *Catalog::getPageLabelInfo() return pageLabelInfo; } -Object *Catalog::getStructTreeRoot() +StructTreeRoot *Catalog::getStructTreeRoot() { catalogLocker(); - if (structTreeRoot.isNone()) - { - Object catDict; + if (!structTreeRoot) { /* not parsed yet: try to build the tree now; once built, the pointer is kept and returned by later calls */ + Object catalog; + Object root; - xref->getCatalog(&catDict); - if (catDict.isDict()) { - catDict.dictLookup("StructTreeRoot", &structTreeRoot); - } else { - error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catDict.getTypeName()); - structTreeRoot.initNull(); - } - catDict.free(); + xref->getCatalog(&catalog); + if (!catalog.isDict()) { + error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catalog.getTypeName()); + catalog.free(); + return NULL; + } + + if 
(catalog.dictLookup("StructTreeRoot", &root)->isDict("StructTreeRoot")) { /* any other entry leaves structTreeRoot NULL, so NULL is returned */ + structTreeRoot = new StructTreeRoot(doc, root.getDict()); /* released by the delete in ~Catalog() */ + } + + root.free(); + catalog.free(); } - - return &structTreeRoot; + return structTreeRoot; } Guint Catalog::getMarkInfo() diff --git a/poppler/Catalog.h b/poppler/Catalog.h index a89d9aa..4b1629a 100644 --- a/poppler/Catalog.h +++ b/poppler/Catalog.h @@ -53,6 +53,7 @@ class Form; class OCGs; class ViewerPreferences; class FileSpec; +class StructTreeRoot; //------------------------------------------------------------------------ // NameTree @@ -123,7 +124,7 @@ public: GooString *readMetadata(); // Return the structure tree root object. - Object *getStructTreeRoot(); + StructTreeRoot *getStructTreeRoot(); // Return values from the MarkInfo dictionary as flags in a bitfield. enum MarkInfoFlags { @@ -227,7 +228,7 @@ private: NameTree *jsNameTree; // Java Script name-tree GooString *baseURI; // base URI for URI-type links Object metadata; // metadata stream - Object structTreeRoot; // structure tree root dictionary + StructTreeRoot *structTreeRoot; // structure tree root (owned; NULL until getStructTreeRoot() builds it) Guint markInfo; // Flags from MarkInfo dictionary Object outline; // outline dictionary Object acroForm; // AcroForm dictionary diff --git a/poppler/Makefile.am b/poppler/Makefile.am index c061e71..a2efdd3 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -216,6 +216,8 @@ poppler_include_HEADERS = \ StdinPDFDocBuilder.h \ Stream-CCITT.h \ Stream.h \ + StructElement.h \ + StructTreeRoot.h \ UnicodeMap.h \ UnicodeMapTables.h \ UnicodeTypeTable.h \ @@ -294,6 +296,7 @@ libpoppler_la_SOURCES = \ StdinCachedFile.cc \ StdinPDFDocBuilder.cc \ Stream.cc \ + StructTreeRoot.cc \ strtok_r.cpp \ UnicodeMap.cc \ UnicodeTypeTable.cc \ diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h index da9bf5b..48189bc 100644 --- a/poppler/PDFDoc.h +++ b/poppler/PDFDoc.h @@ -60,6 +60,7 @@ class Outline; class Linearization; class SecurityHandler; class Hints; +class StructTreeRoot; 
enum PDFWriteMode { writeStandard, @@ -139,7 +140,7 @@ public: GooString *readMetadata() { return catalog->readMetadata(); } // Return the structure tree root object. - Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); } + StructTreeRoot *getStructTreeRoot() { return catalog->getStructTreeRoot(); } // Get page. Page *getPage(int page);
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
new file mode 100644
index 0000000..79151a3
--- /dev/null
+++ b/poppler/StructElement.h
@@ -0,0 +1,60 @@
+//========================================================================
+//
+// StructElement.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef STRUCTELEMENT_H
+#define STRUCTELEMENT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "goo/GooString.h"
+#include "Object.h"
+#include <vector>
+
+class GooString;
+class Dict;
+class StructTreeRoot;
+
+
+// TODO: Dummy class to make it possible to build StructTreeRoot.cc
+class StructElement {
+public:
+  enum Type {
+    Unknown = 0,
+    Document, Part, Art, Sect, Div, // Structural elements
+  };
+
+  const char* getTypeName() const { return "dummy"; }
+  Type getType() const { return type; }
+  GBool isOk() const { return type != Unknown; }
+  StructTreeRoot* getStructTreeRoot() { return treeRoot; }
+
+  ~StructElement() {}
+
+private:
+
+  // Common data
+  Type type;
+  StructTreeRoot *treeRoot;
+  StructElement *parent;
+
+  // Nothing is parsed from elementDict yet. The members are still given
+  // defined values so that isOk()/getType() never read indeterminate
+  // data; with type Unknown, isOk() reports the element as not parsed.
+  StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA = 0):
+    type(Unknown), treeRoot(treeRootA), parent(parentA) {}
+
+  friend class StructTreeRoot;
+};
+
+#endif
+
diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
new file mode 100644
index 0000000..730f944
--- /dev/null
+++ b/poppler/StructTreeRoot.cc
@@ -0,0 +1,188 @@
+//========================================================================
+//
+// StructTreeRoot.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include "goo/GooString.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
+#include "PDFDoc.h"
+#include "Object.h"
+#include "Dict.h"
+
+#include <assert.h>
+
+
+// Builds the tree by parsing structTreeRootDict right away. Both
+// arguments must be non-NULL (enforced with assert).
+StructTreeRoot::StructTreeRoot(PDFDoc *docA, Dict *structTreeRootDict):
+  doc(docA)
+{
+  assert(doc);
+  assert(structTreeRootDict);
+  parse(structTreeRootDict);
+}
+
+// Deletes the top-level elements collected by parse() and releases the
+// RoleMap/ClassMap objects.
+StructTreeRoot::~StructTreeRoot()
+{
+  for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i)
+    delete *i;
+  classMap.free();
+  roleMap.free();
+}
+
+// Reads RoleMap, ClassMap, the references listed in ParentTree/Nums and
+// the children listed in K. Malformed entries are reported through
+// error() and skipped, so parsing always runs to the end.
+void StructTreeRoot::parse(Dict *root)
+{
+  // The RoleMap/ClassMap dictionaries are needed by all the parsing
+  // functions, which will resolve the custom names to canonical
+  // standard names.
+  root->lookup("RoleMap", &roleMap);
+  root->lookup("ClassMap", &classMap);
+
+  // ParentTree (optional). If present, it must be a number tree,
+  // otherwise it is not possible to map stream objects to their
+  // corresponding structure element. Here only the references are
+  // loaded into the array, the pointers to the StructElements will
+  // be filled-in later when parsing them.
+  Object obj;
+  if (root->lookup("ParentTree", &obj)->isDict()) {
+    Object nums;
+    if (obj.dictLookup("Nums", &nums)->isArray()) {
+      if (nums.arrayGetLength() % 2 == 0) {
+        parentTree.resize(nums.arrayGetLength() / 2);
+        // Index numbers in even positions, references in odd ones
+        for (int i = 0; i < nums.arrayGetLength(); i += 2) {
+          Object index, value;
+
+          if (!nums.arrayGet(i, &index)->isInt()) {
+            error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i, index.getTypeName());
+            index.free();
+            continue;
+          }
+          // parentTree has one slot per key/value pair, so only keys in
+          // [0, size) can be stored. Any other key is rejected here,
+          // because indexing the vector with it would access memory
+          // outside of the vector.
+          if (index.getInt() < 0 ||
+              static_cast<size_t>(index.getInt()) >= parentTree.size()) {
+            error(errSyntaxError, -1, "Nums item at position {0:d} is invalid value ({1:d})", i, index.getInt());
+            index.free();
+            continue;
+          }
+
+          const unsigned idx = index.getInt();
+          if (nums.arrayGetNF(i + 1, &value)->isRef()) {
+            parentTree[idx].resize(1);
+            parentTree[idx][0].ref = value.getRef();
+          } else {
+            // value is about to be fetched into a second time: free the
+            // first result so that each fetch is paired with one free().
+            value.free();
+            if (nums.arrayGet(i + 1, &value)->isArray()) {
+              parentTree[idx].resize(value.arrayGetLength());
+              for (int j = 0; j < value.arrayGetLength(); j++) {
+                Object itemvalue;
+                if (value.arrayGetNF(j, &itemvalue)->isRef())
+                  parentTree[idx][j].ref = itemvalue.getRef();
+                else
+                  error(errSyntaxError, -1, "Nums array item at position {0:d}/{1:d} is invalid type ({2:s})", i, j, itemvalue.getTypeName());
+                itemvalue.free();
+              }
+            } else {
+              error(errSyntaxError, -1, "Nums item at position {0:d} is wrong type ({1:s})", i + 1, value.getTypeName());
+            }
+          }
+
+          value.free();
+          index.free();
+        }
+      } else {
+        error(errSyntaxError, -1, "Nums array length is not a even ({0:d})", nums.arrayGetLength());
+      }
+    } else {
+      error(errSyntaxError, -1, "Nums object is wrong type ({0:s})", nums.getTypeName());
+    }
+    nums.free();
+  }
+  obj.free();
+
+  // Parse the children StructElements
+  const GBool marked = doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked;
+  Object kids;
+  if (root->lookup("K", &kids)->isArray()) {
+    if (marked && kids.arrayGetLength() > 1) {
+      error(errSyntaxWarning, -1, "K in StructTreeRoot has more than one children in a tagged PDF");
+    }
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj, ref;
+      kids.arrayGetNF(i, &ref);
+      if (kids.arrayGet(i, &obj)->isDict()) {
+        StructElement *child = new StructElement(obj.getDict(), this);
+        if (child->isOk()) {
+          if (marked && !(child->getType() == StructElement::Document ||
+                          child->getType() == StructElement::Part ||
+                          child->getType() == StructElement::Art ||
+                          child->getType() == StructElement::Div)) {
+            error(errSyntaxWarning, -1, "StructTreeRoot element of tagged PDF is wrong type ({0:s})", child->getTypeName());
+          }
+          appendElement(child);
+          if (ref.isRef()) {
+            parentTreeAdd(ref.getRef(), child);
+          }
+        } else {
+          error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+          delete child;
+        }
+      } else {
+        error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", obj.getTypeName());
+      }
+      obj.free();
+      ref.free();
+    }
+  } else if (kids.isDict()) {
+    if (marked) {
+      error(errSyntaxWarning, -1, "K has a child of wrong type for a tagged PDF ({0:s})", kids.getTypeName());
+    }
+    StructElement *child = new StructElement(kids.getDict(), this);
+    if (child->isOk()) {
+      appendElement(child);
+      Object ref;
+      if (root->lookupNF("K", &ref)->isRef())
+        parentTreeAdd(ref.getRef(), child);
+      ref.free();
+    } else {
+      error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+      delete child;
+    }
+  } else if (!kids.isNull()) {
+    error(errSyntaxWarning, -1, "K in StructTreeRoot is wrong type ({0:s})", kids.getTypeName());
+  }
+
+  kids.free();
+}
+
+// Attaches element to every ParentTree slot whose reference equals
+// objectRef.
+void StructTreeRoot::parentTreeAdd(const Ref &objectRef, StructElement *element)
+{
+  for (std::vector< std::vector<Parent> >::iterator i = parentTree.begin(); i != parentTree.end(); ++i) {
+    for (std::vector<Parent>::iterator j = i->begin(); j != i->end(); ++j) {
+      if (j->ref.num == objectRef.num && j->ref.gen == objectRef.gen)
+        j->element = element;
+    }
+  }
+}
diff --git a/poppler/StructTreeRoot.h b/poppler/StructTreeRoot.h
new file mode 100644
index 0000000..9928e2f
--- /dev/null
+++ b/poppler/StructTreeRoot.h
@@ -0,0 +1,98 @@
+//========================================================================
+//
+// StructTreeRoot.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef STRUCTTREEROOT_H
+#define STRUCTTREEROOT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "Object.h"
+#include "StructElement.h"
+#include <vector>
+
+class Dict;
+class PDFDoc;
+
+
+// In-memory form of the StructTreeRoot dictionary: the top-level
+// structure elements plus the references read from its ParentTree.
+class StructTreeRoot
+{
+public:
+  StructTreeRoot(PDFDoc *docA, Dict *rootDict);
+  ~StructTreeRoot();
+
+  PDFDoc *getDoc() { return doc; }
+  // The RoleMap/ClassMap accessors return NULL when the entry is not a
+  // dictionary.
+  Dict *getRoleMap() { return roleMap.isDict() ? roleMap.getDict() : NULL; }
+  Dict *getClassMap() { return classMap.isDict() ? classMap.getDict() : NULL; }
+  unsigned getNumElements() const { return elements.size(); }
+  // i is range-checked by std::vector::at().
+  const StructElement *getElement(int i) const { return elements.at(i); }
+  StructElement *getElement(int i) { return elements.at(i); }
+
+  // Stores element only when it is non-NULL and isOk(). Stored elements
+  // are deleted by the destructor.
+  void appendElement(StructElement *element) {
+    if (element && element->isOk()) {
+      elements.push_back(element);
+    }
+  }
+
+  // Returns the element attached to ParentTree slot index. The result
+  // is NULL when index is out of range, when the slot does not hold
+  // exactly one reference, or when no element was attached to it.
+  const StructElement *findParentElement(unsigned index) const {
+    if (index < parentTree.size() && parentTree[index].size() == 1) {
+      return parentTree[index][0].element;
+    }
+    return NULL;
+  }
+
+private:
+  typedef std::vector<StructElement*> ElemPtrArray;
+
+  // Structure for items in /ParentTree, it keeps a mapping of
+  // object references and pointers to StructElement objects.
+  struct Parent {
+    Ref ref;
+    StructElement *element;
+
+    Parent(): element(NULL) { ref.num = ref.gen = -1; }
+    Parent(const Parent &p): element(p.element) {
+      ref.num = p.ref.num;
+      ref.gen = p.ref.gen;
+    }
+    ~Parent() {}
+  };
+
+  PDFDoc *doc;
+  Object roleMap;
+  Object classMap;
+  ElemPtrArray elements;
+  std::vector< std::vector<Parent> > parentTree;
+
+  // Copying is disabled (declared, never defined): a copy would delete
+  // the same elements a second time in its destructor.
+  StructTreeRoot(const StructTreeRoot &);
+  StructTreeRoot &operator=(const StructTreeRoot &);
+
+  void parse(Dict *rootDict);
+  void parentTreeAdd(const Ref &objectRef, StructElement *element);
+
+  friend class StructElement;
+};
+
+#endif
+
diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index 14e4f6c..f297614 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -225,8 +225,12 @@ int main(int argc, char *argv[]) { info.free(); // print tagging info - printf("Tagged: %s\n", - doc->getStructTreeRoot()->isDict() ? "yes" : "no"); + printf("Tagged: %s\n", + (doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked) ? "yes" : "no"); + printf("UserProperties: %s\n", + (doc->getCatalog()->getMarkInfo() & Catalog::markInfoUserProperties) ? "yes" : "no"); + printf("Suspects: %s\n", + (doc->getCatalog()->getMarkInfo() & Catalog::markInfoSuspects) ? "yes" : "no"); // print form info switch (doc->getCatalog()->getFormType()) -- 1.8.3.2