From 0ec20c4716351c1c3155b38469b5313a076d47d0 Mon Sep 17 00:00:00 2001
From: Adrian Perez de Castro
Date: Mon, 17 Jun 2013 23:20:04 +0300
Subject: [PATCH v8 03/15] Tagged-PDF: Implement parsing of StructElem objects

Implement parsing of StructElem tree nodes from the document structure
tree; each object is parsed as a StructElement instance. Attributes and
extraction of content from elements are not yet handled.
---
 poppler/Makefile.am       |   1 +
 poppler/StructElement.cc  | 322 ++++++++++++++++++++++++++++++++++++++++++++++
 poppler/StructElement.h   | 121 ++++++++++++++++-
 poppler/StructTreeRoot.cc |  11 +-
 4 files changed, 447 insertions(+), 8 deletions(-)
 create mode 100644 poppler/StructElement.cc

diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 128589a..9f90c9d 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -297,6 +297,7 @@ libpoppler_la_SOURCES = \
 	StdinPDFDocBuilder.cc \
 	Stream.cc \
 	StructTreeRoot.cc \
+	StructElement.cc \
 	strtok_r.cpp \
 	UnicodeMap.cc \
 	UnicodeTypeTable.cc \
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
new file mode 100644
index 0000000..e403457
--- /dev/null
+++ b/poppler/StructElement.cc
@@ -0,0 +1,322 @@
+//========================================================================
+//
+// StructElement.cc
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include "StructElement.h"
+#include "StructTreeRoot.h"
+#include "PDFDoc.h"
+#include "Dict.h"
+
+#include <assert.h>
+
+class GfxState;
+
+
+static const char *typeToName(StructElement::Type type)
+{
+  if (type == StructElement::MCID)
+    return "MarkedContent";
+  if (type == StructElement::OBJR)
+    return "ObjectReference";
+
+  return "Unknown";
+}
+
+
+//------------------------------------------------------------------------
+// StructElement
+//------------------------------------------------------------------------
+
+StructElement::StructData::StructData():
+  altText(0),
+  actualText(0),
+  id(0),
+  title(0),
+  expandedAbbr(0),
+  language(0),
+  revision(0)
+{
+}
+
+StructElement::StructData::~StructData()
+{
+  delete altText;
+  delete actualText;
+  delete id;
+  delete title;
+  delete expandedAbbr;
+  delete language;
+  parentRef.free();
+  for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i) delete *i;
+}
+
+StructElement::StructElement(Dict *element,
+                             StructTreeRoot *treeRootA,
+                             StructElement *parentA,
+                             std::set<int> &seen):
+  type(Unknown),
+  treeRoot(treeRootA),
+  parent(parentA),
+  s(new StructData())
+{
+  assert(treeRoot);
+  assert(element);
+
+  parse(element);
+  parseChildren(element, seen);
+}
+
+StructElement::StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(MCID),
+  treeRoot(treeRootA),
+  parent(parentA),
+  c(new ContentData(mcid))
+{
+  assert(treeRoot);
+  assert(parent);
+}
+
+StructElement::StructElement(const Ref& ref, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(OBJR),
+  treeRoot(treeRootA),
+  parent(parentA),
+  c(new ContentData(ref))
+{
+  assert(treeRoot);
+  assert(parent);
+}
+
+StructElement::~StructElement()
+{
+  if (type == MCID || type == OBJR) // not isContent(): an OBJR with a -1 ref still owns "c"
+    delete c;
+  else
+    delete s;
+  pageRef.free();
+}
+
+GBool StructElement::hasPageRef() const
+{
+  return pageRef.isRef() || (parent && parent->hasPageRef());
+}
+
+GBool StructElement::getPageRef(Ref& ref) const
+{
+  if (pageRef.isRef()) {
+    ref = pageRef.getRef();
+    return gTrue;
+  }
+
+  if (parent)
+    return parent->getPageRef(ref);
+
+  return gFalse;
+}
+
+const char* StructElement::getTypeName() const
+{
+  return typeToName(type);
+}
+
+static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
+{
+  // TODO Replace this dummy implementation
+  return StructElement::Unknown;
+}
+
+void StructElement::parse(Dict *element)
+{
+  Object obj;
+
+  // Type is optional, but if present must be StructElem
+  if (!element->lookup("Type", &obj)->isNull() && !obj.isName("StructElem")) {
+    error(errSyntaxError, -1, "Type of StructElem object is wrong");
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Parent object reference (required).
+  if (!element->lookupNF("P", &s->parentRef)->isRef()) {
+    error(errSyntaxError, -1, "P object is wrong type ({0:s})", s->parentRef.getTypeName());
+    return;
+  }
+
+  // Check whether the S-type is valid for the top level
+  // element and create a node of the appropriate type.
+  if (!element->lookup("S", &obj)->isName()) {
+    error(errSyntaxError, -1, "S object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return;
+  }
+
+  // Type name may not be standard, resolve through RoleMap first.
+  if (treeRoot->getRoleMap()) {
+    Object resolvedName;
+    type = roleMapResolve(treeRoot->getRoleMap(), obj.getName(), NULL, &resolvedName);
+  }
+
+  obj.free();
+
+  // Object ID (optional), to be looked at the IDTree in the tree root.
+  if (element->lookup("ID", &obj)->isString()) {
+    s->id = obj.takeString();
+  }
+  obj.free();
+
+  // Page reference (optional) in which at least one of the child items
+  // is to be rendered in. Note: each element stores only the /Pg value
+  // contained by it, and StructElement::getPageRef() may look in parent
+  // elements to find the page where an element belongs.
+  element->lookupNF("Pg", &pageRef);
+
+  // Revision number (optional).
+  if (element->lookup("R", &obj)->isInt()) {
+    s->revision = obj.getInt();
+  }
+  obj.free();
+
+  // Element title (optional).
+  if (element->lookup("T", &obj)->isString()) {
+    s->title = obj.takeString();
+  }
+  obj.free();
+
+  // Language (optional).
+  if (element->lookup("Lang", &obj)->isString()) {
+    s->language = obj.takeString();
+  }
+  obj.free();
+
+  // Alternative text (optional).
+  if (element->lookup("Alt", &obj)->isString()) {
+    s->altText = obj.takeString();
+  }
+  obj.free();
+
+  // Expanded form of an abbreviation (optional).
+  if (element->lookup("E", &obj)->isString()) {
+    s->expandedAbbr = obj.takeString();
+  }
+  obj.free();
+
+  // Actual text (optional).
+  if (element->lookup("ActualText", &obj)->isString()) {
+    s->actualText = obj.takeString();
+  }
+  obj.free();
+
+  // TODO: Attributes directly attached to the element (optional).
+  // TODO: Attributes referenced indirectly through the ClassMap (optional).
+}
+
+StructElement *StructElement::parseChild(Object *ref,
+                                         Object *childObj,
+                                         std::set<int> &seen)
+{
+  assert(childObj);
+  assert(ref);
+
+  StructElement *child = NULL;
+
+  if (childObj->isInt()) {
+    child = new StructElement(childObj->getInt(), treeRoot, this);
+  } else if (childObj->isDict("MCR")) {
+    /*
+     * TODO: The optional Stm/StmOwn attributes are not handled, so all the
+     *      page will be always scanned when calling StructElement::getText().
+     */
+    Object mcidObj;
+    Object pageRefObj;
+
+    if (!childObj->dictLookup("MCID", &mcidObj)->isInt()) {
+      error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
+      mcidObj.free();
+      return NULL;
+    }
+
+    child = new StructElement(mcidObj.getInt(), treeRoot, this);
+    mcidObj.free();
+
+    if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) {
+      child->pageRef = pageRefObj;
+    } else {
+      pageRefObj.free();
+    }
+  } else if (childObj->isDict("OBJR")) {
+    Object refObj;
+
+    if (childObj->dictLookupNF("Obj", &refObj)->isRef()) {
+      Object pageRefObj;
+
+      child = new StructElement(refObj.getRef(), treeRoot, this);
+
+      if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) {
+        child->pageRef = pageRefObj;
+      } else {
+        pageRefObj.free();
+      }
+    } else {
+      error(errSyntaxError, -1, "Obj object is wrong type ({0:s})", refObj.getTypeName());
+    }
+    refObj.free();
+  } else if (childObj->isDict()) {
+    if (!ref->isRef()) {
+      error(errSyntaxError, -1,
+            "Structure element dictionary is not an indirect reference ({0:s})",
+            ref->getTypeName());
+    } else if (seen.find(ref->getRefNum()) == seen.end()) {
+      seen.insert(ref->getRefNum());
+      child = new StructElement(childObj->getDict(), treeRoot, this, seen);
+    } else {
+      error(errSyntaxWarning, -1,
+            "Loop detected in structure tree, skipping subtree at object {0:d}:{1:d}",
+            ref->getRefNum(), ref->getRefGen());
+    }
+  } else {
+    error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", childObj->getTypeName());
+  }
+
+  if (child) {
+    if (child->isOk()) {
+      appendElement(child);
+      if (ref->isRef())
+        treeRoot->parentTreeAdd(ref->getRef(), child);
+    } else {
+      delete child;
+      child = NULL;
+    }
+  }
+
+  return child;
+}
+
+void StructElement::parseChildren(Dict *element, std::set<int> &seen)
+{
+  Object kids;
+
+  if (element->lookup("K", &kids)->isArray()) {
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj, ref;
+      parseChild(kids.arrayGetNF(i, &ref), kids.arrayGet(i, &obj), seen);
+      obj.free();
+      ref.free();
+    }
+  } else if (kids.isDict() || kids.isInt()) {
+    Object ref;
+    parseChild(element->lookupNF("K", &ref), &kids, seen);
+    ref.free();
+  }
+
+  kids.free();
+}
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 79151a3..d1997c9 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -19,35 +19,146 @@
 #include "goo/GooString.h"
 #include "Object.h"
 #include <vector>
+#include <set>
 
 class GooString;
 class Dict;
 class StructTreeRoot;
 
 
-// TODO: Dummy class to make it possible to build StructTreeRoot.cc
 class StructElement {
 public:
   enum Type {
     Unknown = 0,
+    MCID,                                   // MCID reference, used internally
+    OBJR,                                   // Object reference, used internally
+    Document, Part, Art, Sect, Div,         // Structural elements
+
+    Span, Quote, Note, Reference, BibEntry, // Inline elements
+    Code, Link, Annot,
+    BlockQuote, Caption, NonStruct,
+    TOC, TOCI, Index, Private,
+
+    P, H, H1, H2, H3, H4, H5, H6,           // Paragraph-like
+
+    L, LI, Lbl,                             // List elements
+
+    Table, TR, TH, TD, THead, TFoot, TBody, // Table elements
+
+    Ruby, RB, RT, RP,                       // Ruby text elements
+    Warichu, WT, WP,
+
+    Figure, Formula, Form,                  // Illustration-like elements
   };
 
-  const char* getTypeName() const { return "dummy"; }
+  static const Ref InvalidRef;
+
+  const char *getTypeName() const;
   Type getType() const { return type; }
   GBool isOk() const { return type != Unknown; }
-  StructTreeRoot* getStructTreeRoot() { return treeRoot; }
 
-  ~StructElement() {}
+  inline GBool isContent() const { return (type == MCID) || isObjectRef(); }
+  inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); }
+
+  int getMCID() const { return c->mcid; }
+  Ref getObjectRef() const { return c->ref; }
+  Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef.getRef(); }
+  GBool hasPageRef() const;
+  GBool getPageRef(Ref& ref) const;
+  StructTreeRoot *getStructTreeRoot() { return treeRoot; }
+
+  // Optional element identifier.
+  const GooString *getID() const { return isContent() ? NULL : s->id; }
+  GooString *getID() { return isContent() ? NULL : s->id; }
+
+  // Optional ISO language name, e.g. en_US
+  GooString *getLanguage() {
+    if (!isContent() && s->language) return s->language;
+    return parent ? parent->getLanguage() : NULL;
+  }
+  const GooString *getLanguage() const {
+    if (!isContent() && s->language) return s->language;
+    return parent ? parent->getLanguage() : NULL;
+  }
+
+  // Optional revision number, defaults to zero.
+  Guint getRevision() const { return isContent() ? 0 : s->revision; }
+  void setRevision(Guint revision) { if (!isContent()) s->revision = revision; }
+
+  // Optional element title, in human-readable form.
+  const GooString *getTitle() const { return isContent() ? NULL : s->title; }
+  GooString *getTitle() { return isContent() ? NULL : s->title; }
+
+  // Optional element expanded abbreviation text.
+  const GooString *getExpandedAbbr() const { return isContent() ? NULL : s->expandedAbbr; }
+  GooString *getExpandedAbbr() { return isContent() ? NULL : s->expandedAbbr; }
+
+  unsigned getNumElements() const { return isContent() ? 0 : s->elements.size(); }
+  const StructElement *getElement(int i) const { return isContent() ? NULL : s->elements.at(i); }
+  StructElement *getElement(int i) { return isContent() ? NULL : s->elements.at(i); }
+
+  void appendElement(StructElement *element) {
+    if (!isContent() && element && element->isOk()) {
+      s->elements.push_back(element);
+    }
+  }
+
+  const GooString *getAltText() const { return isContent() ? NULL : s->altText; }
+  GooString *getAltText() { return isContent() ? NULL : s->altText; }
+
+  const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
+  GooString *getActualText() { return isContent() ? NULL : s->actualText; }
+
+  ~StructElement();
 
 private:
+  typedef std::vector<StructElement*> ElemPtrArray;
+
+  struct StructData {
+    Object       parentRef;
+    GooString   *altText;
+    GooString   *actualText;
+    GooString   *id;
+    GooString   *title;
+    GooString   *expandedAbbr;
+    GooString   *language;
+    Guint        revision;
+    ElemPtrArray elements;
+
+    StructData();
+    ~StructData();
+  };
+
+  // Data in content elements (MCID, OBJR)
+  struct ContentData {
+    union {
+      int mcid;
+      Ref ref;
+    };
+
+    ContentData(int mcidA): mcid(mcidA) {}
+    ContentData(const Ref& r) { ref.num = r.num; ref.gen = r.gen; }
+  };
 
 
   // Common data
   Type type;
   StructTreeRoot *treeRoot;
   StructElement *parent;
+  mutable Object pageRef;
 
-  StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA = 0) {}
+  union {
+    StructData  *s;
+    ContentData *c;
+  };
+
+  StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, std::set<int> &seen);
+  StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA);
+  StructElement(const Ref &ref, StructTreeRoot *treeRootA, StructElement *parentA);
+
+  void parse(Dict* elementDict);
+  StructElement* parseChild(Object *ref, Object* childObj, std::set<int> &seen);
+  void parseChildren(Dict* element, std::set<int> &seen);
 
   friend class StructTreeRoot;
 };
diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc
index 730f944..59f017e 100644
--- a/poppler/StructTreeRoot.cc
+++ b/poppler/StructTreeRoot.cc
@@ -18,7 +18,7 @@
 #include "PDFDoc.h"
 #include "Object.h"
 #include "Dict.h"
-
+#include <set>
 #include <assert.h>
 
 
@@ -103,6 +103,8 @@ void StructTreeRoot::parse(Dict *root)
   }
   obj.free();
 
+  std::set<int> seenElements;
+
   // Parse the children StructElements
   const GBool marked = doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked;
   Object kids;
@@ -113,8 +115,11 @@ void StructTreeRoot::parse(Dict *root)
     for (int i = 0; i < kids.arrayGetLength(); i++) {
       Object obj, ref;
       kids.arrayGetNF(i, &ref);
+      if (ref.isRef()) {
+        seenElements.insert(ref.getRefNum());
+      }
       if (kids.arrayGet(i, &obj)->isDict()) {
-        StructElement *child = new StructElement(obj.getDict(), this);
+        StructElement *child = new StructElement(obj.getDict(), this, NULL, seenElements);
         if (child->isOk()) {
           if (marked && !(child->getType() == StructElement::Document ||
                           child->getType() == StructElement::Part ||
@@ -140,7 +145,7 @@ void StructTreeRoot::parse(Dict *root)
     if (marked) {
       error(errSyntaxWarning, -1, "K has a child of wrong type for a tagged PDF ({0:s})", kids.getTypeName());
     }
-    StructElement *child = new StructElement(kids.getDict(), this);
+    StructElement *child = new StructElement(kids.getDict(), this, NULL, seenElements);
     if (child->isOk()) {
       appendElement(child);
       Object ref;
-- 
1.8.4