From e7192c2f629cc2407ef2c9828893251387adbec1 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Mon, 17 Jun 2013 23:20:04 +0300 Subject: [PATCH v5 03/10] Tagged-PDF: Implement parsing of StructElem objects Implement parsing of StructElem tree nodes from the document structure tree; each object is parsed as a StructElement instance. Attributes and extraction of content from elements are not yet handled. --- poppler/Makefile.am | 1 + poppler/StructElement.cc | 466 +++++++++++++++++++++++++++++++++++++++++++++++ poppler/StructElement.h | 145 ++++++++++++++- 3 files changed, 608 insertions(+), 4 deletions(-) create mode 100644 poppler/StructElement.cc diff --git a/poppler/Makefile.am b/poppler/Makefile.am index e6ecb12..9be0811 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -318,6 +318,7 @@ libpoppler_la_SOURCES = \ StdinPDFDocBuilder.cc \ Stream.cc \ StructTreeRoot.cc \ + StructElement.cc \ strtok_r.cpp \ UnicodeMap.cc \ UnicodeTypeTable.cc \ diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc new file mode 100644 index 0000000..589767b --- /dev/null +++ b/poppler/StructElement.cc @@ -0,0 +1,466 @@ +//======================================================================== +// +// StructElement.cc +// +// This file is licensed under the GPLv2 or later +// +// Copyright 2013 Igalia S.L. +// +//======================================================================== + +#ifdef USE_GCC_PRAGMAS +#pragma interface +#endif + +#include "StructElement.h" +#include "StructTreeRoot.h" +#include "PDFDoc.h" +#include "Dict.h" + +#include <assert.h> + +class GfxState; + + +// Maps element types to their names and also serves as lookup table +// for additional element type attributes.
+
+// Layout category of a standard structure type. StructElement::isBlock()
+// and StructElement::isInline() test for elementTypeBlock and
+// elementTypeInline respectively; every other category makes both of
+// them return gFalse.
+enum ElementType {
+  elementTypeUndefined,
+  elementTypeInline,
+  elementTypeBlock,
+  elementTypeGrouping,
+};
+
+// One row per standard structure type: the enumerator, the name used in
+// the PDF file, and the layout category of the type.
+//
+// Grouping types (Document ... Private, TOC, TOCI) only organize other
+// elements; they are neither block-level nor inline-level, so they must
+// not be tagged as elementTypeInline.
+static const struct TypeMapEntry {
+  StructElement::Type type;
+  const char* name;
+  ElementType elementType;
+} typeMap[] = {
+  { StructElement::Document,   "Document",   elementTypeGrouping  },
+  { StructElement::Part,       "Part",       elementTypeGrouping  },
+  { StructElement::Art,        "Art",        elementTypeGrouping  },
+  { StructElement::Sect,       "Sect",       elementTypeGrouping  },
+  { StructElement::Div,        "Div",        elementTypeGrouping  },
+  { StructElement::BlockQuote, "BlockQuote", elementTypeGrouping  },
+  { StructElement::Caption,    "Caption",    elementTypeGrouping  },
+  { StructElement::NonStruct,  "NonStruct",  elementTypeGrouping  },
+  { StructElement::Index,      "Index",      elementTypeGrouping  },
+  { StructElement::Private,    "Private",    elementTypeGrouping  },
+  { StructElement::Span,       "Span",       elementTypeInline    },
+  { StructElement::Quote,      "Quote",      elementTypeInline    },
+  { StructElement::Note,       "Note",       elementTypeInline    },
+  { StructElement::Reference,  "Reference",  elementTypeInline    },
+  { StructElement::BibEntry,   "BibEntry",   elementTypeInline    },
+  { StructElement::Code,       "Code",       elementTypeInline    },
+  { StructElement::Link,       "Link",       elementTypeInline    },
+  { StructElement::Annot,      "Annot",      elementTypeInline    },
+  { StructElement::Ruby,       "Ruby",       elementTypeInline    },
+  { StructElement::RB,         "RB",         elementTypeUndefined },
+  { StructElement::RT,         "RT",         elementTypeUndefined },
+  { StructElement::RP,         "RP",         elementTypeUndefined },
+  { StructElement::Warichu,    "Warichu",    elementTypeInline    },
+  { StructElement::WT,         "WT",         elementTypeUndefined },
+  { StructElement::WP,         "WP",         elementTypeUndefined },
+  { StructElement::P,          "P",          elementTypeBlock     },
+  { StructElement::H,          "H",          elementTypeBlock     },
+  { StructElement::H1,         "H1",         elementTypeBlock     },
+  { StructElement::H2,         "H2",         elementTypeBlock     },
+  { StructElement::H3,         "H3",         elementTypeBlock     },
+  { StructElement::H4,         "H4",         elementTypeBlock     },
+  { StructElement::H5,         "H5",         elementTypeBlock     },
+  { StructElement::H6,         "H6",         elementTypeBlock     },
+  { StructElement::L,          "L",          elementTypeBlock     },
+  { StructElement::LI,         "LI",         elementTypeBlock     },
+  { StructElement::Lbl,        "Lbl",        elementTypeBlock     },
+  { StructElement::Table,      "Table",      elementTypeBlock     },
+  { StructElement::TR,         "TR",         elementTypeUndefined },
+  { StructElement::TH,         "TH",         elementTypeUndefined },
+  { StructElement::TD,         "TD",         elementTypeUndefined },
+  { StructElement::THead,      "THead",      elementTypeUndefined },
+  { StructElement::TFoot,      "TFoot",      elementTypeUndefined },
+  { StructElement::TBody,      "TBody",      elementTypeUndefined },
+  { StructElement::Figure,     "Figure",     elementTypeUndefined },
+  { StructElement::Formula,    "Formula",    elementTypeUndefined },
+  { StructElement::Form,       "Form",       elementTypeUndefined },
+  { StructElement::TOC,        "TOC",        elementTypeGrouping  },
+  { StructElement::TOCI,       "TOCI",       elementTypeGrouping  },
+};
+
+
+// Finds the table row for a type; NULL when the type has no row (this is
+// the case for Unknown, MCID and OBJR).
+static inline const TypeMapEntry *getTypeMapEntry(StructElement::Type type)
+{
+  for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) {
+    if (type == typeMap[i].type)
+      return &typeMap[i];
+  }
+  return NULL;
+}
+
+// Finds the table row for a type name (exact, case-sensitive match);
+// NULL when the name is NULL or is not a standard type name.
+static inline const TypeMapEntry *getTypeMapEntry(const char *name)
+{
+  if (!name)
+    return NULL;
+  for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) {
+    if (strcmp(name, typeMap[i].name) == 0)
+      return &typeMap[i];
+  }
+  return NULL;
+}
+
+// Returns a printable name for a type. The two internal content types
+// have fixed names; anything without a table row is reported as "Unknown".
+static const char *typeToName(StructElement::Type type)
+{
+  if (type == StructElement::MCID)
+    return "MarkedContent";
+  if (type == StructElement::OBJR)
+    return "ObjectReference";
+
+  const TypeMapEntry *entry = getTypeMapEntry(type);
+  return entry ? entry->name : "Unknown";
+}
+
+// Maps a standard type name to its enumerator, or Unknown when the name
+// is not in the table.
+static StructElement::Type nameToType(const char *name)
+{
+  const TypeMapEntry *entry = getTypeMapEntry(name);
+  return entry ?
entry->type : StructElement::Unknown;
+}
+
+
+//------------------------------------------------------------------------
+// StructElement
+//------------------------------------------------------------------------
+
+const Ref StructElement::InvalidRef = { -1, -1 };
+
+
+// Starts with no strings, revision zero and no children.
+StructElement::StructData::StructData():
+  parentRef(),
+  altText(0),
+  actualText(0),
+  id(0),
+  title(0),
+  expandedAbbr(0),
+  language(0),
+  revision(0),
+  elements()
+{
+}
+
+// Releases every owned string, the parent reference and all child
+// elements. expandedAbbr is owned like the other strings (parse() stores
+// the string taken from the /E entry in it), so it is deleted here too.
+StructElement::StructData::~StructData()
+{
+  delete altText;
+  delete actualText;
+  delete id;
+  delete title;
+  delete expandedAbbr;
+  delete language;
+  parentRef.free();
+  for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i) delete *i;
+}
+
+
+// Builds a structure node from a StructElem dictionary. The type stays
+// Unknown (so isOk() is false) when parse() rejects the dictionary.
+StructElement::StructElement(Dict *element, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(Unknown),
+  treeRoot(treeRootA),
+  parent(parentA),
+  pageRef(),
+  s(new StructData())
+{
+  assert(treeRoot);
+  assert(element);
+  parse(element);
+}
+
+// Builds a content node which refers to a marked-content sequence by its
+// identifier.
+StructElement::StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(MCID),
+  treeRoot(treeRootA),
+  parent(parentA),
+  pageRef(),
+  c(new ContentData(mcid))
+{
+  assert(treeRoot);
+  assert(parent);
+  assert(c->mcid != InvalidMCID);
+}
+
+// Builds a content node which refers to a PDF object by reference.
+StructElement::StructElement(const Ref& ref, StructTreeRoot *treeRootA, StructElement *parentA):
+  type(OBJR),
+  treeRoot(treeRootA),
+  parent(parentA),
+  pageRef(),
+  c(new ContentData(ref))
+{
+  assert(treeRoot);
+  assert(parent);
+  assert(c->ref.num >= 0);
+  assert(c->ref.gen >= 0);
+}
+
+StructElement::~StructElement()
+{
+  // Pick the union member from the type tag, not from isContent():
+  // isContent() is also false for an MCID/OBJR node which holds an
+  // invalid value, and such a node still owns a ContentData. Deleting it
+  // through "s" would run the StructData destructor on the wrong object.
+  if (type == MCID || type == OBJR)
+    delete c;
+  else
+    delete s;
+  pageRef.free();
+}
+
+// True when the type of the element is tagged as block-level in typeMap.
+GBool StructElement::isBlock() const
+{
+  const TypeMapEntry *entry = getTypeMapEntry(type);
+  return entry ? (entry->elementType == elementTypeBlock) : gFalse;
+}
+
+// True when the type of the element is tagged as inline-level in typeMap.
+GBool StructElement::isInline() const
+{
+  const TypeMapEntry *entry = getTypeMapEntry(type);
+  return entry ?
(entry->elementType == elementTypeInline) : gFalse;
+}
+
+// Returns whether this element or any of its ancestors carries a page
+// reference.
+GBool StructElement::hasPageRef() const
+{
+  return pageRef.isRef() || (parent && parent->hasPageRef());
+}
+
+// Returns the page reference stored in this element or, failing that,
+// the one found in the closest ancestor which has it. InvalidRef is
+// returned when no element up to the root has a page reference.
+Ref StructElement::getPageRef() const
+{
+  if (pageRef.isRef())
+    return pageRef.getRef();
+
+  if (parent)
+    return parent->getPageRef();
+
+  return InvalidRef;
+}
+
+const char* StructElement::getTypeName() const
+{
+  return typeToName(type);
+}
+
+GooString* StructElement::getText(GooString *string, GBool recursive) const
+{
+  // TODO: Dummy implementation, complete
+  return NULL;
+}
+
+// Follows RoleMap entries starting at "name" (or at "curName" when it is
+// not NULL) until a standard structure type is reached.
+//
+// Returns the standard type, or Unknown when the chain ends in a missing
+// entry, an entry which is not a name, or a loop. The first looked-up
+// entry is left in "resolved", which the caller has to free; entries
+// fetched for later hops are freed here.
+static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
+{
+  // Circular reference
+  if (curName && !strcmp(name, curName))
+    return StructElement::Unknown;
+
+  if (!roleMap->lookup(curName ? curName : name, resolved)->isName()) {
+    if (!resolved->isNull())
+      error(errSyntaxWarning, -1, "RoleMap entry is wrong type ({0:s})", resolved->getTypeName());
+    return StructElement::Unknown;
+  }
+
+  const char *key = resolved->getName();
+  StructElement::Type type = nameToType(key);
+
+  // Two alternating scratch objects: the one being overwritten never holds
+  // the key used for the current lookup.
+  Object hops[2];
+  int slot = 0;
+
+  // A chain without loops visits each RoleMap key at most once, so the
+  // number of entries bounds the number of hops. This also stops on loops
+  // which do not pass through the starting name (A -> B -> C -> B).
+  for (int remaining = roleMap->getLength(); type == StructElement::Unknown && remaining > 0; remaining--) {
+    // Circular reference back to the starting name
+    if (!strcmp(name, key))
+      break;
+
+    Object *next = &hops[slot];
+    next->free();
+    if (!roleMap->lookup(key, next)->isName()) {
+      if (!next->isNull())
+        error(errSyntaxWarning, -1, "RoleMap entry is wrong type ({0:s})", next->getTypeName());
+      break;
+    }
+
+    key = next->getName();
+    type = nameToType(key);
+    slot ^= 1;
+  }
+
+  hops[0].free();
+  hops[1].free();
+  return type;
+}
+
+// Fills in the element from a StructElem dictionary and then parses its
+// children. On a fatal error (wrong /Type, missing /P, missing or
+// unresolvable /S) the function returns early and the type stays
+// Unknown, which makes isOk() return false. Wrongly typed optional
+// entries only produce a warning.
+void StructElement::parse(Dict *element)
+{
+  Object obj;
+
+  // Type is optional, but if present must be StructElem
+  if (!element->lookup("Type", &obj)->isNull() && !obj.isName("StructElem")) {
+    error(errSyntaxError, -1, "Type of StructElem object is wrong");
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Parent object reference (required). The value is stored directly in
+  // s->parentRef, so that is the object to report on failure ("obj" has
+  // already been freed at this point).
+  if (!element->lookupNF("P", &s->parentRef)->isRef()) {
+    error(errSyntaxError, -1, "P object is wrong type ({0:s})", s->parentRef.getTypeName());
+    return;
+  }
+
+  // Check whether the S-type is valid for the top level
+  // element and create a node of the appropriate type.
+  if (!element->lookup("S", &obj)->isName()) {
+    error(errSyntaxError, -1, "S object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return;
+  }
+
+  // Type name may not be standard, resolve through RoleMap first.
+  if (treeRoot->getRoleMap()) {
+    Object resolvedName;
+    type = roleMapResolve(treeRoot->getRoleMap(), obj.getName(), NULL, &resolvedName);
+    resolvedName.free();
+  }
+
+  // Resolving through RoleMap may leave type as Unknown, e.g. for types
+  // which are not present in it, yet they are standard element types.
+  if (type == Unknown)
+    type = nameToType(obj.getName());
+
+  // At this point the type name must have been resolved.
+  if (type == Unknown) {
+    error(errSyntaxError, -1, "StructElem object is wrong type ({0:s})", obj.getName());
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Object ID (optional), to be looked at the IDTree in the tree root.
+  if (element->lookup("ID", &obj)->isString()) {
+    s->id = new GooString(obj.getString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "ID object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Page reference (optional) in which at least one of the child items
+  // is to be rendered in. Note: each element stores only the /Pg value
+  // contained by it, and StructElement::getPageRef() may look in parent
+  // elements to find the page where an element belongs.
+  if (!element->lookupNF("Pg", &pageRef)->isRef() && !pageRef.isNull()) {
+    error(errSyntaxWarning, -1, "Pg object is wrong type ({0:s})", pageRef.getTypeName());
+  }
+
+  // Revision number (optional).
+  if (element->lookup("R", &obj)->isInt()) {
+    s->revision = obj.getInt();
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "R object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Element title (optional).
+  if (element->lookup("T", &obj)->isString()) {
+    s->title = new GooString(obj.getString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "T object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Language (optional).
+  if (element->lookup("Lang", &obj)->isString()) {
+    s->language = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "Lang object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Alternative text (optional).
+  if (element->lookup("Alt", &obj)->isString()) {
+    s->altText = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "Alt object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Expanded form of an abbreviation (optional).
+  if (element->lookup("E", &obj)->isString()) {
+    s->expandedAbbr = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "E object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Actual text (optional).
+  if (element->lookup("ActualText", &obj)->isString()) {
+    s->actualText = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "ActualText object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // TODO: Attributes directly attached to the element (optional).
+  // TODO: Attributes referenced indirectly through the ClassMap (optional).
+
+  parseChildren(element);
+}
+
+// Creates the child element described by one item of the /K entry and
+// appends it to this element.
+//
+//  - ref:      the item as stored in the file (not fetched); when it is
+//              an indirect reference the new child is registered with it
+//              in the parent tree of the tree root.
+//  - childObj: the same item, fetched. It may be an integer MCID, a MCR
+//              dictionary, an OBJR dictionary or a StructElem dictionary.
+//
+// Returns the new child, or NULL when the item is malformed or the child
+// could not be parsed.
+StructElement *StructElement::parseChild(Object *ref, Object *childObj)
+{
+  assert(childObj);
+  StructElement *child = NULL;
+
+  if (childObj->isInt()) {
+    // Marked-content identifiers are non-negative, and -1 is reserved as
+    // InvalidMCID. The constructor only asserts on this, so reject bad
+    // values from the file here.
+    if (childObj->getInt() < 0) {
+      error(errSyntaxError, -1, "MCID is negative ({0:d})", childObj->getInt());
+      return NULL;
+    }
+    child = new StructElement(childObj->getInt(), treeRoot, this);
+  } else if (childObj->isDict("MCR")) {
+    /*
+     * TODO: The optional Stm/StmOwn attributes are not handled, so all the
+     *       page will be always scanned when calling StructElement::getText().
+     */
+    Object mcidObj;
+    Object pageRefObj;
+
+    if (!childObj->dictLookup("MCID", &mcidObj)->isInt()) {
+      error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
+      mcidObj.free();
+      return NULL;
+    }
+
+    if (mcidObj.getInt() < 0) {
+      error(errSyntaxError, -1, "MCID is negative ({0:d})", mcidObj.getInt());
+      mcidObj.free();
+      return NULL;
+    }
+
+    child = new StructElement(mcidObj.getInt(), treeRoot, this);
+    mcidObj.free();
+
+    if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) {
+      child->pageRef = pageRefObj;
+    } else {
+      pageRefObj.free();
+    }
+  } else if (childObj->isDict("OBJR")) {
+    Object refObj;
+
+    if (childObj->dictLookupNF("Obj", &refObj)->isRef()) {
+      Object pageRefObj;
+
+      child = new StructElement(refObj.getRef(), treeRoot, this);
+
+      if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) {
+        child->pageRef = pageRefObj;
+      } else {
+        pageRefObj.free();
+      }
+    } else {
+      error(errSyntaxError, -1, "Obj object is wrong type ({0:s})", refObj.getTypeName());
+    }
+    refObj.free();
+  } else if (childObj->isDict()) {
+    // NOTE(review): the constructor parses the whole subtree recursively
+    // and nothing visible here stops a /K item which points back to an
+    // ancestor; confirm whether such a loop is rejected elsewhere.
+    child = new StructElement(childObj->getDict(), treeRoot, this);
+  } else {
+    error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", childObj->getTypeName());
+  }
+
+  if (child) {
+    if (child->isOk()) {
+      appendElement(child);
+      if (ref->isRef())
+        treeRoot->parentTreeAdd(ref->getRef(), child);
+    } else {
+      delete child;
+      child = NULL;
+    }
+  }
+
+  return child;
+}
+
+// Parses the /K entry of the dictionary, which holds either one child
+// item or an array of them. A missing entry is not an error.
+void StructElement::parseChildren(Dict *element)
+{
+  Object kids;
+
+  if (element->lookup("K", &kids)->isArray()) {
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj, ref;
+      parseChild(kids.arrayGetNF(i, &ref), kids.arrayGet(i, &obj));
+
obj.free(); + ref.free(); + } + } else if (kids.isDict() || kids.isInt()) { + Object ref; + parseChild(element->lookupNF("K", &ref), &kids); + ref.free(); + } else if (!kids.isNull()) { + error(errSyntaxWarning, -1, "K in StructElement is wrong type ({0:s})", kids.getTypeName()); + } + + kids.free(); +} diff --git a/poppler/StructElement.h b/poppler/StructElement.h index 219b66a..499ed4c 100644 --- a/poppler/StructElement.h +++ b/poppler/StructElement.h @@ -25,29 +25,166 @@ class Dict; class StructTreeRoot; -// TODO: Dummy class to make it possible to build StructTreeRoot.cc class StructElement { public: enum Type { Unknown = 0, + MCID, // MCID reference, used internally + OBJR, // Object reference, used internally + Document, Part, Art, Sect, Div, // Structural elements + + Span, Quote, Note, Reference, BibEntry, // Inline elements + Code, Link, Annot, + BlockQuote, Caption, NonStruct, + TOC, TOCI, Index, Private, + + P, H, H1, H2, H3, H4, H5, H6, // Paragraph-like + + L, LI, Lbl, // List elements + + Table, TR, TH, TD, THead, TFoot, TBody, // Table elements + + Ruby, RB, RT, RP, // Ruby text elements + Warichu, WT, WP, + + Figure, Formula, Form, // Illustration-like elements }; - const char* getTypeName() const { return "dummy"; } + static const int InvalidMCID = -1; + static const Ref InvalidRef; + + const char *getTypeName() const; Type getType() const { return type; } GBool isOk() const { return type != Unknown; } - StructTreeRoot* getStructTreeRoot() { return treeRoot; } + GBool isBlock() const; + GBool isInline() const; + + inline GBool isContent() const { return (type == MCID && c->mcid != InvalidMCID) || isObjectRef(); } + inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); } + + int getMCID() const { return type == MCID ? c->mcid : InvalidMCID; } + Ref getObjectRef() const { return type == OBJR ? c->ref : InvalidRef; } + Ref getParentRef() { return isContent() ? 
parent->getParentRef() : s->parentRef.getRef(); } + GBool hasPageRef() const; + Ref getPageRef() const; + StructTreeRoot *getStructTreeRoot() { return treeRoot; } + + // Optional element identifier. + const GooString *getID() const { return isContent() ? NULL : s->id; } + GooString *getID() { return isContent() ? NULL : s->id; } + + // Optional ISO language name, e.g. en_US + GooString *getLang() + { return (!isContent() && s->language) ? s->language : (parent ? parent->getLang() : NULL); } + const GooString *getLang() const + { return (!isContent() && s->language) ? s->language : (parent ? parent->getLang() : NULL); } + + // Optional revision number, defaults to zero. + Guint getRevision() const { return isContent() ? 0 : s->revision; } + void setRevision(Guint revision) { if (isContent()) s->revision = revision; } + + // Optional element title, in human-readable form. + const GooString *getTitle() const { return isContent() ? NULL : s->title; } + GooString *getTitle() { return isContent() ? NULL : s->title; } + + // Optional element expanded abbreviation text. + const GooString *getExpandedAbbr() const { return isContent() ? NULL : s->expandedAbbr; } + GooString *getExpandedAbbr() { return isContent() ? NULL : s->expandedAbbr; } + + unsigned getNumElements() const { return isContent() ? 0 : s->elements.size(); } + const StructElement *getElement(int i) const { return isContent() ? NULL : s->elements.at(i); } + StructElement *getElement(int i) { return isContent() ? NULL : s->elements.at(i); } + + void appendElement(StructElement *element) + { if (!isContent() && element && element->isOk()) s->elements.push_back(element); } + +<<<<<<< HEAD + const GooString* getAltText() const { return isContent() ? NULL : s->altText; } + GooString* getAltText() { return isContent() ? NULL : s->altText; } +======= + unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); } + const Attribute *getAttribute(int i) const { return isContent() ? 
NULL : s->attributes.at(i); } + Attribute *getAttribute(int i) { return isContent() ? NULL : s->attributes.at(i); } + + void appendAttribute(Attribute *attribute) + { if (!isContent() && attribute) s->attributes.push_back(attribute); } + + const Attribute* findAttribute(Attribute::Type attributeType, GBool inherit = gFalse, + Attribute::Owner owner = Attribute::UnknownOwner) const; + + const GooString *getAltText() const { return isContent() ? NULL : s->altText; } + GooString *getAltText() { return isContent() ? NULL : s->altText; } +>>>>>>> 9932f3b... fixup! Tagged-PDF: Implement parsing of StructElem objects + + const GooString *getActualText() const { return isContent() ? NULL : s->actualText; } + GooString *getActualText() { return isContent() ? NULL : s->actualText; } + + // Content text referenced by the element: + // + // - For MCID reference elements, this is just the text of the + // corresponding marked content object in the page stream, regardless + // of the setting of the "recursive" flag. + // - For other elements, if the "recursive" flag is set, the text + // enclosed by *all* the child MCID reference elements of the subtree + // is returned. The text is assembled by traversing the leaf MCID + // reference elements in logical order. + // - In any other case, the function returns NULL. + // + // The text will be appended to the passed GooString. If NULL is passed, + // a new string is returned, and the ownership passed to the caller. 
+ // + GooString* getText(GooString *string = NULL, GBool recursive = gTrue) const; ~StructElement(); private: + typedef std::vector ElemPtrArray; + + struct StructData { + Object parentRef; + GooString *altText; + GooString *actualText; + GooString *id; + GooString *title; + GooString *expandedAbbr; + GooString *language; + Guint revision; + ElemPtrArray elements; + + StructData(); + ~StructData(); + }; + + // Data in content elements (MCID, MCR) + struct ContentData { + union { + int mcid; + Ref ref; + }; + + ContentData(int mcidA = InvalidMCID): mcid(mcidA) {} + ContentData(const Ref& r) { ref.num = r.num; ref.gen = r.gen; } + }; // Common data Type type; StructTreeRoot *treeRoot; StructElement *parent; + mutable Object pageRef; - StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA = 0) {} + union { + StructData *s; + ContentData *c; + }; + + StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA = 0); + StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA); + StructElement(const Ref &ref, StructTreeRoot *treeRootA, StructElement *parentA); + + void parse(Dict *elementDict); + StructElement *parseChild(Object *ref, Object *childObj); + void parseChildren(Dict *element); friend class StructTreeRoot; }; -- 1.8.3.1