From ef845cc67b192500628f4197fe68437db0e040e1 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Thu, 26 Sep 2013 20:46:34 +0300 Subject: [PATCH v8 04/15] Tagged-PDF: Parsing of StructElem standard types and attributes Parse attributes and standard types of StructElem nodes of the document structure tree. Type name aliases are resolved via the RoleMap (and cycles detected). Both standard attributes and user properties are mapped to instances of the Attribute class. Attributes are parsed both via ClassMap references and directly referenced from the StructElem objects. --- poppler/Makefile.am | 2 +- poppler/StructElement.cc | 1009 +++++++++++++++++++++++++++++++++++++++++++++- poppler/StructElement.h | 114 +++++- 3 files changed, 1119 insertions(+), 6 deletions(-) diff --git a/poppler/Makefile.am b/poppler/Makefile.am index 9f90c9d..7e5f3c6 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -296,8 +296,8 @@ libpoppler_la_SOURCES = \ StdinCachedFile.cc \ StdinPDFDocBuilder.cc \ Stream.cc \ - StructTreeRoot.cc \ StructElement.cc \ + StructTreeRoot.cc \ strtok_r.cpp \ UnicodeMap.cc \ UnicodeTypeTable.cc \ diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc index 837321c..f7c6b46 100644 --- a/poppler/StructElement.cc +++ b/poppler/StructElement.cc @@ -22,6 +22,628 @@ class GfxState; +static GBool isPlacementName(Object *value) +{ + return value->isName("Block") + || value->isName("Inline") + || value->isName("Before") + || value->isName("Start") + || value->isName("End"); +} + +static GBool isWritingModeName(Object *value) +{ + return value->isName("LrTb") + || value->isName("RlTb") + || value->isName("TbRl"); +} + +static GBool isBorderStyleName(Object *value) +{ + return value->isName("None") + || value->isName("Hidden") + || value->isName("Dotted") + || value->isName("Dashed") + || value->isName("Solid") + || value->isName("Double") + || value->isName("Groove") + || value->isName("Ridge") + || value->isName("Inset") + || 
value->isName("Outset"); +} + +static GBool isTextAlignName(Object *value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center") + || value->isName("Justify"); +} + +static GBool isBlockAlignName(Object *value) +{ + return value->isName("Before") + || value->isName("Middle") + || value->isName("After") + || value->isName("Justify"); +} + +static GBool isInlineAlignName(Object *value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center"); +} + +static GBool isNumber(Object *value); + +static GBool isLineHeight(Object *value) +{ + return value->isName("Normal") + || value->isName("Auto") + || isNumber(value); +} + +static GBool isTextDecorationName(Object *value) +{ + return value->isName("None") + || value->isName("Underline") + || value->isName("Overline") + || value->isName("LineThrough"); +} + +static GBool isRubyAlignName(Object *value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center") + || value->isName("Justify") + || value->isName("Distribute"); +} + +static GBool isRubyPositionName(Object *value) +{ + return value->isName("Before") + || value->isName("After") + || value->isName("Warichu") + || value->isName("Inline"); +} + +static GBool isGlyphOrientationName(Object *value) +{ + return value->isName("Auto") + || value->isName("90") + || value->isName("180") + || value->isName("270") + || value->isName("360") + || value->isName("-90") + || value->isName("-180"); +} + +static GBool isListNumberingName(Object *value) +{ + return value->isName("None") + || value->isName("Disc") + || value->isName("Circle") + || value->isName("Square") + || value->isName("Decimal") + || value->isName("UpperRoman") + || value->isName("LowerRoman") + || value->isName("UpperAlpha") + || value->isName("LowerAlpha"); +} + +static GBool isFieldRoleName(Object *value) +{ + return value->isName("rb") + || value->isName("cb") + || value->isName("pb") + || value->isName("tv"); +} 
+ +static GBool isFieldCheckedName(Object *value) +{ + return value->isName("on") + || value->isName("off") + || value->isName("neutral"); +} + +static GBool isTableScopeName(Object *value) +{ + return value->isName("Row") + || value->isName("Column") + || value->isName("Both"); +} + +static GBool isRGBColor(Object *value) +{ + if (!(value->isArray() && value->arrayGetLength() == 3)) + return gFalse; + + GBool okay = gTrue; + for (int i = 0; i < 3; i++) { + Object obj; + if (!value->arrayGet(i, &obj)->isNum()) { + okay = gFalse; + obj.free(); + break; + } + if (obj.getNum() < 0.0 || obj.getNum() > 1.0) { + okay = gFalse; + obj.free(); + break; + } + obj.free(); + } + + return okay; +} + +static GBool isNatural(Object *value) +{ + return (value->isInt() && value->getInt() > 0) + || (value->isInt64() && value->getInt64() > 0); +} + +static GBool isPositive(Object *value) +{ + return value->isNum() && value->getNum() >= 0.0; +} + +static GBool isNumber(Object *value) +{ + return value->isNum(); +} + +static GBool isNumberOrAuto(Object *value) +{ + return isNumber(value) || value->isName("Auto"); +} + +static GBool isTextString(Object *value) +{ + // XXX: Shall isName() also be checked? + return value->isString(); +} + + +#define ARRAY_CHECKER(name, checkItem, length, allowSingle, allowNulls) \ + static GBool name(Object *value) { \ + if (!value->isArray()) \ + return allowSingle ? 
checkItem(value) : gFalse; \ + \ + if (length && value->arrayGetLength() != length) \ + return gFalse; \ + \ + GBool okay = gTrue; \ + for (int i = 0; i < value->arrayGetLength(); i++) { \ + Object obj; \ + value->arrayGet(i, &obj); \ + if ((!allowNulls && obj.isNull()) || !checkItem(&obj)) { \ + okay = gFalse; \ + obj.free(); \ + break; \ + } \ + obj.free(); \ + } \ + return okay; \ + } + +ARRAY_CHECKER(isRGBColorOrOptionalArray4, isRGBColor, 4, gTrue, gTrue ); +ARRAY_CHECKER(isPositiveOrOptionalArray4, isPositive, 4, gTrue, gTrue ); +ARRAY_CHECKER(isPositiveOrArray4, isPositive, 4, gTrue, gFalse); +ARRAY_CHECKER(isBorderStyle, isBorderStyleName, 4, gTrue, gTrue ); +ARRAY_CHECKER(isNumberArray4, isNumber, 4, gFalse, gFalse); +ARRAY_CHECKER(isNumberOrArrayN, isNumber, 0, gTrue, gFalse); +ARRAY_CHECKER(isTableHeaders, isTextString, 0, gFalse, gFalse); + + +// Type of functions used to do type-checking on attribute values +typedef GBool (*AttributeCheckFunc)(Object*); + +// Maps attributes to their names and whether the attribute can be inherited. 
+struct AttributeMapEntry { + Attribute::Type type; + const char *name; + const Object *defval; + GBool inherit; + AttributeCheckFunc check; +}; + +struct AttributeDefaults { + Object Inline; + Object LrTb; + Object Normal; + Object Distribute; + Object off; + Object Zero; + Object Auto; + Object Start; + Object None; + Object Before; + Object Nat1; + + AttributeDefaults() { + Inline.initName("Inline"); + LrTb.initName("LrTb"); + Normal.initName("Normal"); + Distribute.initName("Distribute"); + off.initName("off"); + + Zero.initReal(0.0); + Auto.initName("Auto"); + Start.initName("Start"); + None.initName("None"); + Before.initName("Before"); + Nat1.initInt(1); + } +}; + +static const AttributeDefaults attributeDefaults; + + +#define ATTR_LIST_END { Attribute::Unknown, NULL, NULL, gFalse, NULL } +#define ATTR_D(x, i, c, v) { Attribute::x, #x, &attributeDefaults.v, i, c } +#define ATTR_N(x, i, c) { Attribute::x, #x, NULL, i, c } + +static const AttributeMapEntry attributeMapCommonShared[] = +{ + ATTR_D(Placement, gFalse, isPlacementName, Inline), + ATTR_D(WritingMode, gFalse, isWritingModeName, LrTb), + ATTR_N(BackgroundColor, gFalse, isRGBColor), + ATTR_N(BorderColor, gTrue, isRGBColorOrOptionalArray4), + ATTR_D(BorderStyle, gFalse, isBorderStyle, None), + ATTR_N(BorderThickness, gTrue, isPositiveOrOptionalArray4), + ATTR_D(Padding, gFalse, isPositiveOrArray4, Zero), + ATTR_N(Color, gTrue, isRGBColor), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonBlock[] = +{ + ATTR_D(SpaceBefore, gFalse, isPositive, Zero), + ATTR_D(SpaceAfter, gFalse, isPositive, Zero), + ATTR_D(StartIndent, gTrue, isNumber, Zero), + ATTR_D(EndIndent, gTrue, isNumber, Zero), + ATTR_D(TextIndent, gTrue, isNumber, Zero), + ATTR_D(TextAlign, gTrue, isTextAlignName, Start), + ATTR_N(BBox, gFalse, isNumberArray4), + ATTR_D(Width, gFalse, isNumberOrAuto, Auto), + ATTR_D(Height, gFalse, isNumberOrAuto, Auto), + ATTR_D(BlockAlign, gTrue, isBlockAlignName, Before), + 
ATTR_D(InlineAlign, gTrue, isInlineAlignName, Start), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonInline[] = +{ + ATTR_D(BaselineShift, gFalse, isNumber, Zero), + ATTR_D(LineHeight, gTrue, isLineHeight, Normal), + ATTR_N(TextDecorationColor, gTrue, isRGBColor), + ATTR_N(TextDecorationThickness, gTrue, isPositive), + ATTR_D(TextDecorationType, gFalse, isTextDecorationName, None), + ATTR_D(GlyphOrientationVertical, gTrue, isGlyphOrientationName, Auto), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonRubyText[] = +{ + ATTR_D(RubyPosition, gTrue, isRubyPositionName, Before), + ATTR_D(RubyAlign, gTrue, isRubyAlignName, Distribute), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonColumns[] = +{ + ATTR_D(ColumnCount, gFalse, isNatural, Nat1), + ATTR_N(ColumnGap, gFalse, isNumberOrArrayN), + ATTR_N(ColumnWidths, gFalse, isNumberOrArrayN), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonList[] = { + ATTR_D(ListNumbering, gFalse, isListNumberingName, None), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonPrintField[] = +{ + ATTR_N(Role, gFalse, isFieldRoleName), + ATTR_D(checked, gFalse, isFieldCheckedName, off), + ATTR_N(Desc, gFalse, isTextString), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonTable[] = +{ + ATTR_N(Headers, gFalse, isTableHeaders), + ATTR_N(Scope, gFalse, isTableScopeName), + ATTR_N(Summary, gFalse, isTextString), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonTableCell[] = +{ + ATTR_D(RowSpan, gFalse, isNatural, Nat1), + ATTR_D(ColSpan, gFalse, isNatural, Nat1), + ATTR_D(TBorderStyle, gTrue, isBorderStyle, None), + ATTR_D(TPadding, gTrue, isPositiveOrArray4, Zero), + ATTR_LIST_END +}; + + +static const AttributeMapEntry *attributeMapAll[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonInline, + attributeMapCommonRubyText, + attributeMapCommonColumns, + 
attributeMapCommonList, + attributeMapCommonPrintField, + attributeMapCommonTable, + attributeMapCommonTableCell, + NULL, +}; + +static const AttributeMapEntry *attributeMapShared[] = { + attributeMapCommonShared, + NULL, +}; + +static const AttributeMapEntry *attributeMapBlock[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + NULL, +}; + +static const AttributeMapEntry *attributeMapInline[] = { + attributeMapCommonShared, + attributeMapCommonInline, + NULL, +}; + +static const AttributeMapEntry *attributeMapTableCell[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonTable, + attributeMapCommonTableCell, + NULL, +}; + +static const AttributeMapEntry *attributeMapRubyText[] = { + attributeMapCommonShared, + attributeMapCommonInline, + attributeMapCommonRubyText, + NULL, +}; + +static const AttributeMapEntry *attributeMapColumns[] = { + attributeMapCommonShared, + attributeMapCommonInline, + attributeMapCommonColumns, + NULL, +}; + +static const AttributeMapEntry *attributeMapList[] = { + attributeMapCommonShared, + attributeMapCommonList, + NULL, +}; + +static const AttributeMapEntry *attributeMapPrintField[] = { + attributeMapCommonShared, + attributeMapCommonPrintField, + NULL, +}; + +static const AttributeMapEntry *attributeMapTable[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonTable, + NULL, +}; + +static const AttributeMapEntry *attributeMapIllustration[] = { + // XXX: Illustrations may have some attributes from the "shared", "inline", + // the "block" sets. This is a loose specification; making it better + // means duplicating entries from the sets. This seems good enough... + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonInline, + NULL, +}; + +// Table mapping owners of attributes to their names. 
+static const struct OwnerMapEntry { + Attribute::Owner owner; + const char *name; +} ownerMap[] = { + // XXX: Those are sorted in the owner priority resolution order. If the + // same attribute is defined with two owners, the order in the table + // can be used to know which one has more priority. + { Attribute::XML_1_00, "XML-1.00" }, + { Attribute::HTML_3_20, "HTML-3.20" }, + { Attribute::HTML_4_01, "HTML-4.01" }, + { Attribute::OEB_1_00, "OEB-1.00" }, + { Attribute::RTF_1_05, "RTF-1.05" }, + { Attribute::CSS_1_00, "CSS-1.00" }, + { Attribute::CSS_2_00, "CSS-2.00" }, + { Attribute::Layout, "Layout" }, + { Attribute::PrintField, "PrintField" }, + { Attribute::Table, "Table" }, + { Attribute::List, "List" }, + { Attribute::UserProperties, "UserProperties" }, +}; + + +static GBool ownerHasMorePriority(Attribute::Owner a, Attribute::Owner b) +{ + unsigned a_index, b_index; + + for (unsigned i = a_index = b_index = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (ownerMap[i].owner == a) + a_index = i; + if (ownerMap[i].owner == b) + b_index = i; + } + + return a_index < b_index; +} + + +// Maps element types to their names and also serves as lookup table +// for additional element type attributes. 
+ +enum ElementType { + elementTypeUndefined, + elementTypeInline, + elementTypeBlock, +}; + +static const struct TypeMapEntry { + StructElement::Type type; + const char *name; + ElementType elementType; + const AttributeMapEntry **attributes; +} typeMap[] = { + { StructElement::Document, "Document", elementTypeInline, attributeMapShared }, + { StructElement::Part, "Part", elementTypeInline, attributeMapShared }, + { StructElement::Art, "Art", elementTypeInline, attributeMapColumns }, + { StructElement::Sect, "Sect", elementTypeInline, attributeMapColumns }, + { StructElement::Div, "Div", elementTypeInline, attributeMapColumns }, + { StructElement::BlockQuote, "BlockQuote", elementTypeInline, attributeMapInline }, + { StructElement::Caption, "Caption", elementTypeInline, attributeMapInline }, + { StructElement::NonStruct, "NonStruct", elementTypeInline, attributeMapInline }, + { StructElement::Index, "Index", elementTypeInline, attributeMapInline }, + { StructElement::Private, "Private", elementTypeInline, attributeMapInline }, + { StructElement::Span, "Span", elementTypeInline, attributeMapInline }, + { StructElement::Quote, "Quote", elementTypeInline, attributeMapInline }, + { StructElement::Note, "Note", elementTypeInline, attributeMapInline }, + { StructElement::Reference, "Reference", elementTypeInline, attributeMapInline }, + { StructElement::BibEntry, "BibEntry", elementTypeInline, attributeMapInline }, + { StructElement::Code, "Code", elementTypeInline, attributeMapInline }, + { StructElement::Link, "Link", elementTypeInline, attributeMapInline }, + { StructElement::Annot, "Annot", elementTypeInline, attributeMapInline }, + { StructElement::Ruby, "Ruby", elementTypeInline, attributeMapRubyText }, + { StructElement::RB, "RB", elementTypeUndefined, attributeMapRubyText }, + { StructElement::RT, "RT", elementTypeUndefined, attributeMapRubyText }, + { StructElement::RP, "RP", elementTypeUndefined, attributeMapShared }, + { StructElement::Warichu, "Warichu", 
elementTypeInline, attributeMapRubyText }, + { StructElement::WT, "WT", elementTypeUndefined, attributeMapShared }, + { StructElement::WP, "WP", elementTypeUndefined, attributeMapShared }, + { StructElement::P, "P", elementTypeBlock, attributeMapBlock }, + { StructElement::H, "H", elementTypeBlock, attributeMapBlock }, + { StructElement::H1, "H1", elementTypeBlock, attributeMapBlock }, + { StructElement::H2, "H2", elementTypeBlock, attributeMapBlock }, + { StructElement::H3, "H3", elementTypeBlock, attributeMapBlock }, + { StructElement::H4, "H4", elementTypeBlock, attributeMapBlock }, + { StructElement::H5, "H5", elementTypeBlock, attributeMapBlock }, + { StructElement::H6, "H6", elementTypeBlock, attributeMapBlock }, + { StructElement::L, "L", elementTypeBlock, attributeMapList }, + { StructElement::LI, "LI", elementTypeBlock, attributeMapBlock }, + { StructElement::Lbl, "Lbl", elementTypeBlock, attributeMapBlock }, + { StructElement::LBody, "LBody", elementTypeUndefined, attributeMapBlock }, + { StructElement::Table, "Table", elementTypeBlock, attributeMapTable }, + { StructElement::TR, "TR", elementTypeUndefined, attributeMapShared }, + { StructElement::TH, "TH", elementTypeUndefined, attributeMapTableCell }, + { StructElement::TD, "TD", elementTypeUndefined, attributeMapTableCell }, + { StructElement::THead, "THead", elementTypeUndefined, attributeMapShared }, + { StructElement::TFoot, "TFoot", elementTypeUndefined, attributeMapShared }, + { StructElement::TBody, "TBody", elementTypeUndefined, attributeMapShared }, + { StructElement::Figure, "Figure", elementTypeUndefined, attributeMapIllustration }, + { StructElement::Formula, "Formula", elementTypeUndefined, attributeMapIllustration }, + { StructElement::Form, "Form", elementTypeUndefined, attributeMapIllustration }, + { StructElement::TOC, "TOC", elementTypeUndefined, attributeMapShared }, + { StructElement::TOCI, "TOCI", elementTypeUndefined, attributeMapShared }, +}; + + 
+//------------------------------------------------------------------------ +// Helpers for the attribute and structure type tables +//------------------------------------------------------------------------ + +static inline const AttributeMapEntry* +getAttributeMapEntry(const AttributeMapEntry **entryList, Attribute::Type type) +{ + assert(entryList); + while (*entryList) { + const AttributeMapEntry *entry = *entryList; + while (entry->type != Attribute::Unknown) { + assert(entry->name); + if (type == entry->type) + return entry; + entry++; + } + entryList++; + } + return NULL; +} + +static inline const AttributeMapEntry* +getAttributeMapEntry(const AttributeMapEntry **entryList, const char *name) +{ + assert(entryList); + while (*entryList) { + const AttributeMapEntry *entry = *entryList; + while (entry->type != Attribute::Unknown) { + assert(entry->name); + if (strcmp(name, entry->name) == 0) + return entry; + entry++; + } + entryList++; + } + return NULL; +} + +static inline const OwnerMapEntry *getOwnerMapEntry(Attribute::Owner owner) +{ + for (unsigned i = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (owner == ownerMap[i].owner) + return &ownerMap[i]; + } + return NULL; +} + +static inline const OwnerMapEntry *getOwnerMapEntry(const char *name) +{ + for (unsigned i = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (strcmp(name, ownerMap[i].name) == 0) + return &ownerMap[i]; + } + return NULL; +} + +static const char *ownerToName(Attribute::Owner owner) +{ + const OwnerMapEntry *entry = getOwnerMapEntry(owner); + return entry ? entry->name : "UnknownOwner"; +} + +Attribute::Owner nameToOwner(const char *name) +{ + const OwnerMapEntry *entry = getOwnerMapEntry(name); + return entry ? 
entry->owner : Attribute::UnknownOwner; +} + +static inline const TypeMapEntry *getTypeMapEntry(StructElement::Type type) +{ + for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) { + if (type == typeMap[i].type) + return &typeMap[i]; + } + return NULL; +} + +static inline const TypeMapEntry *getTypeMapEntry(const char *name) +{ + for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) { + if (strcmp(name, typeMap[i].name) == 0) + return &typeMap[i]; + } + return NULL; +} + static const char *typeToName(StructElement::Type type) { if (type == StructElement::MCID) @@ -29,9 +651,177 @@ static const char *typeToName(StructElement::Type type) if (type == StructElement::OBJR) return "ObjectReference"; + const TypeMapEntry *entry = getTypeMapEntry(type); + return entry ? entry->name : "Unknown"; +} + +static StructElement::Type nameToType(const char *name) +{ + const TypeMapEntry *entry = getTypeMapEntry(name); + return entry ? entry->type : StructElement::Unknown; +} + + +//------------------------------------------------------------------------ +// Attribute +//------------------------------------------------------------------------ + +Attribute::Attribute(const char *nameA, Object *valueA, GBool copyValue): + type(UserProperty), + owner(UserProperties), + revision(0), + name(nameA), + value(), + hidden(gFalse), + formatted(NULL) +{ + assert(valueA); + + if (copyValue) + valueA->copy(&value); + else + valueA->shallowCopy(&value); +} + +Attribute::Attribute(Type type, Object *valueA, GBool copyValue): + type(type), + owner(UserProperties), // TODO: Determine corresponding owner from Type + revision(0), + name(), + value(), + hidden(gFalse), + formatted(NULL) +{ + assert(valueA); + + if (copyValue) + valueA->copy(&value); + else + valueA->shallowCopy(&value); + + if (!typeCheck()) { + this->type = Unknown; // The "type" parameter shadows the member; isOk() reads the member + } +} + +Attribute::~Attribute() +{ + delete formatted; + value.free(); +} + +const char *Attribute::getTypeName() const +{ + if (type == 
UserProperty) + return name.getCString(); + + const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, type); + if (entry) + return entry->name; + return "Unknown"; } +const char *Attribute::getOwnerName() const +{ + return ownerToName(owner); +} + +Object *Attribute::getDefaultValue(Attribute::Type type) +{ + const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, type); + return entry ? const_cast<Object*>(entry->defval) : NULL; +} + +void Attribute::setFormattedValue(const char *formattedA) +{ + if (!formattedA) { + delete formatted; + formatted = NULL; // Avoid a dangling pointer and a second delete in ~Attribute + } else if (formatted) { + formatted->Set(formattedA); + } else { + formatted = new GooString(formattedA); + } +} + +GBool Attribute::typeCheck(StructElement *element) +{ + // If an element is passed, tighter type-checking can be done. + if (element) { + const TypeMapEntry *elementTypeEntry = getTypeMapEntry(element->getType()); + if (elementTypeEntry && elementTypeEntry->attributes) { + const AttributeMapEntry *entry = getAttributeMapEntry(elementTypeEntry->attributes, type); + if (entry) { + if (entry->check && !((*entry->check)(&value))) { + return gFalse; + } + } else { + // No entry: the attribute is not valid for the containing element. + return gFalse; + } + } + } + + return gTrue; +} + +Attribute::Type Attribute::typeForName(const char *name, StructElement *element) +{ + const AttributeMapEntry **attributes = attributeMapAll; + if (element) { + const TypeMapEntry *elementTypeEntry = getTypeMapEntry(element->getType()); + if (elementTypeEntry && elementTypeEntry->attributes) { + attributes = elementTypeEntry->attributes; + } + } + + const AttributeMapEntry *entry = getAttributeMapEntry(attributes, name); + return entry ? 
entry->type : Unknown; +} + +Attribute *Attribute::parseUserProperty(Dict *property) +{ + Object obj, value; + const char *name = NULL; + + if (property->lookup("N", &obj)->isString()) + name = obj.getString()->getCString(); + else if (obj.isName()) + name = obj.getName(); + else { + error(errSyntaxError, -1, "N object is wrong type ({0:s})", obj.getTypeName()); + obj.free(); + return NULL; + } + + if (property->lookup("V", &value)->isNull()) { + error(errSyntaxError, -1, "V object is wrong type ({0:s})", value.getTypeName()); + value.free(); + obj.free(); + return NULL; + } + + Attribute *attribute = new Attribute(name, &value, gFalse); + obj.free(); + + if (property->lookup("F", &obj)->isString()) { + attribute->setFormattedValue(obj.getString()->getCString()); + } else if (!obj.isNull()) { + error(errSyntaxWarning, -1, "F object is wrong type ({0:s})", obj.getTypeName()); + } + obj.free(); + + if (property->lookup("H", &obj)->isBool()) { + attribute->setHidden(obj.getBool()); + } else if (!obj.isNull()) { + error(errSyntaxWarning, -1, "H object is wrong type ({0:s})", obj.getTypeName()); + } + obj.free(); + + return attribute; +} + //------------------------------------------------------------------------ // StructElement @@ -57,6 +847,7 @@ StructElement::StructData::~StructData() delete language; parentRef.free(); for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i) delete *i; + for (AttrPtrArray::iterator i = attributes.begin(); i != attributes.end(); ++i) delete *i; } @@ -105,6 +896,18 @@ StructElement::~StructElement() pageRef.free(); } +GBool StructElement::isBlock() const +{ + const TypeMapEntry *entry = getTypeMapEntry(type); + return entry ? (entry->elementType == elementTypeBlock) : gFalse; +} + +GBool StructElement::isInline() const +{ + const TypeMapEntry *entry = getTypeMapEntry(type); + return entry ? 
(entry->elementType == elementTypeInline) : gFalse; +} + GBool StructElement::hasPageRef() const { return pageRef.isRef() || (parent && parent->hasPageRef()); @@ -123,14 +926,71 @@ bool StructElement::getPageRef(Ref& ref) const return gFalse; } -const char* StructElement::getTypeName() const +const char *StructElement::getTypeName() const { return typeToName(type); } +const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBool inherit, + Attribute::Owner attributeOwner) const +{ + if (isContent()) + return parent->findAttribute(attributeType, inherit, attributeOwner); + + if (attributeType != Attribute::Unknown && attributeType != Attribute::UserProperty) { + const Attribute *result = NULL; + + if (attributeOwner == Attribute::UnknownOwner) { + // Search for the attribute, no matter who the owner is + for (unsigned i = 0; i < getNumAttributes(); i++) { + const Attribute *attr = getAttribute(i); + if (attributeType == attr->getType()) { + if (!result || ownerHasMorePriority(attr->getOwner(), result->getOwner())) + result = attr; + } + } + } else { + // Search for the attribute, with a specific owner + for (unsigned i = 0; i < getNumAttributes(); i++) { + const Attribute *attr = getAttribute(i); + if (attributeType == attr->getType() && attributeOwner == attr->getOwner()) { + result = attr; + break; + } + } + } + + if (result) + return result; + + if (inherit && parent) { + const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, attributeType); + assert(entry); + // TODO: Take into account special inheritance cases, for example: + // inline elements which have been changed to be block using + // "/Placement/Block" have slightly different rules. 
+ if (entry->inherit) + return parent->findAttribute(attributeType, inherit, attributeOwner); + } + } + return NULL; +} + static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved) { - // TODO Replace this dummy implementation + // Circular reference + if (curName && !strcmp(name, curName)) + return StructElement::Unknown; + + if (roleMap->lookup(curName ? curName : name, resolved)->isName()) { + StructElement::Type type = nameToType(resolved->getName()); + return type == StructElement::Unknown + ? roleMapResolve(roleMap, name, resolved->getName(), resolved) + : type; + } + + if (!resolved->isNull()) + error(errSyntaxWarning, -1, "RoleMap entry is wrong type ({0:s})", resolved->getTypeName()); return StructElement::Unknown; } @@ -166,6 +1026,17 @@ void StructElement::parse(Dict *element) type = roleMapResolve(treeRoot->getRoleMap(), obj.getName(), NULL, &resolvedName); } + // Resolving through RoleMap may leave type as Unknown, e.g. for types + // which are not present in it, yet they are standard element types. + if (type == Unknown) + type = nameToType(obj.getName()); + + // At this point either the type name must have been resolved. + if (type == Unknown) { + error(errSyntaxError, -1, "StructElem object is wrong type ({0:s})", obj.getName()); + obj.free(); + return; + } obj.free(); // Object ID (optional), to be looked at the IDTree in the tree root. @@ -216,8 +1087,60 @@ void StructElement::parse(Dict *element) } obj.free(); - // TODO: Attributes directly attached to the element (optional). - // TODO: Attributes referenced indirectly through the ClassMap (optional). + // Attributes directly attached to the element (optional). 
+ if (element->lookup("A", &obj)->isDict()) { + parseAttributes(obj.getDict()); + } else if (obj.isArray()) { + Object iobj; + unsigned attrIndex = getNumAttributes(); + for (int i = 0; i < obj.arrayGetLength(); i++) { + if (obj.arrayGet(i, &iobj)->isDict()) { + attrIndex = getNumAttributes(); + parseAttributes(iobj.getDict()); // Parse the array item, not the enclosing array + } else if (iobj.isInt()) { + // Set revision numbers for the elements previously created. + for (unsigned j = attrIndex; j < getNumAttributes(); j++) + getAttribute(j)->setRevision(iobj.getInt()); + } else { + error(errSyntaxWarning, -1, "A item is wrong type ({0:s})", iobj.getTypeName()); + } + iobj.free(); + } + } else if (!obj.isNull()) { + error(errSyntaxWarning, -1, "A is wrong type ({0:s})", obj.getTypeName()); + } + obj.free(); + + // Attributes referenced indirectly through the ClassMap (optional). + if (treeRoot->getClassMap()) { + Object classes; + if (element->lookup("C", &classes)->isName()) { + Object attr; + if (treeRoot->getClassMap()->lookup(classes.getName(), &attr)->isDict()) { + parseAttributes(attr.getDict(), gTrue); + } else if (attr.isArray()) { + unsigned attrIndex = getNumAttributes(); // Must outlive one iteration: a revision item follows its dictionary + for (int i = 0; i < attr.arrayGetLength(); i++) { + Object iobj; + if (attr.arrayGet(i, &iobj)->isDict()) { + attrIndex = getNumAttributes(); + parseAttributes(iobj.getDict(), gTrue); + } else if (iobj.isInt()) { + // Set revision numbers for the elements previously created. 
+ for (unsigned j = attrIndex; j < getNumAttributes(); j++) + getAttribute(j)->setRevision(iobj.getInt()); + } else { + error(errSyntaxWarning, -1, "C item is wrong type ({0:s})", iobj.getTypeName()); + } + iobj.free(); + } + } else if (!attr.isNull()) { + error(errSyntaxWarning, -1, "C object is wrong type ({0:s})", attr.getTypeName()); + } + attr.free(); + } + classes.free(); // Freed whatever the type of the /C entry was + } } StructElement *StructElement::parseChild(Object *ref, @@ -320,3 +1243,81 @@ void StructElement::parseChildren(Dict *element, std::set &seen) kids.free(); } + +void StructElement::parseAttributes(Dict *attributes, GBool keepExisting) +{ + Object owner; + if (attributes->lookup("O", &owner)->isName("UserProperties")) { + // In this case /P is an array of UserProperty dictionaries + Object userProperties; + if (attributes->lookup("P", &userProperties)->isArray()) { + for (int i = 0; i < userProperties.arrayGetLength(); i++) { + Object property; + if (userProperties.arrayGet(i, &property)->isDict()) { + Attribute *attribute = Attribute::parseUserProperty(property.getDict()); + if (attribute && attribute->isOk()) { + appendAttribute(attribute); + } else { + error(errSyntaxWarning, -1, "Item in P is invalid"); + delete attribute; + } + } else { + error(errSyntaxWarning, -1, "Item in P is wrong type ({0:s})", property.getTypeName()); + } + property.free(); + } + } + userProperties.free(); + } else if (owner.isName()) { + // In this case /P contains standard attributes. + // Check first if the owner is a valid standard one. + Attribute::Owner ownerValue = nameToOwner(owner.getName()); + if (ownerValue != Attribute::UnknownOwner) { + // Iterate over the entries of the "attributes" dictionary. + // The /O entry (owner) is skipped. + for (int i = 0; i < attributes->getLength(); i++) { + const char *key = attributes->getKey(i); + if (strcmp(key, "O") != 0) { + Attribute::Type type = Attribute::typeForName(key, this); + + // Check if the attribute is already defined. 
+ if (keepExisting) { + GBool exists = gFalse; + for (unsigned j = 0; j < getNumAttributes(); j++) { + if (getAttribute(j)->getType() == type) { + exists = gTrue; + break; + } + } + if (exists) + continue; + } + + if (type != Attribute::Unknown) { + Object value; + GBool typeCheckOk = gTrue; + Attribute *attribute = new Attribute(type, attributes->getVal(i, &value), gFalse); + if (attribute->isOk() && (typeCheckOk = attribute->typeCheck(this))) { + appendAttribute(attribute); + } else { + // It is not needed to free "value", the Attribute instance + // owns the contents, so deleting "attribute" is enough. + if (!typeCheckOk) { + error(errSyntaxWarning, -1, "Attribute {0:s} value is of wrong type ({1:s})", + attribute->getTypeName(), attribute->getValue()->getTypeName()); + } + delete attribute; + } + } else { + error(errSyntaxWarning, -1, "Wrong Attribute '{0:s}' in element {1:s}", key, getTypeName()); + } + } + } + } else { + error(errSyntaxWarning, -1, "O object is invalid value ({0:s})", owner.getName()); + } + } else if (!owner.isNull()) { + error(errSyntaxWarning, -1, "O is wrong type ({0:s})", owner.getTypeName()); + } + owner.free(); +} diff --git a/poppler/StructElement.h b/poppler/StructElement.h index d1997c9..977e445 100644 --- a/poppler/StructElement.h +++ b/poppler/StructElement.h @@ -23,7 +23,101 @@ class GooString; class Dict; +class StructElement; class StructTreeRoot; +class TextWordList; + + +class Attribute { +public: + enum Type { + Unknown = 0, // Uninitialized, parsing error, etc. + UserProperty, // User defined attribute (i.e. 
non-standard) + + // Common standard attributes + Placement, WritingMode, BackgroundColor, BorderColor, BorderStyle, + BorderThickness, Color, Padding, + + // Block element standard attributes + SpaceBefore, SpaceAfter, StartIndent, EndIndent, TextIndent, TextAlign, + BBox, Width, Height, BlockAlign, InlineAlign, TBorderStyle, TPadding, + + // Inline element standard attributes + BaselineShift, LineHeight, TextDecorationColor, TextDecorationThickness, + TextDecorationType, RubyAlign, RubyPosition, GlyphOrientationVertical, + + // Column-only standard attributes + ColumnCount, ColumnGap, ColumnWidths, + + // List-only standard attributes + ListNumbering, + + // PrintField-only standard attributes + Role, checked, Desc, + + // Table-only standard attributes + RowSpan, ColSpan, Headers, Scope, Summary, + }; + + enum Owner { + UnknownOwner = 0, + // User-defined attributes + UserProperties, + // Standard attributes + Layout, List, PrintField, Table, + // Translation to other formats + XML_1_00, HTML_3_20, HTML_4_01, OEB_1_00, RTF_1_05, CSS_1_00, CSS_2_00, + }; + + // Creates a standard attribute. The name is predefined, and the + // value is type-checked to conform to the PDF specification. + Attribute(Type type, Object *value, GBool copyValue = gTrue); + + // Creates an UserProperty attribute, with an arbitrary name and value. + Attribute(const char *name, Object *value, GBool copyValue = gTrue); + + GBool isOk() const { return type != Unknown; } + + // Name, type and value can be set only on construction. + Type getType() const { return type; } + Owner getOwner() const { return owner; } + const char *getTypeName() const; + const char *getOwnerName() const; + Object *getValue() const { return &value; } + static Object *getDefaultValue(Type type); + + const char *getName() const { return type == UserProperty ? name.getCString() : getTypeName(); } + + // The revision is optional, and defaults to zero. 
+ Guint getRevision() const { return revision; } + void setRevision(Guint revisionA) { revision = revisionA; } + + // Hidden elements should not be displayed by the user agent + GBool isHidden() const { return hidden; } + void setHidden(GBool hiddenA) { hidden = hiddenA; } + + // The formatted value may be in the PDF, or be left undefined (NULL). + // In the later case the user agent should provide a default representation. + const char *getFormattedValue() const { return formatted ? formatted->getCString() : NULL; } + void setFormattedValue(const char *formattedA); + + ~Attribute(); + +private: + Type type; + Owner owner; + Guint revision; + mutable GooString name; + mutable Object value; + GBool hidden; + GooString *formatted; + + GBool typeCheck(StructElement *element = NULL); + static Type typeForName(const char *name, StructElement *element = NULL); + static Attribute *parseUserProperty(Dict *property); + + friend class StructElement; +}; class StructElement { @@ -42,7 +136,7 @@ public: P, H, H1, H2, H3, H4, H5, H6, // Paragraph-like - L, LI, Lbl, // List elements + L, LI, Lbl, LBody, // List elements Table, TR, TH, TD, THead, TFoot, TBody, // Table elements @@ -57,6 +151,8 @@ public: const char *getTypeName() const; Type getType() const { return type; } GBool isOk() const { return type != Unknown; } + GBool isBlock() const; + GBool isInline() const; inline GBool isContent() const { return (type == MCID) || isObjectRef(); } inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); } @@ -104,6 +200,19 @@ public: } } + unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); } + const Attribute *getAttribute(int i) const { return isContent() ? NULL : s->attributes.at(i); } + Attribute *getAttribute(int i) { return isContent() ? 
NULL : s->attributes.at(i); } + + void appendAttribute(Attribute *attribute) { + if (!isContent() && attribute) { + s->attributes.push_back(attribute); + } + } + + const Attribute* findAttribute(Attribute::Type attributeType, GBool inherit = gFalse, + Attribute::Owner owner = Attribute::UnknownOwner) const; + const GooString *getAltText() const { return isContent() ? NULL : s->altText; } GooString *getAltText() { return isContent() ? NULL : s->altText; } @@ -113,6 +222,7 @@ public: ~StructElement(); private: + typedef std::vector AttrPtrArray; typedef std::vector ElemPtrArray; struct StructData { @@ -125,6 +235,7 @@ private: GooString *language; Guint revision; ElemPtrArray elements; + AttrPtrArray attributes; StructData(); ~StructData(); @@ -159,6 +270,7 @@ private: void parse(Dict* elementDict); StructElement* parseChild(Object *ref, Object* childObj, std::set &seen); void parseChildren(Dict* element, std::set &seen); + void parseAttributes(Dict *element, GBool keepExisting = gFalse); friend class StructTreeRoot; }; -- 1.8.4