From 6c360712ac6d11c34cdde57dcce1cf4696055b82 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Tue, 18 Jun 2013 00:24:21 +0300 Subject: [PATCH v5 04/10] Tagged-PDF: Implement parsing of StructElem attributes Parse attributes of StructElem nodes of the document structure tree. Both standard attributes and user properties are mapped to instances of the Attribute class. Attributes are parsed both via ClassMap references and directly referenced from the StructElem objects. --- poppler/Makefile.am | 2 +- poppler/StructElement.cc | 987 ++++++++++++++++++++++++++++++++++++++++++++--- poppler/StructElement.h | 108 +++++- 3 files changed, 1034 insertions(+), 63 deletions(-) diff --git a/poppler/Makefile.am b/poppler/Makefile.am index 9be0811..4670113 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -317,8 +317,8 @@ libpoppler_la_SOURCES = \ StdinCachedFile.cc \ StdinPDFDocBuilder.cc \ Stream.cc \ - StructTreeRoot.cc \ StructElement.cc \ + StructTreeRoot.cc \ strtok_r.cpp \ UnicodeMap.cc \ UnicodeTypeTable.cc \ diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc index 589767b..6af9af3 100644 --- a/poppler/StructElement.cc +++ b/poppler/StructElement.cc @@ -22,6 +22,475 @@ class GfxState; +static GBool isPlacementName(Object *value) +{ + return value->isName("Block") + || value->isName("Inline") + || value->isName("Before") + || value->isName("Start") + || value->isName("End"); +} + +static GBool isWritingModeName(Object *value) +{ + return value->isName("LrTb") + || value->isName("RlTb") + || value->isName("TbRl"); +} + +static GBool isBorderStyleName(Object *value) +{ + return value->isName("None") + || value->isName("Hidden") + || value->isName("Dotted") + || value->isName("Dashed") + || value->isName("Solid") + || value->isName("Double") + || value->isName("Groove") + || value->isName("Ridge") + || value->isName("Inset") + || value->isName("Outset"); +} + +static GBool isTextAlignName(Object *value) +{ + return value->isName("Start") + 
|| value->isName("End") + || value->isName("Center") + || value->isName("Justify"); +} + +static GBool isBlockAlignName(Object *value) +{ + return value->isName("Before") + || value->isName("Middle") + || value->isName("After") + || value->isName("Justify"); +} + +static GBool isInlineAlignName(Object *value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center"); +} + +static GBool isNumber(Object *value); + +static GBool isLineHeight(Object *value) +{ + return value->isName("Normal") + || value->isName("Auto") + || isNumber(value); +} + +static GBool isTextDecorationName(Object *value) +{ + return value->isName("None") + || value->isName("Underline") + || value->isName("Overline") + || value->isName("LineThrough"); +} + +static GBool isRubyAlignName(Object *value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center") + || value->isName("Justify") + || value->isName("Distribute"); +} + +static GBool isRubyPositionName(Object *value) +{ + return value->isName("Before") + || value->isName("After") + || value->isName("Warichu") + || value->isName("Inline"); +} + +static GBool isGlyphOrientationName(Object *value) +{ + return value->isName("Auto") + || value->isName("90") + || value->isName("180") + || value->isName("270") + || value->isName("360") + || value->isName("-90") + || value->isName("-180"); +} + +static GBool isListNumberingName(Object *value) +{ + return value->isName("None") + || value->isName("Disc") + || value->isName("Circle") + || value->isName("Square") + || value->isName("Decimal") + || value->isName("UpperRoman") + || value->isName("LowerRoman") + || value->isName("UpperAlpha") + || value->isName("LowerAlpha"); +} + +static GBool isFieldRoleName(Object *value) +{ + return value->isName("rb") + || value->isName("cb") + || value->isName("pb") + || value->isName("tv"); +} + +static GBool isFieldCheckedName(Object *value) +{ + return value->isName("on") + || value->isName("off") + 
|| value->isName("neutral"); +} + +static GBool isTableScopeName(Object *value) +{ + return value->isName("Row") + || value->isName("Column") + || value->isName("Both"); +} + +static GBool isRGBColor(Object *value) +{ + if (!(value->isArray() && value->arrayGetLength() == 3)) + return gFalse; + + GBool okay = gTrue; + for (int i = 0; i < 3; i++) { + Object obj; + if (!value->arrayGet(i, &obj)->isNum()) { + okay = gFalse; + obj.free(); + break; + } + if (obj.getNum() < 0.0 || obj.getNum() > 1.0) { + okay = gFalse; + obj.free(); + break; + } + obj.free(); + } + + return okay; +} + +static GBool isNatural(Object *value) +{ + return (value->isInt() && value->getInt() > 0) + || (value->isInt64() && value->getInt64() > 0); +} + +static GBool isPositive(Object *value) +{ + return value->isNum() && value->getNum() >= 0.0; +} + +static GBool isNumber(Object *value) +{ + return value->isNum(); +} + +static GBool isNumberOrAuto(Object *value) +{ + return isNumber(value) || value->isName("Auto"); +} + +static GBool isTextString(Object *value) +{ + // XXX: Shall isName() also be checked? + return value->isString(); +} + + +#define ARRAY_CHECKER(name, checkItem, length, allowSingle, allowNulls) \ + static GBool name(Object *value) { \ + if (!value->isArray()) \ + return allowSingle ? 
checkItem(value) : gFalse; \ + \ + if (length && value->arrayGetLength() != length) \ + return gFalse; \ + \ + GBool okay = gTrue; \ + for (int i = 0; i < value->arrayGetLength(); i++) { \ + Object obj; \ + value->arrayGet(i, &obj); \ + if ((!allowNulls && obj.isNull()) || !checkItem(&obj)) { \ + okay = gFalse; \ + obj.free(); \ + break; \ + } \ + obj.free(); \ + } \ + return okay; \ + } + +ARRAY_CHECKER(isRGBColorOrOptionalArray4, isRGBColor, 4, gTrue, gTrue ); +ARRAY_CHECKER(isPositiveOrOptionalArray4, isPositive, 4, gTrue, gTrue ); +ARRAY_CHECKER(isPositiveOrArray4, isPositive, 4, gTrue, gFalse); +ARRAY_CHECKER(isBorderStyle, isBorderStyleName, 4, gTrue, gTrue ); +ARRAY_CHECKER(isNumberArray4, isNumber, 4, gFalse, gFalse); +ARRAY_CHECKER(isNumberOrArrayN, isNumber, 0, gTrue, gFalse); +ARRAY_CHECKER(isTableHeaders, isTextString, 0, gFalse, gFalse); + + +// Type of functions used to do type-checking on attribute values +typedef GBool (*AttributeCheckFunc)(Object*); + +// Maps attributes to their names and whether the attribute can be inherited. 
+struct AttributeMapEntry { + Attribute::Type type; + const char *name; + const Object *defval; + GBool inherit; + AttributeCheckFunc check; +}; + +struct AttributeDefaults { + Object Inline; + Object LrTb; + Object Normal; + Object Distribute; + Object off; + Object Zero; + Object Auto; + Object Start; + Object None; + Object Before; + Object Nat1; + + AttributeDefaults() { + Inline.initName("Inline"); + LrTb.initName("LrTb"); + Normal.initName("Normal"); + Distribute.initName("Distribute"); + off.initName("off"); + + Zero.initReal(0.0); + Auto.initName("Auto"); + Start.initName("Start"); + None.initName("None"); + Before.initName("Before"); + Nat1.initInt(1); + } +}; + +static const AttributeDefaults attributeDefaults; + + +#define ATTR_LIST_END { Attribute::Unknown, NULL, NULL, gFalse, NULL } +#define ATTR_D(x, i, c, v) { Attribute::x, #x, &attributeDefaults.v, i, c } +#define ATTR_N(x, i, c) { Attribute::x, #x, NULL, i, c } + +static const AttributeMapEntry attributeMapCommonShared[] = +{ + ATTR_D(Placement, gFalse, isPlacementName, Inline), + ATTR_D(WritingMode, gFalse, isWritingModeName, LrTb), + ATTR_N(BackgroundColor, gFalse, isRGBColor), + ATTR_N(BorderColor, gTrue, isRGBColorOrOptionalArray4), + ATTR_D(BorderStyle, gFalse, isBorderStyle, None), + ATTR_N(BorderThickness, gTrue, isPositiveOrOptionalArray4), + ATTR_D(Padding, gFalse, isPositiveOrArray4, Zero), + ATTR_N(Color, gTrue, isRGBColor), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonBlock[] = +{ + ATTR_D(SpaceBefore, gFalse, isPositive, Zero), + ATTR_D(SpaceAfter, gFalse, isPositive, Zero), + ATTR_D(StartIndent, gTrue, isNumber, Zero), + ATTR_D(EndIndent, gTrue, isNumber, Zero), + ATTR_D(TextIndent, gTrue, isNumber, Zero), + ATTR_D(TextAlign, gTrue, isTextAlignName, Start), + ATTR_N(BBox, gFalse, isNumberArray4), + ATTR_D(Width, gFalse, isNumberOrAuto, Auto), + ATTR_D(Height, gFalse, isNumberOrAuto, Auto), + ATTR_D(BlockAlign, gTrue, isBlockAlignName, Before), + 
ATTR_D(InlineAlign, gTrue, isInlineAlignName, Start), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonInline[] = +{ + ATTR_D(BaselineShift, gFalse, isNumber, Zero), + ATTR_D(LineHeight, gTrue, isLineHeight, Normal), + ATTR_N(TextDecorationColor, gTrue, isRGBColor), + ATTR_N(TextDecorationThickness, gTrue, isPositive), + ATTR_D(TextDecorationType, gFalse, isTextDecorationName, None), + ATTR_D(GlyphOrientationVertical, gTrue, isGlyphOrientationName, Auto), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonRubyText[] = +{ + ATTR_D(RubyPosition, gTrue, isRubyPositionName, Before), + ATTR_D(RubyAlign, gTrue, isRubyAlignName, Distribute), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonColumns[] = +{ + ATTR_D(ColumnCount, gFalse, isNatural, Nat1), + ATTR_N(ColumnGap, gFalse, isNumberOrArrayN), + ATTR_N(ColumnWidths, gFalse, isNumberOrArrayN), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonList[] = { + ATTR_D(ListNumbering, gFalse, isListNumberingName, None), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonPrintField[] = +{ + ATTR_N(Role, gFalse, isFieldRoleName), + ATTR_D(checked, gFalse, isFieldCheckedName, off), + ATTR_N(Desc, gFalse, isTextString), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonTable[] = +{ + ATTR_N(Headers, gFalse, isTableHeaders), + ATTR_N(Scope, gFalse, isTableScopeName), + ATTR_N(Summary, gFalse, isTextString), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonTableCell[] = +{ + ATTR_D(RowSpan, gFalse, isNatural, Nat1), + ATTR_D(ColSpan, gFalse, isNatural, Nat1), + ATTR_D(TBorderStyle, gTrue, isBorderStyle, None), + ATTR_D(TPadding, gTrue, isPositiveOrArray4, Zero), + ATTR_LIST_END +}; + + +static const AttributeMapEntry *attributeMapAll[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonInline, + attributeMapCommonRubyText, + attributeMapCommonColumns, + 
attributeMapCommonList, + attributeMapCommonPrintField, + attributeMapCommonTable, + attributeMapCommonTableCell, + NULL, +}; + +static const AttributeMapEntry *attributeMapShared[] = { + attributeMapCommonShared, + NULL, +}; + +static const AttributeMapEntry *attributeMapBlock[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + NULL, +}; + +static const AttributeMapEntry *attributeMapInline[] = { + attributeMapCommonShared, + attributeMapCommonInline, + NULL, +}; + +static const AttributeMapEntry *attributeMapTableCell[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonTable, + attributeMapCommonTableCell, + NULL, +}; + +static const AttributeMapEntry *attributeMapRubyText[] = { + attributeMapCommonShared, + attributeMapCommonInline, + attributeMapCommonRubyText, + NULL, +}; + +static const AttributeMapEntry *attributeMapColumns[] = { + attributeMapCommonShared, + attributeMapCommonInline, + attributeMapCommonColumns, + NULL, +}; + +static const AttributeMapEntry *attributeMapList[] = { + attributeMapCommonShared, + attributeMapCommonList, + NULL, +}; + +static const AttributeMapEntry *attributeMapPrintField[] = { + attributeMapCommonShared, + attributeMapCommonPrintField, + NULL, +}; + +static const AttributeMapEntry *attributeMapTable[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonTable, + NULL, +}; + +static const AttributeMapEntry *attributeMapIllustration[] = { + // XXX: Illustrations may have some attributes from the "shared", "inline", + // the "block" sets. This is a loose specification; making it better + // means duplicating entries from the sets. This seems good enough... + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonInline, + NULL, +}; + +// Table mapping owners of attributes to their names. 
+static const struct OwnerMapEntry {
+  Attribute::Owner owner;
+  const char      *name;
+} ownerMap[] = {
+  // XXX: These are sorted in the owner priority resolution order. If the
+  //      same attribute is defined with two owners, the order in the table
+  //      can be used to know which one has more priority.
+  { Attribute::XML_1_00,       "XML-1.00"       },
+  { Attribute::HTML_3_20,      "HTML-3.20"      },
+  { Attribute::HTML_4_01,      "HTML-4.01"      },
+  { Attribute::OEB_1_00,       "OEB-1.00"       },
+  { Attribute::RTF_1_05,       "RTF-1.05"       },
+  { Attribute::CSS_1_00,       "CSS-1.00"       },
+  { Attribute::CSS_2_00,       "CSS-2.00"       },
+  { Attribute::Layout,         "Layout"         },
+  { Attribute::PrintField,     "PrintField"     },
+  { Attribute::Table,          "Table"          },
+  { Attribute::List,           "List"           },
+  { Attribute::UserProperties, "UserProperties" },
+};
+
+// Returns gTrue when owner "a" comes before owner "b" in ownerMap, i.e. "a" wins.
+static GBool ownerHasMorePriority(Attribute::Owner a, Attribute::Owner b)
+{
+  const unsigned numOwners = sizeof(ownerMap) / sizeof(ownerMap[0]);
+  unsigned a_index = numOwners, b_index = numOwners; // not in the table: lowest priority
+  for (unsigned i = 0; i < numOwners; i++) {
+    if (ownerMap[i].owner == a)
+      a_index = i;
+    if (ownerMap[i].owner == b)
+      b_index = i;
+  }
+
+  return a_index < b_index;
+}
+
+
 // Maps element types to their names and also serves as lookup table
 // for additional element type attributes.
@@ -33,60 +502,129 @@ enum ElementType { static const struct TypeMapEntry { StructElement::Type type; - const char* name; + const char *name; ElementType elementType; + const AttributeMapEntry **attributes; } typeMap[] = { - { StructElement::Document, "Document", elementTypeInline }, - { StructElement::Part, "Part", elementTypeInline }, - { StructElement::Art, "Art", elementTypeInline }, - { StructElement::Sect, "Sect", elementTypeInline }, - { StructElement::Div, "Div", elementTypeInline }, - { StructElement::BlockQuote, "BlockQuote", elementTypeInline }, - { StructElement::Caption, "Caption", elementTypeInline }, - { StructElement::NonStruct, "NonStruct", elementTypeInline }, - { StructElement::Index, "Index", elementTypeInline }, - { StructElement::Private, "Private", elementTypeInline }, - { StructElement::Span, "Span", elementTypeInline }, - { StructElement::Quote, "Quote", elementTypeInline }, - { StructElement::Note, "Note", elementTypeInline }, - { StructElement::Reference, "Reference", elementTypeInline }, - { StructElement::BibEntry, "BibEntry", elementTypeInline }, - { StructElement::Code, "Code", elementTypeInline }, - { StructElement::Link, "Link", elementTypeInline }, - { StructElement::Annot, "Annot", elementTypeInline }, - { StructElement::Ruby, "Ruby", elementTypeInline }, - { StructElement::RB, "RB", elementTypeUndefined }, - { StructElement::RT, "RT", elementTypeUndefined }, - { StructElement::RP, "RP", elementTypeUndefined }, - { StructElement::Warichu, "Warichu", elementTypeInline }, - { StructElement::WT, "WT", elementTypeUndefined }, - { StructElement::WP, "WP", elementTypeUndefined }, - { StructElement::P, "P", elementTypeBlock }, - { StructElement::H, "H", elementTypeBlock }, - { StructElement::H1, "H1", elementTypeBlock }, - { StructElement::H2, "H2", elementTypeBlock }, - { StructElement::H3, "H3", elementTypeBlock }, - { StructElement::H4, "H4", elementTypeBlock }, - { StructElement::H5, "H5", elementTypeBlock }, - { StructElement::H6, 
"H6", elementTypeBlock }, - { StructElement::L, "L", elementTypeBlock }, - { StructElement::LI, "LI", elementTypeBlock }, - { StructElement::Lbl, "Lbl", elementTypeBlock }, - { StructElement::Table, "Table", elementTypeBlock }, - { StructElement::TR, "TR", elementTypeUndefined }, - { StructElement::TH, "TH", elementTypeUndefined }, - { StructElement::TD, "TD", elementTypeUndefined }, - { StructElement::THead, "THead", elementTypeUndefined }, - { StructElement::TFoot, "TFoot", elementTypeUndefined }, - { StructElement::TBody, "TBody", elementTypeUndefined }, - { StructElement::Figure, "Figure", elementTypeUndefined }, - { StructElement::Formula, "Formula", elementTypeUndefined }, - { StructElement::Form, "Form", elementTypeUndefined }, - { StructElement::TOC, "TOC", elementTypeUndefined }, - { StructElement::TOCI, "TOCI", elementTypeUndefined }, + { StructElement::Document, "Document", elementTypeInline, attributeMapShared }, + { StructElement::Part, "Part", elementTypeInline, attributeMapShared }, + { StructElement::Art, "Art", elementTypeInline, attributeMapColumns }, + { StructElement::Sect, "Sect", elementTypeInline, attributeMapColumns }, + { StructElement::Div, "Div", elementTypeInline, attributeMapColumns }, + { StructElement::BlockQuote, "BlockQuote", elementTypeInline, attributeMapInline }, + { StructElement::Caption, "Caption", elementTypeInline, attributeMapInline }, + { StructElement::NonStruct, "NonStruct", elementTypeInline, attributeMapInline }, + { StructElement::Index, "Index", elementTypeInline, attributeMapInline }, + { StructElement::Private, "Private", elementTypeInline, attributeMapInline }, + { StructElement::Span, "Span", elementTypeInline, attributeMapInline }, + { StructElement::Quote, "Quote", elementTypeInline, attributeMapInline }, + { StructElement::Note, "Note", elementTypeInline, attributeMapInline }, + { StructElement::Reference, "Reference", elementTypeInline, attributeMapInline }, + { StructElement::BibEntry, "BibEntry", 
elementTypeInline, attributeMapInline }, + { StructElement::Code, "Code", elementTypeInline, attributeMapInline }, + { StructElement::Link, "Link", elementTypeInline, attributeMapInline }, + { StructElement::Annot, "Annot", elementTypeInline, attributeMapInline }, + { StructElement::Ruby, "Ruby", elementTypeInline, attributeMapRubyText }, + { StructElement::RB, "RB", elementTypeUndefined, attributeMapRubyText }, + { StructElement::RT, "RT", elementTypeUndefined, attributeMapRubyText }, + { StructElement::RP, "RP", elementTypeUndefined, attributeMapShared }, + { StructElement::Warichu, "Warichu", elementTypeInline, attributeMapRubyText }, + { StructElement::WT, "WT", elementTypeUndefined, attributeMapShared }, + { StructElement::WP, "WP", elementTypeUndefined, attributeMapShared }, + { StructElement::P, "P", elementTypeBlock, attributeMapBlock }, + { StructElement::H, "H", elementTypeBlock, attributeMapBlock }, + { StructElement::H1, "H1", elementTypeBlock, attributeMapBlock }, + { StructElement::H2, "H2", elementTypeBlock, attributeMapBlock }, + { StructElement::H3, "H3", elementTypeBlock, attributeMapBlock }, + { StructElement::H4, "H4", elementTypeBlock, attributeMapBlock }, + { StructElement::H5, "H5", elementTypeBlock, attributeMapBlock }, + { StructElement::H6, "H6", elementTypeBlock, attributeMapBlock }, + { StructElement::L, "L", elementTypeBlock, attributeMapList }, + { StructElement::LI, "LI", elementTypeBlock, attributeMapBlock }, + { StructElement::Lbl, "Lbl", elementTypeBlock, attributeMapBlock }, + { StructElement::Table, "Table", elementTypeBlock, attributeMapTable }, + { StructElement::TR, "TR", elementTypeUndefined, attributeMapShared }, + { StructElement::TH, "TH", elementTypeUndefined, attributeMapTableCell }, + { StructElement::TD, "TD", elementTypeUndefined, attributeMapTableCell }, + { StructElement::THead, "THead", elementTypeUndefined, attributeMapShared }, + { StructElement::TFoot, "TFoot", elementTypeUndefined, attributeMapShared }, + { 
StructElement::TBody, "TBody", elementTypeUndefined, attributeMapShared }, + { StructElement::Figure, "Figure", elementTypeUndefined, attributeMapIllustration }, + { StructElement::Formula, "Formula", elementTypeUndefined, attributeMapIllustration }, + { StructElement::Form, "Form", elementTypeUndefined, attributeMapIllustration }, + { StructElement::TOC, "TOC", elementTypeUndefined, attributeMapShared }, + { StructElement::TOCI, "TOCI", elementTypeUndefined, attributeMapShared }, }; +//------------------------------------------------------------------------ +// Helpers for the attribute and structure type tables +//------------------------------------------------------------------------ + +static inline const AttributeMapEntry* +getAttributeMapEntry(const AttributeMapEntry **entryList, Attribute::Type type) +{ + assert(entryList); + while (*entryList) { + const AttributeMapEntry *entry = *entryList; + while (entry->type != Attribute::Unknown) { + assert(entry->name); + if (type == entry->type) + return entry; + entry++; + } + entryList++; + } + return NULL; +} + +static inline const AttributeMapEntry* +getAttributeMapEntry(const AttributeMapEntry **entryList, const char *name) +{ + assert(entryList); + while (*entryList) { + const AttributeMapEntry *entry = *entryList; + while (entry->type != Attribute::Unknown) { + assert(entry->name); + if (strcmp(name, entry->name) == 0) + return entry; + entry++; + } + entryList++; + } + return NULL; +} + +static inline const OwnerMapEntry *getOwnerMapEntry(Attribute::Owner owner) +{ + for (unsigned i = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (owner == ownerMap[i].owner) + return &ownerMap[i]; + } + return NULL; +} + +static inline const OwnerMapEntry *getOwnerMapEntry(const char *name) +{ + for (unsigned i = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (strcmp(name, ownerMap[i].name) == 0) + return &ownerMap[i]; + } + return NULL; +} + +static const char *ownerToName(Attribute::Owner owner) +{ 
+  const OwnerMapEntry *entry = getOwnerMapEntry(owner);
+  return entry ? entry->name : "UnknownOwner";
+}
+
+Attribute::Owner nameToOwner(const char *name)
+{
+  const OwnerMapEntry *entry = getOwnerMapEntry(name);
+  return entry ? entry->owner : Attribute::UnknownOwner;
+}
+
 static inline const TypeMapEntry *getTypeMapEntry(StructElement::Type type)
 {
   for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) {
@@ -124,6 +662,167 @@ static StructElement::Type nameToType(const char *name)
 
 
 //------------------------------------------------------------------------
+// Attribute
+//------------------------------------------------------------------------
+
+Attribute::Attribute(const char *nameA, Object *valueA, GBool copyValue):
+  type(UserProperty),
+  owner(UserProperties),
+  revision(0),
+  name(nameA),
+  value(),
+  hidden(gFalse),
+  formatted(NULL)
+{
+  assert(valueA);
+
+  if (copyValue)
+    valueA->copy(&value);
+  else
+    valueA->shallowCopy(&value);
+}
+
+Attribute::Attribute(Type type, Object *valueA, GBool copyValue):
+  type(type),
+  owner(UserProperties), // TODO: Determine corresponding owner from Type
+  revision(0),
+  name(),
+  value(),
+  hidden(gFalse),
+  formatted(NULL)
+{
+  assert(valueA);
+
+  if (copyValue)
+    valueA->copy(&value);
+  else
+    valueA->shallowCopy(&value);
+
+  if (!typeCheck()) {
+    this->type = Unknown; // plain "type" would assign the shadowing parameter
+  }
+}
+
+Attribute::~Attribute()
+{
+  delete formatted;
+  value.free();
+}
+
+const char *Attribute::getTypeName() const
+{
+  if (type == UserProperty)
+    return name.getCString();
+
+  const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, type);
+  if (entry)
+    return entry->name;
+
+  return "Unknown";
+}
+
+const char *Attribute::getOwnerName() const
+{
+  return ownerToName(owner);
+}
+
+Object *Attribute::getDefaultValue(Attribute::Type type)
+{
+  const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, type);
+  return entry ? const_cast<Object*>(entry->defval) : NULL;
+}
+
+void Attribute::setFormattedValue(const char *formattedA)
+{
+  if (!formattedA) {
+    delete formatted;
+    formatted = NULL; // otherwise ~Attribute() would delete it a second time
+  } else if (formatted) {
+    formatted->Set(formattedA);
+  } else {
+    formatted = new GooString(formattedA);
+  }
+}
+
+GBool Attribute::typeCheck(StructElement *element)
+{
+  // If an element is passed, tighter type-checking can be done.
+  if (element) {
+    const TypeMapEntry *elementTypeEntry = getTypeMapEntry(element->getType());
+    if (elementTypeEntry && elementTypeEntry->attributes) {
+      const AttributeMapEntry *entry = getAttributeMapEntry(elementTypeEntry->attributes, type);
+      if (entry) {
+        if (entry->check && !((*entry->check)(&value))) {
+          return gFalse;
+        }
+      } else {
+        // No entry: the attribute is not valid for the containing element.
+        return gFalse;
+      }
+    }
+  }
+
+  return gTrue;
+}
+
+Attribute::Type Attribute::typeForName(const char *name, StructElement *element)
+{
+  const AttributeMapEntry **attributes = attributeMapAll;
+  if (element) {
+    const TypeMapEntry *elementTypeEntry = getTypeMapEntry(element->getType());
+    if (elementTypeEntry && elementTypeEntry->attributes) {
+      attributes = elementTypeEntry->attributes;
+    }
+  }
+
+  const AttributeMapEntry *entry = getAttributeMapEntry(attributes, name);
+  return entry ? entry->type : Unknown;
+}
+
+Attribute *Attribute::parseUserProperty(Dict *property)
+{
+  Object obj, value;
+  const char *name = NULL;
+
+  if (property->lookup("N", &obj)->isString())
+    name = obj.getString()->getCString();
+  else if (obj.isName())
+    name = obj.getName();
+  else {
+    error(errSyntaxError, -1, "N object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return NULL;
+  }
+
+  if (property->lookup("V", &value)->isNull()) {
+    error(errSyntaxError, -1, "V object is wrong type ({0:s})", value.getTypeName());
+    value.free();
+    obj.free();
+    return NULL;
+  }
+
+  Attribute *attribute = new Attribute(name, &value, gFalse);
+  obj.free();
+
+  if (property->lookup("F", &obj)->isString()) {
+    attribute->setFormattedValue(obj.getString()->getCString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "F object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  if (property->lookup("H", &obj)->isBool()) {
+    attribute->setHidden(obj.getBool());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "H object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  return attribute;
+}
+
+
+//------------------------------------------------------------------------
 // StructElement
 //------------------------------------------------------------------------
 
@@ -139,7 +838,8 @@ StructElement::StructData::StructData():
   expandedAbbr(0),
   language(0),
   revision(0),
-  elements()
+  elements(),
+  attributes()
 {
 }
 
@@ -152,6 +852,7 @@ StructElement::StructData::~StructData()
   delete language;
   parentRef.free();
   for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i) delete *i;
+  for (AttrPtrArray::iterator i = attributes.begin(); i != attributes.end(); ++i) delete *i;
 }
 
 
@@ -230,17 +931,63 @@ Ref StructElement::getPageRef() const
   return invalidRef;
 }
 
-const char* StructElement::getTypeName() const
+const char *StructElement::getTypeName() const
 {
   return typeToName(type);
 }
 
+const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBool inherit,
+                                              Attribute::Owner attributeOwner) const
+{
+  if (isContent())
+    return parent->findAttribute(attributeType, inherit, attributeOwner);
+
+  if (attributeType != Attribute::Unknown && attributeType != Attribute::UserProperty) {
+    const Attribute *result = NULL;
+
+    if (attributeOwner == Attribute::UnknownOwner) {
+      // Search for the attribute, no matter who the owner is
+      for (unsigned i = 0; i < getNumAttributes(); i++) {
+        const Attribute *attr = getAttribute(i);
+        if (attributeType == attr->getType()) {
+          if (!result || ownerHasMorePriority(attr->getOwner(), result->getOwner()))
+            result = attr;
+        }
+      }
+    } else {
+      // Search for the attribute, with a specific owner
+      for (unsigned i = 0; i < getNumAttributes(); i++) {
+        const Attribute *attr = getAttribute(i);
+        if (attributeType == attr->getType() && attributeOwner == attr->getOwner()) {
+          result = attr;
+          break;
+        }
+      }
+    }
+
+    if (result)
+      return result;
+
+    if (inherit && parent) {
+      const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, attributeType);
+      assert(entry);
+      // TODO: Take into account special inheritance cases, for example:
+      //       inline elements which have been changed to be block using
+      //       "/Placement/Block" have slightly different rules.
+      if (entry->inherit)
+        return parent->findAttribute(attributeType, inherit, attributeOwner);
+    }
+  }
+  return NULL;
+}
+
 GooString* StructElement::getText(GooString *string, GBool recursive) const
 {
-  // TODO: Dummy implementation, complete
+  // TODO: Dummy implementation
   return NULL;
 }
 
+
+
 static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
 {
   // Circular reference
@@ -372,8 +1119,60 @@ void StructElement::parse(Dict *element)
   }
   obj.free();
 
-  // TODO: Attributes directly attached to the element (optional).
-  // TODO: Attributes referenced indirectly through the ClassMap (optional).
+  // Attributes directly attached to the element (optional).
+  if (element->lookup("A", &obj)->isDict()) {
+    parseAttributes(obj.getDict());
+  } else if (obj.isArray()) {
+    Object iobj;
+    unsigned attrIndex = getNumAttributes();
+    for (int i = 0; i < obj.arrayGetLength(); i++) {
+      if (obj.arrayGet(i, &iobj)->isDict()) {
+        attrIndex = getNumAttributes();
+        parseAttributes(iobj.getDict()); // the array item; "obj" is the array itself
+      } else if (iobj.isInt()) {
+        const int revision = iobj.getInt();
+        // Set revision numbers for the elements previously created.
+        for (unsigned j = attrIndex; j < getNumAttributes(); j++)
+          getAttribute(j)->setRevision(revision);
+      } else {
+        error(errSyntaxWarning, -1, "A item is wrong type ({0:s})", iobj.getTypeName());
+      }
+      iobj.free();
+    }
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "A is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Attributes referenced indirectly through the ClassMap (optional).
+  if (treeRoot->getClassMap()) {
+    Object classes;
+    if (element->lookup("C", &classes)->isName()) {
+      Object attr, iobj;
+      if (treeRoot->getClassMap()->lookup(classes.getName(), &attr)->isDict()) {
+        parseAttributes(attr.getDict(), gTrue);
+      } else if (attr.isArray()) {
+        unsigned attrIndex = getNumAttributes(); // must survive across items, see below
+        for (int i = 0; i < attr.arrayGetLength(); i++) {
+          if (attr.arrayGet(i, &iobj)->isDict()) {
+            attrIndex = getNumAttributes();
+            parseAttributes(iobj.getDict(), gTrue);
+          } else if (iobj.isInt()) {
+            // Set revision numbers for the elements previously created.
+            for (unsigned j = attrIndex; j < getNumAttributes(); j++)
+              getAttribute(j)->setRevision(iobj.getInt());
+          } else {
+            error(errSyntaxWarning, -1, "C item is wrong type ({0:s})", iobj.getTypeName());
+          }
+          iobj.free(); // every fetched item must be released
+        }
+      } else if (!attr.isNull()) {
+        error(errSyntaxWarning, -1, "C object is wrong type ({0:s})", attr.getTypeName());
+      }
+      attr.free();
+    }
+    classes.free(); // lookup() filled it even when it is not a name
+  }
 
   parseChildren(element);
 }
@@ -464,3 +1263,83 @@ void StructElement::parseChildren(Dict *element)
 
   kids.free();
 }
+
+void StructElement::parseAttributes(Dict *attributes, GBool keepExisting)
+{
+  Object owner;
+  if (attributes->lookup("O", &owner)->isName("UserProperties")) {
+    // In this case /P is an array of UserProperty dictionaries
+    Object userProperties;
+    if (attributes->lookup("P", &userProperties)->isArray()) {
+      for (int i = 0; i < userProperties.arrayGetLength(); i++) {
+        Object property;
+        if (userProperties.arrayGet(i, &property)->isDict()) {
+          Attribute *attribute = Attribute::parseUserProperty(property.getDict());
+          if (attribute && attribute->isOk()) {
+            appendAttribute(attribute);
+          } else {
+            error(errSyntaxWarning, -1, "Item in P is invalid");
+            delete attribute;
+          }
+        } else {
+          error(errSyntaxWarning, -1, "Item in P is wrong type ({0:s})", property.getTypeName());
+        }
+        property.free();
+      }
+    } else {
+      error(errSyntaxWarning, -1, "P is wrong type ({0:s})", userProperties.getTypeName());
+    }
+    userProperties.free();
+  } else if (owner.isName()) {
+    // In this case /P contains standard attributes.
+    // Check first if the owner is a valid standard one.
+    Attribute::Owner ownerValue = nameToOwner(owner.getName());
+    if (ownerValue != Attribute::UnknownOwner) {
+      // Iterate over the entries of the "attributes" dictionary.
+      // The /O entry (owner) is skipped.
+ for (int i = 0; i < attributes->getLength(); i++) { + const char *key = attributes->getKey(i); + if (strcmp(key, "O") != 0) { + Attribute::Type type = Attribute::typeForName(key, this); + + // Check if the attribute is already defined. + if (keepExisting) { + GBool exists = gFalse; + for (unsigned j = 0; j < getNumAttributes(); j++) { + if (getAttribute(j)->getType() == type) { + exists = gTrue; + break; + } + } + if (exists) + continue; + } + + if (type != Attribute::Unknown) { + Object value; + GBool typeCheckOk = gTrue; + Attribute *attribute = new Attribute(type, attributes->getVal(i, &value), gFalse); + if (attribute->isOk() && (typeCheckOk = attribute->typeCheck(this))) { + appendAttribute(attribute); + } else { + // It is not needed to free "value", the Attribute instance + // owns the contents, so deleting "attribute" is enough. + if (!typeCheckOk) { + error(errSyntaxWarning, -1, "Attribute {0:s} value is of wrong type ({1:s})", + attribute->getTypeName(), attribute->getValue()->getTypeName()); + } + delete attribute; + } + } else { + error(errSyntaxWarning, -1, "Wrong Attribute '{0:s}' in element {1:s}", key, getTypeName()); + } + } + } + } else { + error(errSyntaxWarning, -1, "O object is invalid value ({0:s})", owner.getName()); + } + } else if (!owner.isNull()) { + error(errSyntaxWarning, -1, "O is wrong type ({0:s})", owner.getTypeName()); + } + owner.free(); +} diff --git a/poppler/StructElement.h b/poppler/StructElement.h index 499ed4c..ff1dc1a 100644 --- a/poppler/StructElement.h +++ b/poppler/StructElement.h @@ -22,7 +22,101 @@ class GooString; class Dict; +class StructElement; class StructTreeRoot; +class TextWordList; + + +class Attribute { +public: + enum Type { + Unknown = 0, // Uninitialized, parsing error, etc. + UserProperty, // User defined attribute (i.e. 
non-standard) + + // Common standard attributes + Placement, WritingMode, BackgroundColor, BorderColor, BorderStyle, + BorderThickness, Color, Padding, + + // Block element standard attributes + SpaceBefore, SpaceAfter, StartIndent, EndIndent, TextIndent, TextAlign, + BBox, Width, Height, BlockAlign, InlineAlign, TBorderStyle, TPadding, + + // Inline element standard attributes + BaselineShift, LineHeight, TextDecorationColor, TextDecorationThickness, + TextDecorationType, RubyAlign, RubyPosition, GlyphOrientationVertical, + + // Column-only standard attributes + ColumnCount, ColumnGap, ColumnWidths, + + // List-only standard attributes + ListNumbering, + + // PrintField-only standard attributes + Role, checked, Desc, + + // Table-only standard attributes + RowSpan, ColSpan, Headers, Scope, Summary, + }; + + enum Owner { + UnknownOwner = 0, + // User-defined attributes + UserProperties, + // Standard attributes + Layout, List, PrintField, Table, + // Translation to other formats + XML_1_00, HTML_3_20, HTML_4_01, OEB_1_00, RTF_1_05, CSS_1_00, CSS_2_00, + }; + + // Creates a standard attribute. The name is predefined, and the + // value is type-checked to conform to the PDF specification. + Attribute(Type type, Object *value, GBool copyValue = gTrue); + + // Creates an UserProperty attribute, with an arbitrary name and value. + Attribute(const char *name, Object *value, GBool copyValue = gTrue); + + GBool isOk() const { return type != Unknown; } + + // Name, type and value can be set only on construction. + Type getType() const { return type; } + Owner getOwner() const { return owner; } + const char *getTypeName() const; + const char *getOwnerName() const; + Object *getValue() const { return &value; } + static Object *getDefaultValue(Type type); + + const char *getName() const { return type == UserProperty ? name.getCString() : getTypeName(); } + + // The revision is optional, and defaults to zero. 
+ Guint getRevision() const { return revision; } + void setRevision(Guint revisionA) { revision = revisionA; } + + // Hidden elements should not be displayed by the user agent + GBool isHidden() const { return hidden; } + void setHidden(GBool hiddenA) { hidden = hiddenA; } + + // The formatted value may be in the PDF, or be left undefined (NULL). + // In the later case the user agent should provide a default representation. + const char *getFormattedValue() const { return formatted ? formatted->getCString() : NULL; } + void setFormattedValue(const char *formattedA); + + ~Attribute(); + +private: + Type type; + Owner owner; + Guint revision; + mutable GooString name; + mutable Object value; + GBool hidden; + GooString *formatted; + + GBool typeCheck(StructElement *element = NULL); + static Type typeForName(const char *name, StructElement *element = NULL); + static Attribute *parseUserProperty(Dict *property); + + friend class StructElement; +}; class StructElement { @@ -99,10 +193,6 @@ public: void appendElement(StructElement *element) { if (!isContent() && element && element->isOk()) s->elements.push_back(element); } -<<<<<<< HEAD - const GooString* getAltText() const { return isContent() ? NULL : s->altText; } - GooString* getAltText() { return isContent() ? NULL : s->altText; } -======= unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); } const Attribute *getAttribute(int i) const { return isContent() ? NULL : s->attributes.at(i); } Attribute *getAttribute(int i) { return isContent() ? NULL : s->attributes.at(i); } @@ -115,7 +205,6 @@ public: const GooString *getAltText() const { return isContent() ? NULL : s->altText; } GooString *getAltText() { return isContent() ? NULL : s->altText; } ->>>>>>> 9932f3b... fixup! Tagged-PDF: Implement parsing of StructElem objects const GooString *getActualText() const { return isContent() ? NULL : s->actualText; } GooString *getActualText() { return isContent() ? 
NULL : s->actualText; } @@ -139,6 +228,7 @@ public: ~StructElement(); private: + typedef std::vector AttrPtrArray; typedef std::vector ElemPtrArray; struct StructData { @@ -151,6 +241,7 @@ private: GooString *language; Guint revision; ElemPtrArray elements; + AttrPtrArray attributes; StructData(); ~StructData(); @@ -182,9 +273,10 @@ private: StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA); StructElement(const Ref &ref, StructTreeRoot *treeRootA, StructElement *parentA); - void parse(Dict *elementDict); - StructElement *parseChild(Object *ref, Object *childObj); - void parseChildren(Dict *element); + void parse(Dict* elementDict); + StructElement* parseChild(Object *ref, Object* childObj); + void parseChildren(Dict* element); + void parseAttributes(Dict *element, GBool keepExisting = gFalse); friend class StructTreeRoot; }; -- 1.8.3.1