From a2275b4bb73e302dfeb697f5025dc19def372a9a Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Fri, 26 Apr 2013 01:22:13 +0300 Subject: [PATCH 2/6] Tagged-PDF: Interpret the document structure Picking from StructTreeRoot, recursively creates a tree of StructTreeNode objects representing the structure of the document. The biggest missing things are: - Presenting more information in StructTreeNode, being the most important reading the structure and attribute dictionaries for the elements in the tree. - Resolving marked-content identifiers which refer to information stored in page object streams. - Creating a synthetic tree when the PDF is not tagged to use as fall-back. --- poppler/Catalog.cc | 36 +- poppler/Catalog.h | 5 +- poppler/MCOutputDev.cc | 145 +++++ poppler/MCOutputDev.h | 108 ++++ poppler/Makefile.am | 6 + poppler/PDFDoc.h | 3 +- poppler/StructElement.cc | 1361 +++++++++++++++++++++++++++++++++++++++++++++ poppler/StructElement.h | 273 +++++++++ poppler/StructTreeRoot.cc | 120 ++++ poppler/StructTreeRoot.h | 56 ++ 10 files changed, 2095 insertions(+), 18 deletions(-) create mode 100644 poppler/MCOutputDev.cc create mode 100644 poppler/MCOutputDev.h create mode 100644 poppler/StructElement.cc create mode 100644 poppler/StructElement.h create mode 100644 poppler/StructTreeRoot.cc create mode 100644 poppler/StructTreeRoot.h diff --git a/poppler/Catalog.cc b/poppler/Catalog.cc index c365e06..f24f8a3 100644 --- a/poppler/Catalog.cc +++ b/poppler/Catalog.cc @@ -55,6 +55,7 @@ #include "OptionalContent.h" #include "ViewerPreferences.h" #include "FileSpec.h" +#include "StructTreeRoot.h" #if MULTITHREADED # define catalogLocker() MutexLocker locker(&mutex) @@ -90,6 +91,7 @@ Catalog::Catalog(PDFDoc *docA) { embeddedFileNameTree = NULL; jsNameTree = NULL; viewerPrefs = NULL; + structTreeRoot = NULL; pagesList = NULL; pagesRefList = NULL; @@ -175,8 +177,8 @@ Catalog::~Catalog() { delete form; delete optContent; delete viewerPrefs; + delete structTreeRoot; 
metadata.free(); - structTreeRoot.free(); outline.free(); acroForm.free(); viewerPreferences.free(); @@ -837,24 +839,28 @@ PageLabelInfo *Catalog::getPageLabelInfo() return pageLabelInfo; } -Object *Catalog::getStructTreeRoot() +StructTreeRoot *Catalog::getStructTreeRoot() { catalogLocker(); - if (structTreeRoot.isNone()) - { - Object catDict; + if (!structTreeRoot) { + Object catalog; + Object root; - xref->getCatalog(&catDict); - if (catDict.isDict()) { - catDict.dictLookup("StructTreeRoot", &structTreeRoot); - } else { - error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catDict.getTypeName()); - structTreeRoot.initNull(); - } - catDict.free(); + xref->getCatalog(&catalog); + if (!catalog.isDict()) { + error(errSyntaxError, -1, "Catalog object is wrong type ({0:s})", catalog.getTypeName()); + catalog.free(); + return NULL; + } + + if (catalog.dictLookup("StructTreeRoot", &root)->isDict("StructTreeRoot")) { + structTreeRoot = new StructTreeRoot(doc, root.getDict(), getMarkInfo() & markInfoMarked); + } + + root.free(); + catalog.free(); } - - return &structTreeRoot; + return structTreeRoot; } Guint Catalog::getMarkInfo() diff --git a/poppler/Catalog.h b/poppler/Catalog.h index 35b4f87..bdba3ce 100644 --- a/poppler/Catalog.h +++ b/poppler/Catalog.h @@ -53,6 +53,7 @@ class Form; class OCGs; class ViewerPreferences; class FileSpec; +class StructTreeRoot; //------------------------------------------------------------------------ // NameTree @@ -123,7 +124,7 @@ public: GooString *readMetadata(); // Return the structure tree root object. - Object *getStructTreeRoot(); + StructTreeRoot* getStructTreeRoot(); // Return values from the MarkInfo dictionary as flags in a bitfield. 
enum MarkInfoFlags { @@ -227,8 +228,8 @@ private: NameTree *jsNameTree; // Java Script name-tree GooString *baseURI; // base URI for URI-type links Object metadata; // metadata stream - Object structTreeRoot; // structure tree root dictionary int markInfo; // Flags from MarkInfo dictionary + StructTreeRoot *structTreeRoot; // structure tree root Object outline; // outline dictionary Object acroForm; // AcroForm dictionary Object viewerPreferences; // ViewerPreference dictionary diff --git a/poppler/MCOutputDev.cc b/poppler/MCOutputDev.cc new file mode 100644 index 0000000..e593c78 --- /dev/null +++ b/poppler/MCOutputDev.cc @@ -0,0 +1,145 @@ +//======================================================================== +// +// MCOutputDev.cc +// +// Copyright 2013 Igalia S.L. +// +//======================================================================== + +#include "MCOutputDev.h" +#include "GfxFont.h" +#include "GfxState.h" +#include "Annot.h" +#include "Link.h" +#include + +struct MCOutputDev::Priv +{ + MCOpArray commands; + bool capturing; + int mcid; + GfxFont *lastFont; + Guint lastFlags; + double pageWidth; + double pageHeight; + + Priv(int mcidA): + commands(), + capturing(false), + mcid(mcidA), + lastFont(0), + lastFlags(0), + pageWidth(0.0), + pageHeight(0.0) + {} +}; + + +MCOutputDev::MCOutputDev(int mcid): + p(new Priv(mcid)) +{ +} + + +MCOutputDev::~MCOutputDev() +{ + delete p; +} + + +void MCOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) +{ + if (state) { + p->pageWidth = state->getPageWidth(); + p->pageHeight = state->getPageHeight(); + } else { + p->pageWidth = p->pageHeight = 0.0; + } +} + + +void MCOutputDev::endPage() +{ + p->pageWidth = p->pageHeight = 0.0; +} + + +void MCOutputDev::beginMarkedContent(char *name, Dict *properties) +{ + int id = -1; + if (properties && properties->lookupInt("MCID", NULL, &id) && id == p->mcid) + p->capturing = true; +} + + +void MCOutputDev::endMarkedContent(GfxState *state) +{ + p->capturing = false; 
+}
+
+
+void MCOutputDev::drawChar(GfxState *state,
+                           double xx, double yy,
+                           double dx, double dy,
+                           double ox, double oy,
+                           CharCode c, int nBytes,
+                           Unicode *u, int uLen)
+{
+  if (!p->capturing || !uLen)
+    return;
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > p->pageWidth || y1 + h1 < 0 || y1 > p->pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  for (int i = 0; i < uLen; i++)
+    p->commands.push_back(MCOp(u[i]));
+}
+
+
+void MCOutputDev::updateFont(GfxState *state)
+{
+  GfxFont *font = state->getFont();
+  if (!font || font == p->lastFont) return;
+
+  if (!p->lastFont || (p->lastFont->getFamily() && font->getFamily() && p->lastFont->getFamily()->cmp(font->getFamily()))) { // never cmp() against a NULL family
+    if (p->capturing && font->getFamily())
+      p->commands.push_back(MCOp(mcOpFontName, font->getFamily()->getCString()));
+    if (p->lastFont) p->lastFont->decRefCnt();
+    p->lastFont = font;
+    font->incRefCnt();
+  }
+
+  Guint flags = 0;
+  if (font->isBold()) flags |= mcOpFlagFontBold;
+  if (font->isItalic()) flags |= mcOpFlagFontItalic;
+  if (font->isFixedWidth()) flags |= mcOpFlagFontFixed; // fixed-pitch fonts have their own flag bit
+
+  if (p->lastFlags != flags) {
+    if (p->capturing)
+      p->commands.push_back(MCOp(mcOpFlags, flags));
+    p->lastFlags = flags;
+  }
+}
+
+
+const MCOpArray& MCOutputDev::getMCOps() const
+{
+  return p->commands;
+}
diff --git a/poppler/MCOutputDev.h b/poppler/MCOutputDev.h
new file mode 100644
index 0000000..cd7c4f5
--- /dev/null
+++ b/poppler/MCOutputDev.h
@@ -0,0 +1,108 @@
+//========================================================================
+//
+// MCOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MCOUTPUTDEV_H
+#define MCOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include <vector>
+
+class GfxState;
+class GooString;
+class Dict;
+
+
+enum MCOpType {
+  mcOpUnichar,
+  mcOpFontName,
+  mcOpFlags,
+  mcOpColor,
+};
+
+enum MCOpFlags {
+  mcOpFlagFontBold   = (1 << 0),
+  mcOpFlagFontItalic = (1 << 1),
+  mcOpFlagFontFixed  = (1 << 2),
+};
+
+struct MCColor {
+  double r, g, b;
+
+  Guint rgbPixel() const {
+    return ((Guint) (r * 255) & 0xFF) << 16
+         | ((Guint) (g * 255) & 0xFF) << 8
+         | ((Guint) (b * 255) & 0xFF);
+  }
+};
+
+struct MCOp {
+  MCOpType type;
+  union {
+    Unicode unichar;
+    char *value;
+    Guint flags;
+    MCColor color;
+  };
+
+  MCOp(const MCOp& op): type(op.type) {
+    switch (type) {
+      case mcOpFlags: flags = op.flags; break;
+      case mcOpUnichar: unichar = op.unichar; break;
+      case mcOpFontName: value = op.value ? strdup(op.value) : NULL; break; // value is NULL for a default-constructed op
+      case mcOpColor: memcpy(&color, &op.color, sizeof(MCColor)); break;
+    }
+  }
+  MCOp(): type(mcOpFontName), value(NULL) {}
+  MCOp(Unicode u): type(mcOpUnichar), unichar(u) {}
+  MCOp(MCOpType t, Guint f): type(t), flags(f) {}
+  MCOp(MCOpType t, const char *s = NULL): type(t), value(s ? strdup(s) : NULL) {} // strdup(NULL) is undefined
+  ~MCOp() { if (type == mcOpFontName) gfree(value); }
+};
+
+
+typedef std::vector<MCOp> MCOpArray;
+
+
+class MCOutputDev: public OutputDev {
+public:
+  MCOutputDev(int mcid);
+  virtual ~MCOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  virtual
void restoreState(GfxState *state) { updateFont(state); } + virtual void updateFont(GfxState *state); + + virtual void drawChar(GfxState *state, + double xx, double yy, + double dx, double dy, + double ox, double oy, + CharCode c, int nBytes, + Unicode *u, int uLen); + + virtual void beginMarkedContent(char *name, Dict *properties); + virtual void endMarkedContent(GfxState *state); + + const MCOpArray& getMCOps() const; + +private: + struct Priv; + Priv *p; +}; + +#endif /* !MCOUTPUTDEV_H */ diff --git a/poppler/Makefile.am b/poppler/Makefile.am index ac51d05..eaff39d 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -236,6 +236,8 @@ poppler_include_HEADERS = \ StdinPDFDocBuilder.h \ Stream-CCITT.h \ Stream.h \ + StructElement.h \ + StructTreeRoot.h \ UnicodeMap.h \ UnicodeMapTables.h \ UnicodeTypeTable.h \ @@ -250,6 +252,7 @@ poppler_include_HEADERS = \ NameToUnicodeTable.h \ PSOutputDev.h \ TextOutputDev.h \ + MCOutputDev.h \ SecurityHandler.h \ UTF.h \ UTF8.h \ @@ -315,6 +318,8 @@ libpoppler_la_SOURCES = \ StdinCachedFile.cc \ StdinPDFDocBuilder.cc \ Stream.cc \ + StructElement.cc \ + StructTreeRoot.cc \ strtok_r.cpp \ UnicodeMap.cc \ UnicodeTypeTable.cc \ @@ -323,6 +328,7 @@ libpoppler_la_SOURCES = \ XRef.cc \ PSOutputDev.cc \ TextOutputDev.cc \ + MCOutputDev.cc \ PageLabelInfo.h \ PageLabelInfo.cc \ SecurityHandler.cc \ diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h index da9bf5b..48189bc 100644 --- a/poppler/PDFDoc.h +++ b/poppler/PDFDoc.h @@ -60,6 +60,7 @@ class Outline; class Linearization; class SecurityHandler; class Hints; +class StructTreeRoot; enum PDFWriteMode { writeStandard, @@ -139,7 +140,7 @@ public: GooString *readMetadata() { return catalog->readMetadata(); } // Return the structure tree root object. - Object *getStructTreeRoot() { return catalog->getStructTreeRoot(); } + StructTreeRoot *getStructTreeRoot() { return catalog->getStructTreeRoot(); } // Get page. 
Page *getPage(int page); diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc new file mode 100644 index 0000000..c99c9fa --- /dev/null +++ b/poppler/StructElement.cc @@ -0,0 +1,1361 @@ +//======================================================================== +// +// StructElement.cc +// +// This file is licensed under the GPLv2 or later +// +// Copyright 2013 Igalia S.L. +// +//======================================================================== + +#ifdef USE_GCC_PRAGMAS +#pragma interface +#endif + +#include "StructElement.h" +#include "StructTreeRoot.h" +#include "TextOutputDev.h" +#include "GlobalParams.h" +#include "UnicodeMap.h" +#include "PDFDoc.h" +#include "Dict.h" + +#include + +class GfxState; + + +static GBool isPlacementName(Object* value) +{ + return value->isName("Block") + || value->isName("Inline") + || value->isName("Before") + || value->isName("Start") + || value->isName("End"); +} + +static GBool isWritingModeName(Object* value) +{ + return value->isName("LrTb") + || value->isName("RlTb") + || value->isName("TbRl"); +} + +static GBool isBorderStyleName(Object* value) +{ + return value->isName("None") + || value->isName("Hidden") + || value->isName("Dotted") + || value->isName("Dashed") + || value->isName("Solid") + || value->isName("Double") + || value->isName("Groove") + || value->isName("Ridge") + || value->isName("Inset") + || value->isName("Outset"); +} + +static GBool isTextAlignName(Object* value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center") + || value->isName("Justify"); +} + +static GBool isBlockAlignName(Object* value) +{ + return value->isName("Before") + || value->isName("Middle") + || value->isName("After") + || value->isName("Justify"); +} + +static GBool isInlineAlignName(Object* value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center"); +} + +static GBool isNumber(Object* value); + +static GBool isLineHeight(Object* value) +{ + 
return value->isName("Normal") + || value->isName("Auto") + || isNumber(value); +} + +static GBool isTextDecorationName(Object* value) +{ + return value->isName("None") + || value->isName("Underline") + || value->isName("Overline") + || value->isName("LineThrough"); +} + +static GBool isRubyAlignName(Object* value) +{ + return value->isName("Start") + || value->isName("End") + || value->isName("Center") + || value->isName("Justify") + || value->isName("Distribute"); +} + +static GBool isRubyPositionName(Object* value) +{ + return value->isName("Before") + || value->isName("After") + || value->isName("Warichu") + || value->isName("Inline"); +} + +static GBool isGlyphOrientationName(Object* value) +{ + return value->isName("Auto") + || value->isName("90") + || value->isName("180") + || value->isName("270") + || value->isName("360") + || value->isName("-90") + || value->isName("-180"); +} + +static GBool isListNumberingName(Object* value) +{ + return value->isName("None") + || value->isName("Disc") + || value->isName("Circle") + || value->isName("Square") + || value->isName("Decimal") + || value->isName("UpperRoman") + || value->isName("LowerRoman") + || value->isName("UpperAlpha") + || value->isName("LowerAlpha"); +} + +static GBool isFieldRoleName(Object* value) +{ + return value->isName("rb") + || value->isName("cb") + || value->isName("pb") + || value->isName("tv"); +} + +static GBool isFieldCheckedName(Object* value) +{ + return value->isName("on") + || value->isName("off") + || value->isName("neutral"); +} + +static GBool isTableScopeName(Object* value) +{ + return value->isName("Row") + || value->isName("Column") + || value->isName("Both"); +} + +static GBool isRGBColor(Object* value) +{ + if (!(value->isArray() && value->arrayGetLength() == 3)) + return gFalse; + + GBool okay = gTrue; + for (int i = 0; i < 3; i++) { + Object obj; + if (!value->arrayGet(i, &obj)->isNum()) { + okay = gFalse; + obj.free(); + break; + } + if (obj.getNum() < 0.0 || obj.getNum() > 
1.0) {
+      okay = gFalse;
+      obj.free();
+      break;
+    }
+    obj.free();
+  }
+
+  return okay;
+}
+
+static GBool isNatural(Object* value)
+{
+  return (value->isInt()   && value->getInt()   > 0)
+      || (value->isInt64() && value->getInt64() > 0);
+}
+
+static GBool isPositive(Object* value)
+{
+  return value->isNum() && value->getNum() >= 0.0;
+}
+
+static GBool isNumber(Object* value)
+{
+  return value->isNum();
+}
+
+static GBool isNumber_or_AutoName(Object* value)
+{
+  return isNumber(value) || value->isName("Auto");
+}
+
+static GBool isTextString(Object* value)
+{
+  // XXX: Shall isName() also be checked?
+  return value->isString();
+}
+
+
+#define ARRAY_CHECKER(name, checkItem, length, allowSingle, allowNulls) \
+  static GBool name(Object* value) {                                    \
+    if (!value->isArray())                                              \
+      return allowSingle ? checkItem(value) : gFalse;                   \
+                                                                        \
+    if (length && value->arrayGetLength() != length)                    \
+      return gFalse;                                                    \
+                                                                        \
+    GBool okay = gTrue;                                                 \
+    for (int i = 0; i < value->arrayGetLength(); i++) {                 \
+      Object obj;                                                       \
+      value->arrayGet(i, &obj);                                         \
+      if (obj.isNull() ? !allowNulls : !checkItem(&obj)) { /* null entries pass only when allowNulls */ \
+        okay = gFalse;                                                  \
+        obj.free();                                                     \
+        break;                                                          \
+      }                                                                 \
+      obj.free();                                                       \
+    }                                                                   \
+    return okay;                                                        \
+  }
+
+ARRAY_CHECKER(isRGBColor_or_OptX4, isRGBColor,        4, gTrue,  gTrue );
+ARRAY_CHECKER(isPositive_or_OptX4, isPositive,        4, gTrue,  gTrue );
+ARRAY_CHECKER(isPositive_or_X4,    isPositive,        4, gTrue,  gFalse);
+ARRAY_CHECKER(isBorderStyle,       isBorderStyleName, 4, gTrue,  gTrue );
+ARRAY_CHECKER(isNumber_X4,         isNumber,          4, gFalse, gFalse);
+ARRAY_CHECKER(isNumber_or_Xn,      isNumber,          0, gTrue,  gFalse);
+ARRAY_CHECKER(isTableHeaders,      isTextString,      0, gFalse, gFalse);
+
+
+// Type of functions used to do type-checking on attribute values
+typedef GBool (*AttributeCheckFunc)(Object*);
+
+// Maps attributes to their names and whether the attribute can be inherited.
+struct AttributeMapEntry { + Attribute::Type type; + const char* name; + const Object* defval; + GBool inherit; + AttributeCheckFunc check; +}; + +struct AttributeDefaults { + Object Inline; + Object LrTb; + Object Normal; + Object Distribute; + Object off; + Object Zero; + Object Auto; + Object Start; + Object None; + Object Before; + Object Nat1; + + AttributeDefaults() { + Inline.initName("Inline"); + LrTb.initName("LrTb"); + Normal.initName("Normal"); + Distribute.initName("Distribute"); + off.initName("off"); + + Zero.initReal(0.0); + Auto.initName("Auto"); + Start.initName("Start"); + None.initName("None"); + Before.initName("Before"); + Nat1.initInt(1); + } +}; + +static const AttributeDefaults attributeDefaults; + + +#define ATTR_LIST_END { Attribute::Unknown, NULL, NULL, gFalse, NULL } +#define ATTR_D(x, i, c, v) { Attribute::x, #x, &attributeDefaults.v, i, c } +#define ATTR_N(x, i, c) { Attribute::x, #x, NULL, i, c } + +static const AttributeMapEntry attributeMapCommonShared[] = +{ + ATTR_D(Placement, gFalse, isPlacementName, Inline), + ATTR_D(WritingMode, gFalse, isWritingModeName, LrTb), + ATTR_N(BackgroundColor, gFalse, isRGBColor), + ATTR_N(BorderColor, gTrue, isRGBColor_or_OptX4), + ATTR_D(BorderStyle, gFalse, isBorderStyle, None), + ATTR_N(BorderThickness, gTrue, isPositive_or_OptX4), + ATTR_D(Padding, gFalse, isPositive_or_X4, Zero), + ATTR_N(Color, gTrue, isRGBColor), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonBlock[] = +{ + ATTR_D(SpaceBefore, gFalse, isPositive, Zero), + ATTR_D(SpaceAfter, gFalse, isPositive, Zero), + ATTR_D(StartIndent, gTrue, isNumber, Zero), + ATTR_D(EndIndent, gTrue, isNumber, Zero), + ATTR_D(TextIndent, gTrue, isNumber, Zero), + ATTR_D(TextAlign, gTrue, isTextAlignName, Start), + ATTR_N(BBox, gFalse, isNumber_X4), + ATTR_D(Width, gFalse, isNumber_or_AutoName, Auto), + ATTR_D(Height, gFalse, isNumber_or_AutoName, Auto), + ATTR_D(BlockAlign, gTrue, isBlockAlignName, Before), + ATTR_D(InlineAlign, 
gTrue, isInlineAlignName, Start), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonInline[] = +{ + ATTR_D(BaselineShift, gFalse, isNumber, Zero), + ATTR_D(LineHeight, gTrue, isLineHeight, Normal), + ATTR_N(TextDecorationColor, gTrue, isRGBColor), + ATTR_N(TextDecorationThickness, gTrue, isPositive), + ATTR_D(TextDecorationType, gFalse, isTextDecorationName, None), + ATTR_D(GlyphOrientationVertical, gTrue, isGlyphOrientationName, Auto), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonRubyText[] = +{ + ATTR_D(RubyPosition, gTrue, isRubyPositionName, Before), + ATTR_D(RubyAlign, gTrue, isRubyAlignName, Distribute), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonColumns[] = +{ + ATTR_D(ColumnCount, gFalse, isNatural, Nat1), + ATTR_N(ColumnGap, gFalse, isNumber_or_Xn), + ATTR_N(ColumnWidths, gFalse, isNumber_or_Xn), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonList[] = { + ATTR_D(ListNumbering, gFalse, isListNumberingName, None), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonPrintField[] = +{ + ATTR_N(Role, gFalse, isFieldRoleName), + ATTR_D(checked, gFalse, isFieldCheckedName, off), + ATTR_N(Desc, gFalse, isTextString), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonTable[] = +{ + ATTR_N(Headers, gFalse, isTableHeaders), + ATTR_N(Scope, gFalse, isTableScopeName), + ATTR_N(Summary, gFalse, isTextString), + ATTR_LIST_END +}; + +static const AttributeMapEntry attributeMapCommonTableCell[] = +{ + ATTR_D(RowSpan, gFalse, isNatural, Nat1), + ATTR_D(ColSpan, gFalse, isNatural, Nat1), + ATTR_D(TBorderStyle, gTrue, isBorderStyle, None), + ATTR_D(TPadding, gTrue, isPositive_or_X4, Zero), + ATTR_LIST_END +}; + + +static const AttributeMapEntry* attributeMapAll[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonInline, + attributeMapCommonRubyText, + attributeMapCommonColumns, + attributeMapCommonList, + 
attributeMapCommonPrintField, + attributeMapCommonTable, + attributeMapCommonTableCell, + NULL, +}; + +static const AttributeMapEntry* attributeMapShared[] = { + attributeMapCommonShared, + NULL, +}; + +static const AttributeMapEntry* attributeMapBlock[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + NULL, +}; + +static const AttributeMapEntry* attributeMapInline[] = { + attributeMapCommonShared, + attributeMapCommonInline, + NULL, +}; + +static const AttributeMapEntry* attributeMapTableCell[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonTable, + attributeMapCommonTableCell, + NULL, +}; + +static const AttributeMapEntry* attributeMapRubyText[] = { + attributeMapCommonShared, + attributeMapCommonInline, + attributeMapCommonRubyText, + NULL, +}; + +static const AttributeMapEntry* attributeMapColumns[] = { + attributeMapCommonShared, + attributeMapCommonInline, + attributeMapCommonColumns, + NULL, +}; + +static const AttributeMapEntry* attributeMapList[] = { + attributeMapCommonShared, + attributeMapCommonList, + NULL, +}; + +static const AttributeMapEntry* attributeMapPrintField[] = { + attributeMapCommonShared, + attributeMapCommonPrintField, + NULL, +}; + +static const AttributeMapEntry* attributeMapTable[] = { + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonTable, + NULL, +}; + +static const AttributeMapEntry* attributeMapIllustration[] = { + // XXX: Illustrations may have some attributes from the "shared", "inline", + // the "block" sets. This is a loose specification; making it better + // means duplicating entries from the sets. This seems good enough... + attributeMapCommonShared, + attributeMapCommonBlock, + attributeMapCommonInline, + NULL, +}; + +// Table mapping owners of attributes to their names. +static const struct OwnerMapEntry { + Attribute::Owner owner; + const char* name; +} ownerMap[] = { + // XXX: Those are sorted in the owner priority resolution order. 
If the + // same attribute is defined with two owners, the order in the table + // can be used to know which one has more priority. + { Attribute::XML_1_00, "XML-1.00" }, + { Attribute::HTML_3_20, "HTML-3.20" }, + { Attribute::HTML_4_01, "HTML-4.01" }, + { Attribute::OEB_1_00, "OEB-1.00" }, + { Attribute::RTF_1_05, "RTF-1.05" }, + { Attribute::CSS_1_00, "CSS-1.00" }, + { Attribute::CSS_2_00, "CSS-2.00" }, + { Attribute::Layout, "Layout" }, + { Attribute::PrintField, "PrintField" }, + { Attribute::Table, "Table" }, + { Attribute::List, "List" }, + { Attribute::UserProperties, "UserProperties" }, +}; + + +static GBool ownerHasMorePriority(Attribute::Owner a, Attribute::Owner b) +{ + unsigned a_index, b_index; + + for (unsigned i = a_index = b_index = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (ownerMap[i].owner == a) + a_index = i; + if (ownerMap[i].owner == b) + b_index = i; + } + + return a_index < b_index; +} + + +// Maps element types to their names and also serves as lookup table +// for additional element type attributes. 
+ +enum ElementType { + elementTypeUndefined, + elementTypeInline, + elementTypeBlock, +}; + +static const struct TypeMapEntry { + StructElement::Type type; + const char* name; + ElementType elementType; + const AttributeMapEntry** attributes; +} typeMap[] = { + { StructElement::Document, "Document", elementTypeInline, attributeMapShared }, + { StructElement::Part, "Part", elementTypeInline, attributeMapShared }, + { StructElement::Art, "Art", elementTypeInline, attributeMapColumns }, + { StructElement::Sect, "Sect", elementTypeInline, attributeMapColumns }, + { StructElement::Div, "Div", elementTypeInline, attributeMapColumns }, + { StructElement::BlockQuote, "BlockQuote", elementTypeInline, attributeMapInline }, + { StructElement::Caption, "Caption", elementTypeInline, attributeMapInline }, + { StructElement::NonStruct, "NonStruct", elementTypeInline, attributeMapInline }, + { StructElement::Index, "Index", elementTypeInline, attributeMapInline }, + { StructElement::Private, "Private", elementTypeInline, attributeMapInline }, + { StructElement::Span, "Span", elementTypeInline, attributeMapInline }, + { StructElement::Quote, "Quote", elementTypeInline, attributeMapInline }, + { StructElement::Note, "Note", elementTypeInline, attributeMapInline }, + { StructElement::Reference, "Reference", elementTypeInline, attributeMapInline }, + { StructElement::BibEntry, "BibEntry", elementTypeInline, attributeMapInline }, + { StructElement::Code, "Code", elementTypeInline, attributeMapInline }, + { StructElement::Link, "Link", elementTypeInline, attributeMapInline }, + { StructElement::Annot, "Annot", elementTypeInline, attributeMapInline }, + { StructElement::Ruby, "Ruby", elementTypeInline, attributeMapRubyText }, + { StructElement::RB, "RB", elementTypeUndefined, attributeMapRubyText }, + { StructElement::RT, "RT", elementTypeUndefined, attributeMapRubyText }, + { StructElement::RP, "RP", elementTypeUndefined, attributeMapShared }, + { StructElement::Warichu, "Warichu", 
elementTypeInline, attributeMapRubyText }, + { StructElement::WT, "WT", elementTypeUndefined, attributeMapShared }, + { StructElement::WP, "WP", elementTypeUndefined, attributeMapShared }, + { StructElement::P, "P", elementTypeBlock, attributeMapBlock }, + { StructElement::H, "H", elementTypeBlock, attributeMapBlock }, + { StructElement::H1, "H1", elementTypeBlock, attributeMapBlock }, + { StructElement::H2, "H2", elementTypeBlock, attributeMapBlock }, + { StructElement::H3, "H3", elementTypeBlock, attributeMapBlock }, + { StructElement::H4, "H4", elementTypeBlock, attributeMapBlock }, + { StructElement::H5, "H5", elementTypeBlock, attributeMapBlock }, + { StructElement::H6, "H6", elementTypeBlock, attributeMapBlock }, + { StructElement::L, "L", elementTypeBlock, attributeMapList }, + { StructElement::LI, "LI", elementTypeBlock, attributeMapBlock }, + { StructElement::Lbl, "Lbl", elementTypeBlock, attributeMapBlock }, + { StructElement::Table, "Table", elementTypeBlock, attributeMapTable }, + { StructElement::TR, "TR", elementTypeUndefined, attributeMapShared }, + { StructElement::TH, "TH", elementTypeUndefined, attributeMapTableCell }, + { StructElement::TD, "TD", elementTypeUndefined, attributeMapTableCell }, + { StructElement::THead, "THead", elementTypeUndefined, attributeMapShared }, + { StructElement::TFoot, "TFoot", elementTypeUndefined, attributeMapShared }, + { StructElement::TBody, "TBody", elementTypeUndefined, attributeMapShared }, + { StructElement::Figure, "Figure", elementTypeUndefined, attributeMapIllustration }, + { StructElement::Formula, "Formula", elementTypeUndefined, attributeMapIllustration }, + { StructElement::Form, "Form", elementTypeUndefined, attributeMapIllustration }, + { StructElement::TOC, "TOC", elementTypeUndefined, attributeMapShared }, + { StructElement::TOCI, "TOCI", elementTypeUndefined, attributeMapShared }, +}; + + +//------------------------------------------------------------------------ +// Helpers for the attribute and 
structure type tables +//------------------------------------------------------------------------ + +static inline const AttributeMapEntry* +getAttributeMapEntry(const AttributeMapEntry** entryList, Attribute::Type type) +{ + assert(entryList); + while (*entryList) { + const AttributeMapEntry* entry = *entryList; + while (entry->type != Attribute::Unknown) { + assert(entry->name); + if (type == entry->type) + return entry; + entry++; + } + entryList++; + } + return NULL; +} + +static inline const AttributeMapEntry* +getAttributeMapEntry(const AttributeMapEntry** entryList, const char* name) +{ + assert(entryList); + while (*entryList) { + const AttributeMapEntry* entry = *entryList; + while (entry->type != Attribute::Unknown) { + assert(entry->name); + if (strcmp(name, entry->name) == 0) + return entry; + entry++; + } + entryList++; + } + return NULL; +} + +static inline const OwnerMapEntry* getOwnerMapEntry(Attribute::Owner owner) +{ + for (unsigned i = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (owner == ownerMap[i].owner) + return &ownerMap[i]; + } + return NULL; +} + +static inline const OwnerMapEntry* getOwnerMapEntry(const char* name) +{ + for (unsigned i = 0; i < sizeof(ownerMap) / sizeof(ownerMap[0]); i++) { + if (strcmp(name, ownerMap[i].name) == 0) + return &ownerMap[i]; + } + return NULL; +} + +static const char* ownerToName(Attribute::Owner owner) +{ + const OwnerMapEntry* entry = getOwnerMapEntry(owner); + return entry ? entry->name : "UnknownOwner"; +} + +Attribute::Owner nameToOwner(const char* name) +{ + const OwnerMapEntry* entry = getOwnerMapEntry(name); + return entry ? 
entry->owner : Attribute::UnknownOwner; +} + +static inline const TypeMapEntry* getTypeMapEntry(StructElement::Type type) +{ + for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) { + if (type == typeMap[i].type) + return &typeMap[i]; + } + return NULL; +} + +static inline const TypeMapEntry* getTypeMapEntry(const char* name) +{ + for (unsigned i = 0; i < sizeof(typeMap) / sizeof(typeMap[0]); i++) { + if (strcmp(name, typeMap[i].name) == 0) + return &typeMap[i]; + } + return NULL; +} + +static const char* typeToName(StructElement::Type type) +{ + if (type == StructElement::MCID) + return "MarkedContent"; + + const TypeMapEntry* entry = getTypeMapEntry(type); + return entry ? entry->name : "Unknown"; +} + +static StructElement::Type nameToType(const char* name) +{ + const TypeMapEntry* entry = getTypeMapEntry(name); + return entry ? entry->type : StructElement::Unknown; +} + + +//------------------------------------------------------------------------ +// Attribute +//------------------------------------------------------------------------ + +Attribute::Attribute(const char* nameA, Object* valueA, GBool copyValue): + type(UserProperty), + owner(UserProperties), + revision(0), + name(nameA), + value(), + hidden(gFalse), + formatted(NULL) +{ + assert(valueA); + + if (copyValue) + valueA->copy(&value); + else + valueA->shallowCopy(&value); +} + +Attribute::Attribute(Type type, Object* valueA, GBool copyValue): + type(type), + owner(UserProperties), // TODO: Determine corresponding owner from Type + revision(0), + name(), + value(), + hidden(gFalse), + formatted(NULL) +{ + assert(valueA); + + if (copyValue) + valueA->copy(&value); + else + valueA->shallowCopy(&value); + + if (!typeCheck()) { + type = Unknown; + } +} + +Attribute::~Attribute() +{ + delete formatted; + value.free(); +} + +const char* Attribute::getTypeName() const +{ + if (type == UserProperty) + return name.getCString(); + + const AttributeMapEntry* entry = 
getAttributeMapEntry(attributeMapAll, type);
+  if (entry)
+    return entry->name;
+
+  return "Unknown";
+}
+
+const char* Attribute::getOwnerName() const
+{
+  return ownerToName(owner);
+}
+
+// Default value recorded in the attribute tables for the given type, or
+// NULL when the type has no table entry.
+Object* Attribute::getDefaultValue(Attribute::Type type)
+{
+  const AttributeMapEntry* entry = getAttributeMapEntry(attributeMapAll, type);
+  return entry ? const_cast<Object*>(entry->defval) : NULL;
+}
+
+// Sets or replaces the formatted representation of the value. Passing
+// NULL discards any stored representation.
+void Attribute::setFormattedValue(const char* formattedA)
+{
+  if (formattedA) {
+    if (formatted)
+      formatted->Set(formattedA);
+    else
+      formatted = new GooString(formattedA);
+  } else {
+    delete formatted;
+    // Reset the pointer: the destructor and getFormattedValue() would
+    // otherwise use the freed string.
+    formatted = NULL;
+  }
+}
+
+// Checks the attribute against the attribute table of the element type.
+// Without an element, or when the element type has no attribute table,
+// the check always succeeds.
+GBool Attribute::typeCheck(StructElement* element)
+{
+  // If an element is passed, tighter type-checking can be done.
+  if (element) {
+    const TypeMapEntry* elementTypeEntry = getTypeMapEntry(element->getType());
+    if (elementTypeEntry && elementTypeEntry->attributes) {
+      const AttributeMapEntry* entry = getAttributeMapEntry(elementTypeEntry->attributes, type);
+      if (entry) {
+        if (entry->check && !((*entry->check)(&value))) {
+          return gFalse;
+        }
+      } else {
+        // No entry: the attribute is not valid for the containing element.
+        return gFalse;
+      }
+    }
+  }
+
+  return gTrue;
+}
+
+// Resolves an attribute name to its type. When an element with its own
+// attribute table is given only that table is searched, otherwise all
+// known attributes are searched. Returns Unknown when nothing matches.
+Attribute::Type Attribute::typeForName(const char* name, StructElement* element)
+{
+  const AttributeMapEntry** attributes = attributeMapAll;
+  if (element) {
+    const TypeMapEntry* elementTypeEntry = getTypeMapEntry(element->getType());
+    if (elementTypeEntry && elementTypeEntry->attributes) {
+      attributes = elementTypeEntry->attributes;
+    }
+  }
+
+  const AttributeMapEntry* entry = getAttributeMapEntry(attributes, name);
+  return entry ? 
entry->type : Unknown;
+}
+
+// Builds a UserProperty attribute from a user property dictionary: /N is
+// the name (string or name object), /V the value, /F an optional formatted
+// value and /H an optional hidden flag. Returns NULL when /N has the wrong
+// type or /V is null; the caller owns the returned attribute.
+Attribute* Attribute::parseUserProperty(Dict* property)
+{
+  Object obj, value;
+  const char* name = NULL;
+
+  if (property->lookup("N", &obj)->isString())
+    name = obj.getString()->getCString();
+  else if (obj.isName())
+    name = obj.getName();
+  else {
+    error(errSyntaxError, -1, "N object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return NULL;
+  }
+
+  if (property->lookup("V", &value)->isNull()) {
+    error(errSyntaxError, -1, "V object is wrong type ({0:s})", value.getTypeName());
+    value.free();
+    obj.free();
+    return NULL;
+  }
+
+  // "name" points into "obj", so "obj" is released only after the
+  // attribute has been constructed from it.
+  Attribute *attribute = new Attribute(name, &value, gFalse);
+  obj.free();
+
+  if (property->lookup("F", &obj)->isString()) {
+    attribute->setFormattedValue(obj.getString()->getCString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "F object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  if (property->lookup("H", &obj)->isBool()) {
+    attribute->setHidden(obj.getBool());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "H object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  return attribute;
+}
+
+
+//------------------------------------------------------------------------
+// StructElement
+//------------------------------------------------------------------------
+
+StructElement::StructData::StructData():
+  parentRef(),
+  altText(0),
+  actualText(0),
+  id(0),
+  title(0),
+  expandedAbbr(0),
+  language(0),
+  revision(0),
+  elements(),
+  attributes()
+{
+}
+
+// Releases the strings, child elements and attributes held by the node.
+StructElement::StructData::~StructData()
+{
+  delete altText;
+  delete actualText;
+  delete id;
+  delete title;
+  delete expandedAbbr; // Was missing: the /E string leaked.
+  // NOTE(review): gfree() is only valid if "language" holds a gmalloc'ed
+  // string rather than a pointer into a GooString buffer; confirm against
+  // the /Lang handling in StructElement::parse().
+  gfree(language);
+  parentRef.free();
+  for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i) delete *i;
+  for (AttrPtrArray::iterator i = attributes.begin(); i != attributes.end(); ++i) delete *i;
+}
+
+
+StructElement::StructElement(Dict* element, StructTreeRoot* treeRootA, StructElement* parentA):
+  type(Unknown),
+  treeRoot(treeRootA),
+  
parent(parentA), + pageRef(), + s(new StructData()) +{ + assert(treeRoot); + assert(element); + parse(element); +} + +StructElement::StructElement(int mcid, StructTreeRoot* treeRootA, StructElement* parentA): + type(MCID), + treeRoot(treeRootA), + parent(parentA), + pageRef(), + c(new ContentData(mcid)) +{ + assert(treeRoot); + assert(parent); + assert(c->mcid != InvalidMCID); +} + +StructElement::~StructElement() +{ + if (isContent()) + delete c; + else + delete s; + pageRef.free(); +} + +GBool StructElement::isBlock() const +{ + const TypeMapEntry* entry = getTypeMapEntry(type); + return entry ? (entry->elementType == elementTypeBlock) : gFalse; +} + +GBool StructElement::isInline() const +{ + const TypeMapEntry* entry = getTypeMapEntry(type); + return entry ? (entry->elementType == elementTypeInline) : gFalse; +} + +GBool StructElement::hasPageRef() const +{ + return pageRef.isRef() || (parent && parent->hasPageRef()); +} + +Ref StructElement::getPageRef() const +{ + if (pageRef.isRef()) + return pageRef.getRef(); + + if (parent) + return parent->getPageRef(); + + static const Ref invalidRef = { -1, -1 }; + return invalidRef; +} + +const char* StructElement::getTypeName() const +{ + return typeToName(type); +} + +const Attribute* StructElement::findAttribute(Attribute::Type attributeType, GBool inherit, + Attribute::Owner attributeOwner) const +{ + if (isContent()) + return parent->findAttribute(attributeType, inherit, attributeOwner); + + if (attributeType != Attribute::Unknown && attributeType != Attribute::UserProperty) { + const Attribute *result = NULL; + + if (attributeOwner == Attribute::UnknownOwner) { + // Search for the attribute, no matter who the owner is + for (unsigned i = 0; i < getNumAttributes(); i++) { + const Attribute *attr = getAttribute(i); + if (attributeType == attr->getType()) { + if (!result || ownerHasMorePriority(attr->getOwner(), result->getOwner())) + result = attr; + } + } + } else { + // Search for the attribute, with a specific 
owner
+      for (unsigned i = 0; i < getNumAttributes(); i++) {
+        const Attribute *attr = getAttribute(i);
+        if (attributeType == attr->getType() && attributeOwner == attr->getOwner()) {
+          result = attr;
+          break;
+        }
+      }
+    }
+
+    if (result)
+      return result;
+
+    if (inherit && parent) {
+      const AttributeMapEntry *entry = getAttributeMapEntry(attributeMapAll, attributeType);
+      assert(entry);
+      // TODO: Take into account special inheritance cases, for example:
+      // inline elements which have been changed to be block using
+      // "/Placement/Block" have slightly different rules.
+      if (entry->inherit)
+        return parent->findAttribute(attributeType, inherit, attributeOwner);
+    }
+  }
+  return NULL;
+}
+
+// Appends the text of the element to "string", allocating a new GooString
+// when "string" is NULL (the caller then owns it).
+//
+// - Content (MCID) elements append the characters of their marked-content
+//   operations, encoded with the configured text encoding (UTF-8 is used
+//   when none is configured). NULL is returned when there are no
+//   operations.
+// - Other elements return NULL unless "recursive" is set, in which case
+//   the children are visited depth-first in order.
+GooString* StructElement::getText(GooString *string, GBool recursive) const
+{
+  if (isContent()) {
+    UnicodeMap *map = globalParams->getTextEncoding();
+    if (!map) {
+      GooString mapName("UTF-8");
+      map = UnicodeMap::parse(&mapName);
+    }
+    assert(map);
+
+    const MCOpArray& ops(getMCOps());
+    if (!ops.size()) {
+      // Drop our reference to the map before bailing out; it used to be
+      // leaked on this path.
+      map->decRefCnt();
+      return NULL;
+    }
+
+    if (!string)
+      string = new GooString();
+
+    char buf[9];
+    int n;
+
+    for (MCOpArray::const_iterator i = ops.begin(); i != ops.end(); ++i) {
+      if (i->type == mcOpUnichar) {
+        n = map->mapUnicode(i->unichar, buf, sizeof(buf));
+        string->append(buf, n);
+      }
+    }
+    map->decRefCnt();
+    return string;
+  }
+
+  if (!recursive)
+    return NULL;
+
+  // Do a depth-first traversal, to get elements in logical order
+  if (!string)
+    string = new GooString();
+
+  for (unsigned i = 0; i < getNumElements(); i++)
+    getElement(i)->getText(string, recursive);
+
+  return string;
+}
+
+
+// Collects the marked-content operations for this content element by
+// running the page contents through an MCOutputDev. Only the page given by
+// the page reference is displayed when it can be found, otherwise every
+// page of the document is. Non-content elements yield an empty array.
+const MCOpArray StructElement::getMCOps() const
+{
+  if (!isContent())
+    return MCOpArray(); // Empty array
+
+  MCOutputDev mcdev(getMCID());
+  int startPage = 0, endPage = 0;
+
+  if (hasPageRef()) {
+    Ref ref = getPageRef();
+    startPage = endPage = treeRoot->getDoc()->findPage(ref.num, ref.gen);
+  }
+
+  if (!(startPage && endPage)) {
+    startPage = 1;
+    endPage = 
treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&mcdev, startPage, endPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getMCOps();
+}
+
+
+// Fills in the element from a StructElem dictionary. On a fatal syntax
+// error the function returns early and "type" stays Unknown, so isOk()
+// reports the element as invalid.
+void StructElement::parse(Dict* element)
+{
+  Object obj;
+
+  // Type is optional, but if present must be StructElem
+  if (!element->lookup("Type", &obj)->isNull() && !obj.isName("StructElem")) {
+    error(errSyntaxError, -1, "Type of StructElem object is wrong");
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Parent object reference (required).
+  if (!element->lookupNF("P", &s->parentRef)->isRef()) {
+    // Report the type of the object that was actually looked up; "obj" has
+    // already been freed at this point.
+    error(errSyntaxError, -1, "P object is wrong type ({0:s})", s->parentRef.getTypeName());
+    return;
+  }
+
+  // Check whether the S-type is valid for the top level
+  // element and create a node of the appropriate type.
+  if (!element->lookup("S", &obj)->isName()) {
+    error(errSyntaxError, -1, "S object is wrong type ({0:s})", obj.getTypeName());
+    obj.free();
+    return;
+  }
+
+  // Type name may not be standard, resolve through RoleMap first.
+  // TODO: roleMap entries may need to be resolved recursively until
+  // a known standard name is found, cycles may be present.
+  if (treeRoot->getRoleMap()) {
+    Object resolved;
+    if (treeRoot->getRoleMap()->lookup(obj.getName(), &resolved)->isName()) {
+      type = nameToType(resolved.getName());
+    } else if (resolved.isNull()) {
+      type = nameToType(obj.getName());
+    } else {
+      error(errSyntaxError, -1, "Value in RoleMap is wrong type ({0:s})", resolved.getTypeName());
+      resolved.free();
+      obj.free();
+      return;
+    }
+    resolved.free();
+  } else {
+    type = nameToType(obj.getName());
+  }
+  if (type == Unknown) {
+    error(errSyntaxError, -1, "StructElem object is wrong type ({0:s})", obj.getName());
+    obj.free();
+    return;
+  }
+  obj.free();
+
+  // Object ID (optional), to be looked at the IDTree in the tree root. 
+  if (element->lookup("ID", &obj)->isString()) {
+    s->id = new GooString(obj.getString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "ID object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Page reference (optional) in which at least one of the child items
+  // is to be rendered in. Note: each element stores only the /Pg value
+  // contained by it, and StructElement::getPageRef() may look in parent
+  // elements to find the page where an element belongs.
+  if (!element->lookupNF("Pg", &pageRef)->isRef() && !pageRef.isNull()) {
+    error(errSyntaxWarning, -1, "Pg object is wrong type ({0:s})", pageRef.getTypeName());
+  }
+
+  // Revision number (optional).
+  if (element->lookup("R", &obj)->isInt()) {
+    s->revision = obj.getInt();
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "R object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Element title (optional).
+  if (element->lookup("T", &obj)->isString()) {
+    s->title = new GooString(obj.getString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "T object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Language (optional).
+  if (element->lookup("Lang", &obj)->isString()) {
+    // Keep a private gmalloc'ed copy: StructData releases "language" with
+    // gfree(), which must not be handed a pointer into a GooString buffer.
+    // The GooString itself stays owned by "obj" and is freed below.
+    s->language = copyString(obj.getString()->getCString());
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "Lang object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Alternative text (optional).
+  if (element->lookup("Alt", &obj)->isString()) {
+    s->altText = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "Alt object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Expanded form of an abbreviation (optional). 
+  if (element->lookup("E", &obj)->isString()) {
+    s->expandedAbbr = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "E object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Actual text (optional).
+  if (element->lookup("ActualText", &obj)->isString()) {
+    s->actualText = obj.getString();
+    obj.initNull(); // The StructElement takes ownership of the GooString
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "ActualText object is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Attributes directly attached to the element (optional).
+  // /A is either a single attribute dictionary, or an array in which each
+  // dictionary may be followed by an integer revision number.
+  if (element->lookup("A", &obj)->isDict()) {
+    parseAttributes(obj.getDict());
+  } else if (obj.isArray()) {
+    Object iobj;
+    // Index of the first attribute created from the most recent dictionary.
+    unsigned attrIndex = getNumAttributes();
+    for (int i = 0; i < obj.arrayGetLength(); i++) {
+      if (obj.arrayGet(i, &iobj)->isDict()) {
+        attrIndex = getNumAttributes();
+        // Parse the array item; "obj" is the enclosing array, not a dict.
+        parseAttributes(iobj.getDict());
+      } else if (iobj.isInt()) {
+        const int revision = iobj.getInt();
+        // Set revision numbers for the elements previously created.
+        for (unsigned j = attrIndex; j < getNumAttributes(); j++)
+          getAttribute(j)->setRevision(revision);
+      } else {
+        error(errSyntaxWarning, -1, "A item is wrong type ({0:s})", iobj.getTypeName());
+      }
+      iobj.free();
+    }
+  } else if (!obj.isNull()) {
+    error(errSyntaxWarning, -1, "A is wrong type ({0:s})", obj.getTypeName());
+  }
+  obj.free();
+
+  // Attributes referenced indirectly through the ClassMap (optional). 
+  // Attributes found this way never override the ones set through /A
+  // (parseAttributes is called with keepExisting enabled).
+  // TODO: /C may also be an array of class names, which is not handled.
+  if (treeRoot->getClassMap()) {
+    Object classes;
+    if (element->lookup("C", &classes)->isName()) {
+      Object attr;
+      if (treeRoot->getClassMap()->lookup(classes.getName(), &attr)->isDict()) {
+        parseAttributes(attr.getDict(), gTrue);
+      } else if (attr.isArray()) {
+        // Index of the first attribute created from the most recent
+        // dictionary. It must outlive a loop iteration, so that a revision
+        // number applies to the attributes parsed just before it.
+        unsigned attrIndex = getNumAttributes();
+        for (int i = 0; i < attr.arrayGetLength(); i++) {
+          Object iobj;
+          if (attr.arrayGet(i, &iobj)->isDict()) {
+            attrIndex = getNumAttributes();
+            parseAttributes(iobj.getDict(), gTrue);
+          } else if (iobj.isInt()) {
+            // Set revision numbers for the elements previously created.
+            const int revision = iobj.getInt();
+            for (unsigned j = attrIndex; j < getNumAttributes(); j++)
+              getAttribute(j)->setRevision(revision);
+          } else {
+            error(errSyntaxWarning, -1, "C item is wrong type ({0:s})", iobj.getTypeName());
+          }
+          iobj.free();
+        }
+      } else if (!attr.isNull()) {
+        // Report the ClassMap entry, which is the object of the wrong type.
+        error(errSyntaxWarning, -1, "C object is wrong type ({0:s})", attr.getTypeName());
+      }
+      attr.free();
+    }
+    // Release the looked-up object whatever its type was.
+    classes.free();
+  }
+
+  parseChildren(element);
+}
+
+// Creates a child element from an item of /K and appends it to this
+// element. Returns the new child, or NULL when the item is unsupported or
+// could not be parsed.
+StructElement* StructElement::parseChild(Object* childObj)
+{
+  assert(childObj);
+  StructElement* child = NULL;
+
+  if (childObj->isInt()) {
+    child = new StructElement(childObj->getInt(), treeRoot, this);
+  } else if (childObj->isDict("MCR")) {
+    /*
+     * TODO: The optional Stm/StmOwn attributes are not handled, so all the
+     *       page will be always scanned when calling StructElement::getText().
+     */
+    Object mcidObj;
+    Object refObj;
+
+    if (!childObj->dictLookup("MCID", &mcidObj)->isInt()) {
+      error(errSyntaxError, -1, "MCID object is wrong type ({0:s})", mcidObj.getTypeName());
+      mcidObj.free();
+      return NULL;
+    }
+
+    child = new StructElement(mcidObj.getInt(), treeRoot, this);
+    mcidObj.free();
+
+    if (childObj->dictLookupNF("Pg", &refObj)->isRef()) {
+      // XXX Unclassy manipulation of the page reference (ugh!) 
+ child->pageRef = refObj; + } else { + refObj.free(); + } + } else if (childObj->isDict("OBJR")) { + // TODO: PDF Object Reference + } else if (childObj->isDict()) { + child = new StructElement(childObj->getDict(), treeRoot, this); + } else { + error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", childObj->getTypeName()); + } + + if (child) { + if (child->isOk()) + appendElement(child); + else { + delete child; + child = NULL; + } + } + + return child; +} + +void StructElement::parseChildren(Dict* element) +{ + Object kids; + + if (element->lookup("K", &kids)->isArray()) { + for (int i = 0; i < kids.arrayGetLength(); i++) { + Object obj; + kids.arrayGet(i, &obj); + parseChild(&obj); + obj.free(); + } + } else if (kids.isDict() || kids.isInt()) { + parseChild(&kids); + } else if (!kids.isNull()) { + error(errSyntaxWarning, -1, "K in StructElement is wrong type ({0:s})", kids.getTypeName()); + } + + kids.free(); +} + +void StructElement::parseAttributes(Dict* attributes, GBool keepExisting) +{ + Object owner; + if (attributes->lookup("O", &owner)->isName("UserProperties")) { + // In this case /P is an array of UserProperty dictionaries + Object userProperties; + if (attributes->lookup("P", &userProperties)->isArray()) { + for (int i = 0; i < userProperties.arrayGetLength(); i++) { + Object property; + if (userProperties.arrayGet(i, &property)->isDict()) { + Attribute* attribute = Attribute::parseUserProperty(property.getDict()); + if (attribute && attribute->isOk()) { + appendAttribute(attribute); + } else { + error(errSyntaxWarning, -1, "Item in P is invalid"); + delete attribute; + } + } else { + error(errSyntaxWarning, -1, "Item in P is wrong type ({0:s})", property.getTypeName()); + } + property.free(); + } + } else { + error(errSyntaxWarning, -1, "P is wrong type ({0:s})", userProperties.getTypeName()); + } + userProperties.free(); + } else if (owner.isName()) { + // In this case /P contains standard attributes. 
+ // Check first if the owner is a valid standard one. + Attribute::Owner ownerValue = nameToOwner(owner.getName()); + if (ownerValue != Attribute::UnknownOwner) { + // Iterate over the entries of the "attributes" dictionary. + // The /O entry (owner) is skipped. + for (int i = 0; i < attributes->getLength(); i++) { + const char* key = attributes->getKey(i); + if (strcmp(key, "O") != 0) { + Attribute::Type type = Attribute::typeForName(key, this); + + // Check if the attribute is already defined. + if (keepExisting) { + GBool exists = gFalse; + for (unsigned j = 0; j < getNumAttributes(); j++) { + if (getAttribute(j)->getType() == type) { + exists = gTrue; + break; + } + } + if (exists) + continue; + } + + if (type != Attribute::Unknown) { + Object value; + GBool typeCheckOk = gTrue; + Attribute* attribute = new Attribute(type, attributes->getVal(i, &value), gFalse); + if (attribute->isOk() && (typeCheckOk = attribute->typeCheck(this))) { + appendAttribute(attribute); + } else { + // It is not needed to free "value", the Attribute instance + // owns the contents, so deleting "attribute" is enough. 
+ if (!typeCheckOk) { + error(errSyntaxWarning, -1, "Attribute {0:s} value is of wrong type ({1:s})", + attribute->getTypeName(), attribute->getValue()->getTypeName()); + } + delete attribute; + } + } else { + error(errSyntaxWarning, -1, "Wrong Attribute '{0:s}' in element {1:s}", key, getTypeName()); + } + } + } + } else { + error(errSyntaxWarning, -1, "O object is invalid value ({0:s})", owner.getName()); + } + } else if (!owner.isNull()) { + error(errSyntaxWarning, -1, "O is wrong type ({0:s})", owner.getTypeName()); + } + owner.free(); +} diff --git a/poppler/StructElement.h b/poppler/StructElement.h new file mode 100644 index 0000000..63fb051 --- /dev/null +++ b/poppler/StructElement.h @@ -0,0 +1,273 @@ +//======================================================================== +// +// StructElement.h +// +// This file is licensed under the GPLv2 or later +// +// Copyright 2013 Igalia S.L. +// +//======================================================================== + +#ifndef STRUCTELEMENT_H +#define STRUCTELEMENT_H + +#ifdef USE_GCC_PRAGMAS +#pragma interface +#endif + +#include "goo/gtypes.h" +#include "goo/GooString.h" +#include "MCOutputDev.h" +#include "Object.h" +#include + +class GooString; +class Dict; +class StructElement; +class StructTreeRoot; +class TextWordList; + + +class Attribute { +public: + enum Type { + Unknown = 0, // Uninitialized, parsing error, etc. + UserProperty, // User defined attribute (i.e. 
non-standard) + + // Common standard attributes + Placement, WritingMode, BackgroundColor, BorderColor, BorderStyle, + BorderThickness, Color, Padding, + + // Block element standard attributes + SpaceBefore, SpaceAfter, StartIndent, EndIndent, TextIndent, TextAlign, + BBox, Width, Height, BlockAlign, InlineAlign, TBorderStyle, TPadding, + + // Inline element standard attributes + BaselineShift, LineHeight, TextDecorationColor, TextDecorationThickness, + TextDecorationType, RubyAlign, RubyPosition, GlyphOrientationVertical, + + // Column-only standard attributes + ColumnCount, ColumnGap, ColumnWidths, + + // List-only standard attributes + ListNumbering, + + // PrintField-only standard attributes + Role, checked, Desc, + + // Table-only standard attributes + RowSpan, ColSpan, Headers, Scope, Summary, + }; + + enum Owner { + UnknownOwner = 0, + // User-defined attributes + UserProperties, + // Standard attributes + Layout, List, PrintField, Table, + // Translation to other formats + XML_1_00, HTML_3_20, HTML_4_01, OEB_1_00, RTF_1_05, CSS_1_00, CSS_2_00, + }; + + // Creates a standard attribute. The name is predefined, and the + // value is type-checked to conform to the PDF specification. + Attribute(Type type, Object* value, GBool copyValue = gTrue); + + // Creates an UserProperty attribute, with an arbitrary name and value. + Attribute(const char* name, Object* value, GBool copyValue = gTrue); + + GBool isOk() const { return type != Unknown; } + + // Name, type and value can be set only on construction. + Type getType() const { return type; } + Owner getOwner() const { return owner; } + const char* getTypeName() const; + const char* getOwnerName() const; + Object* getValue() const { return &value; } + static Object* getDefaultValue(Type type); + + const char* getName() const { return type == UserProperty ? name.getCString() : getTypeName(); } + + // The revision is optional, and defaults to zero. 
+ Guint getRevision() const { return revision; } + void setRevision(Guint revisionA) { revision = revisionA; } + + // Hidden elements should not be displayed by the user agent + GBool isHidden() const { return hidden; } + void setHidden(GBool hiddenA) { hidden = hiddenA; } + + // The formatted value may be in the PDF, or be left undefined (NULL). + // In the later case the user agent should provide a default representation. + const char* getFormattedValue() const { return formatted ? formatted->getCString() : NULL; } + void setFormattedValue(const char *formattedA); + + ~Attribute(); + +private: + Type type; + Owner owner; + Guint revision; + mutable GooString name; + mutable Object value; + GBool hidden; + GooString *formatted; + + GBool typeCheck(StructElement* element = NULL); + static Type typeForName(const char* name, StructElement* element = NULL); + static Attribute* parseUserProperty(Dict* property); + + friend class StructElement; +}; + + +class StructElement { +public: + enum Type { + Unknown = 0, + MCID, // MCID reference, used internally + + Document, Part, Art, Sect, Div, // Structural elements + + Span, Quote, Note, Reference, BibEntry, // Inline elements + Code, Link, Annot, + BlockQuote, Caption, NonStruct, + TOC, TOCI, Index, Private, + + P, H, H1, H2, H3, H4, H5, H6, // Paragraph-like + + L, LI, Lbl, // List elements + + Table, TR, TH, TD, THead, TFoot, TBody, // Table elements + + Ruby, RB, RT, RP, // Ruby text elements + Warichu, WT, WP, + + Figure, Formula, Form, // Illustration-like elements + }; + + static const int InvalidMCID = -1; + + const char* getTypeName() const; + Type getType() const { return type; } + GBool isOk() const { return type != Unknown; } + GBool isBlock() const; + GBool isInline() const; + + // TODO Handle object references (OBJR) + inline GBool isContent() const { return (type == MCID) && (c->mcid != InvalidMCID); } + + int getMCID() const { return isContent() ? 
c->mcid : InvalidMCID; }
+  // Reference to the parent object; content elements report the one of
+  // the structure element containing them.
+  Ref getParentRef() { return isContent() ? parent->getParentRef() : s->parentRef.getRef(); }
+  GBool hasPageRef() const;
+  Ref getPageRef() const;
+  StructTreeRoot* getStructTreeRoot() const { return treeRoot; }
+
+  // Optional element identifier.
+  const GooString* getID() const { return isContent() ? NULL : s->id; }
+
+  // Optional ISO language name, e.g. en_US
+  const char* getLang(GBool recursive = gTrue) const
+  { return isContent() ? parent->getLang(recursive)
+                       : (s->language ? s->language : (recursive && parent ? parent->getLang() : NULL)); }
+
+  // Optional revision number, defaults to zero.
+  // Content elements have no revision: reading yields zero and setting is
+  // ignored. Only structure elements own the StructData behind "s".
+  Guint getRevision() const { return isContent() ? 0 : s->revision; }
+  void setRevision(Guint revision) { if (!isContent()) s->revision = revision; }
+
+  // Optional element title, in human-readable form.
+  const GooString* getTitle() const { return isContent() ? NULL : s->title; }
+
+  // Optional element expanded abbreviation text.
+  const GooString* getExpandedAbbr() const { return isContent() ? NULL : s->expandedAbbr; }
+
+  unsigned getNumElements() const { return isContent() ? 0 : s->elements.size(); }
+  const StructElement* getElement(int i) const { return isContent() ? NULL : s->elements.at(i); }
+  StructElement* getElement(int i) { return isContent() ? NULL : s->elements.at(i); }
+
+  void appendElement(StructElement* element)
+  { if (!isContent() && element && element->isOk()) s->elements.push_back(element); }
+
+  unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); }
+  const Attribute* getAttribute(int i) const { return isContent() ? NULL : s->attributes.at(i); }
+  Attribute* getAttribute(int i) { return isContent() ? 
NULL : s->attributes.at(i); } + + void appendAttribute(Attribute* attribute) + { if (!isContent() && attribute) s->attributes.push_back(attribute); } + + const Attribute* findAttribute(Attribute::Type attributeType, GBool inherit = gFalse, + Attribute::Owner owner = Attribute::UnknownOwner) const; + + GooString* getAltText() const { return isContent() ? NULL : s->altText; } + GooString* getActualText() const { return isContent() ? NULL : s->actualText; } + + // Content text referenced by the element: + // + // - For MCID reference elements, this is just the text of the + // corresponding marked content object in the page stream, regardless + // of the setting of the "recursive" flag. + // - For other elements, if the "recursive" flag is set, the text + // enclosed by *all* the child MCID reference elements of the subtree + // is returned. The text is assembled by traversing the leaf MCID + // reference elements in logical order. + // - In any other case, the function returns NULL. + // + // The text will be appended to the passed GooString. If NULL is passed, + // a new string is returned, and the ownership passed to the caller. 
+ // + GooString* getText(GooString *string = NULL, GBool recursive = gTrue) const; + + const MCOpArray getMCOps() const; + + ~StructElement(); + +private: + typedef std::vector AttrPtrArray; + typedef std::vector ElemPtrArray; + + struct StructData { + Object parentRef; + GooString *altText; + GooString *actualText; + GooString *id; + GooString *title; + GooString *expandedAbbr; + char *language; + Guint revision; + ElemPtrArray elements; + AttrPtrArray attributes; + + StructData(); + ~StructData(); + }; + + // Data in content elements (MCID, MCR) + struct ContentData { + int mcid; + + ContentData(int mcidA = InvalidMCID): mcid(mcidA) {} + }; + + // Common data + Type type; + StructTreeRoot* treeRoot; + StructElement* parent; + mutable Object pageRef; + + union { + StructData *s; + ContentData *c; + }; + + StructElement(Dict* elementDict, StructTreeRoot* treeRootA, StructElement* parentA = 0); + StructElement(int mcid, StructTreeRoot* treeRootA, StructElement* parentA); + + void parse(Dict* elementDict); + StructElement* parseChild(Object* childObj); + void parseChildren(Dict* element); + void parseAttributes(Dict* element, GBool keepExisting = gFalse); + + friend class StructTreeRoot; +}; + +#endif + diff --git a/poppler/StructTreeRoot.cc b/poppler/StructTreeRoot.cc new file mode 100644 index 0000000..727bfe9 --- /dev/null +++ b/poppler/StructTreeRoot.cc @@ -0,0 +1,120 @@ +//======================================================================== +// +// StructTreeRoot.cc +// +// This file is licensed under the GPLv2 or later +// +// Copyright 2013 Igalia S.L. 
+//
+//========================================================================
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include "goo/GooString.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
+#include "Object.h"
+#include "Dict.h"
+
+#include <assert.h>
+
+
+// Builds the structure tree from the StructTreeRoot dictionary. "marked"
+// tells whether the document declares itself as tagged, which enables
+// additional conformance warnings.
+StructTreeRoot::StructTreeRoot(PDFDoc *docA, Dict* structTreeRootDict, GBool marked):
+  doc(docA),
+  roleMap(),
+  classMap(),
+  elements()
+{
+  assert(doc);
+  assert(structTreeRootDict);
+  parse(structTreeRootDict, marked);
+}
+
+StructTreeRoot::~StructTreeRoot()
+{
+  for (ElemPtrArray::iterator i = elements.begin(); i != elements.end(); ++i)
+    delete *i;
+  classMap.free();
+  roleMap.free();
+}
+
+// Reads /RoleMap and /ClassMap, then creates one StructElement per valid
+// dictionary found in /K (either a single dictionary or an array of them).
+// Children which fail to parse are dropped with a warning.
+void StructTreeRoot::parse(Dict* root, GBool marked)
+{
+  // The RoleMap/ClassMap dictionaries are needed by all the parsing
+  // functions, which will resolve the custom names to canonical
+  // standard names.
+  root->lookup("RoleMap", &roleMap);
+  root->lookup("ClassMap", &classMap);
+
+  Object kids;
+  if (root->lookup("K", &kids)->isArray()) {
+    if (marked && kids.arrayGetLength() > 1) {
+      error(errSyntaxWarning, -1, "K in StructTreeRoot has more than one children in a tagged PDF");
+    }
+    for (int i = 0; i < kids.arrayGetLength(); i++) {
+      Object obj;
+      kids.arrayGet(i, &obj);
+      if (obj.isDict()) {
+        StructElement* child = new StructElement(obj.getDict(), this);
+        if (child->isOk()) {
+          if (marked && !(child->getType() == StructElement::Document ||
+                          child->getType() == StructElement::Part ||
+                          child->getType() == StructElement::Art ||
+                          child->getType() == StructElement::Div)) {
+            error(errSyntaxWarning, -1, "StructTreeRoot element of tagged PDF is wrong type ({0:s})", child->getTypeName());
+          }
+          appendElement(child);
+        } else {
+          error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+          delete child;
+        }
+      } else {
+        error(errSyntaxWarning, -1, "K has a child of wrong type ({0:s})", obj.getTypeName());
+      }
+      obj.free();
+    }
+  } else if (kids.isDict()) {
+    if (marked) {
+      
error(errSyntaxWarning, -1, "K has a child of wrong type for a tagged PDF ({0:s})", kids.getTypeName());
+    }
+    StructElement* child = new StructElement(kids.getDict(), this);
+    if (child->isOk()) {
+      appendElement(child);
+    } else {
+      error(errSyntaxWarning, -1, "StructTreeRoot element could not be parsed");
+      delete child;
+    }
+  } else if (!kids.isNull()) {
+    error(errSyntaxWarning, -1, "K in StructTreeRoot is wrong type ({0:s})", kids.getTypeName());
+  }
+
+  kids.free();
+}
+
+// Depth-first search of the subtree rooted at "element" for the element
+// whose ID equals "elementId". Returns NULL when there is no match.
+static const StructElement* findElementAux(GooString* elementId, const StructElement* element)
+{
+  // getID() is NULL for content elements and for elements without /ID:
+  // those cannot match, but their children still have to be visited.
+  const GooString* id = element->getID();
+  if (id && id->cmp(elementId) == 0) {
+    return element;
+  }
+  for (unsigned i = 0; i < element->getNumElements(); i++) {
+    const StructElement* child = findElementAux(elementId, element->getElement(i));
+    if (child) {
+      return child;
+    }
+  }
+  return NULL;
+}
+
+// Returns the first element of the tree, in depth-first order, whose ID
+// equals "elementId", or NULL when no element has that ID.
+const StructElement* StructTreeRoot::findElement(GooString* elementId) const
+{
+  assert(elementId);
+  for (unsigned i = 0; i < getNumElements(); i++) {
+    const StructElement* element = findElementAux(elementId, getElement(i));
+    if (element) {
+      return element;
+    }
+  }
+  return NULL;
+}
diff --git a/poppler/StructTreeRoot.h b/poppler/StructTreeRoot.h
new file mode 100644
index 0000000..2952d93
--- /dev/null
+++ b/poppler/StructTreeRoot.h
@@ -0,0 +1,56 @@
+//========================================================================
+//
+// StructTreeRoot.h
+//
+// This file is licensed under the GPLv2 or later
+//
+// Copyright 2013 Igalia S.L. 
+//
+//========================================================================
+
+#ifndef STRUCTTREEROOT_H
+#define STRUCTTREEROOT_H
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include "goo/gtypes.h"
+#include "Object.h"
+#include "StructElement.h"
+#include <vector>
+
+class Dict;
+class PDFDoc;
+
+
+class StructTreeRoot
+{
+public:
+  StructTreeRoot(PDFDoc *docA, Dict* rootDict, GBool marked);
+  ~StructTreeRoot();
+
+  PDFDoc* getDoc() { return doc; }
+  Dict* getRoleMap() { return roleMap.isDict() ? roleMap.getDict() : NULL; } // NULL if absent or not a dictionary
+  Dict* getClassMap() { return classMap.isDict() ? classMap.getDict() : NULL; } // NULL if absent or not a dictionary
+  unsigned getNumElements() const { return elements.size(); }
+  const StructElement* getElement(int i) const { return elements.at(i); }
+  StructElement* getElement(int i) { return elements.at(i); }
+  void appendElement(StructElement* element) // invalid or NULL elements are silently ignored
+  { if (element && element->isOk()) elements.push_back(element); }
+  const StructElement* findElement(GooString* elementId) const;
+
+private:
+  PDFDoc *doc;
+
+  Object roleMap;
+  Object classMap;
+
+  typedef std::vector<StructElement*> ElemPtrArray; // deleted by the destructor
+  ElemPtrArray elements;
+
+  void parse(Dict* rootDict, GBool marked);
+};
+
+#endif
+
-- 
1.8.3