From 44254f526996ee0c89f4cf650feb728f782c403c Mon Sep 17 00:00:00 2001
From: Adrian Perez de Castro
Date: Thu, 9 May 2013 19:11:26 +0300
Subject: [PATCH v3 3/7] Tagged-PDF: Modify pdfinfo to show the document structure

---
 utils/pdfinfo.cc | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 102 insertions(+), 1 deletion(-)

diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc
index 14e4f6c..3a31184 100644
--- a/utils/pdfinfo.cc
+++ b/utils/pdfinfo.cc
@@ -3,6 +3,7 @@
 // pdfinfo.cc
 //
 // Copyright 1998-2003 Glyph & Cog, LLC
+// Copyright 2013 Igalia S.L.
 //
 //========================================================================
 
@@ -19,6 +20,7 @@
 // Copyright (C) 2011 Vittal Aithal
 // Copyright (C) 2012, 2013 Adrian Johnson
 // Copyright (C) 2012 Fabio D'Urso
+// Copyright (C) 2013 Igalia S.L.
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -53,11 +55,15 @@
 #include "UTF.h"
 #include "Error.h"
 #include "DateInfo.h"
+#include "StructTreeRoot.h"
+#include "StructElement.h"
 
 static void printInfoString(Dict *infoDict, const char *key, const char *text,
                             UnicodeMap *uMap);
 static void printInfoDate(Dict *infoDict, const char *key, const char *text);
 static void printBox(const char *text, PDFRectangle *box);
+static void printStruct(const StructElement* element, unsigned indent = 0); // dumps one structure element and, recursively, its children
+static void printIndent(unsigned level); // writes two spaces per level to stdout
 
 static int firstPage = 1;
 static int lastPage = 0;
@@ -70,6 +76,8 @@ static char userPassword[33] = "\001";
 static GBool printVersion = gFalse;
 static GBool printHelp = gFalse;
 static GBool printEnc = gFalse;
+static GBool printStructure = gFalse; // set by -struct (and forced on by -struct-text)
+static GBool printStructureText = gFalse; // set by -struct-text
 
 static const ArgDesc argDesc[] = {
   {"-f", argInt, &firstPage, 0,
@@ -80,6 +88,10 @@ static const ArgDesc argDesc[] = {
    "print the page bounding boxes"},
   {"-meta", argFlag, &printMetadata, 0,
    "print the document metadata (XML)"},
+  {"-struct", argFlag, &printStructure, 0,
+   "print the logical document structure (for tagged files)"},
+  {"-struct-text", argFlag, &printStructureText, 0,
+   "print text contents along with document structure (for tagged files)"},
   {"-rawdates", argFlag, &rawDates, 0,
    "print the undecoded date strings directly from the PDF file"},
   {"-enc", argString, textEncName, sizeof(textEncName),
@@ -136,6 +148,9 @@ int main(int argc, char *argv[]) {
     goto err0;
   }
 
+  if (printStructureText) // -struct-text implies -struct
+    printStructure = gTrue;
+
   // read config file
   globalParams = new GlobalParams();
 
@@ -226,7 +241,13 @@ int main(int argc, char *argv[]) {
 
   // print tagging info
   printf("Tagged: %s\n",
-         doc->getStructTreeRoot()->isDict() ? "yes" : "no");
+         doc->getStructTreeRoot() ? "yes" : "no");
+  printf("Marked: %s\n", // this and the next two fields each test one bit of Catalog::getMarkInfo()
+         (doc->getCatalog()->getMarkInfo() & Catalog::markInfoMarked) ? "yes" : "no");
+  printf("UserProperties: %s\n",
+         (doc->getCatalog()->getMarkInfo() & Catalog::markInfoUserProperties) ? "yes" : "no");
+  printf("Suspects: %s\n",
+         (doc->getCatalog()->getMarkInfo() & Catalog::markInfoSuspects) ? "yes" : "no");
 
   // print form info
   switch (doc->getCatalog()->getFormType())
@@ -371,6 +392,15 @@ int main(int argc, char *argv[]) {
     delete metadata;
   }
 
+  // print the structure (only when -struct/-struct-text was given and the catalog has a structure tree root)
+  const StructTreeRoot* structTree;
+  if (printStructure && (structTree = doc->getCatalog()->getStructTreeRoot())) {
+    fputs("Structure:\n", stdout);
+    for (unsigned i = 0; i < structTree->getNumElements(); i++) {
+      printStruct(structTree->getElement(i), 1); // top-level elements start at indent level 1
+    }
+  }
+
   exitCode = 0;
 
   // clean up
@@ -451,3 +481,74 @@ static void printBox(const char *text, PDFRectangle *box) {
   printf("%s%8.2f %8.2f %8.2f %8.2f\n",
          text, box->x1, box->y1, box->x2, box->y2);
 }
+
+static void printIndent(unsigned indent) { // writes two spaces per indent level to stdout
+  while (indent--) {
+    putchar(' ');
+    putchar(' ');
+  }
+}
+
+static void printAttribute(const Attribute* attribute, unsigned indent) // prints a single attribute without a trailing newline
+{
+  printIndent(indent);
+  printf(" /%s ", attribute->getTypeName());
+  if (attribute->getType() == Attribute::UserProperty) { // only user properties also print their name
+    printf("(%s) ", attribute->getName());
+  }
+  attribute->getValue()->print(stdout);
+  if (attribute->getFormattedValue()) {
+    printf(" \"%s\"", attribute->getFormattedValue());
+  }
+  if (attribute->isHidden()) {
+    printf(" [hidden]");
+  }
+}
+
+static void printStruct(const StructElement* element, unsigned indent) { // prints element at the given indent, then its children at indent + 1
+  if (element->isObjectRef()) {
+    printIndent(indent);
+    printf("Object %i %i\n", element->getObjectRef().num, element->getObjectRef().gen);
+    return; // object references are printed as a single line and not descended into
+  }
+
+  if (printStructureText && element->isContent()) { // content elements are printed only with -struct-text
+    GooString *text = element->getText(NULL, gFalse);
+    printIndent(indent);
+    if (text) {
+      printf("\"%s\"\n", text->getCString());
+    } else {
+      printf("(No content?)\n");
+    }
+    delete text; // deleting a null pointer is a no-op, so no guard is needed
+  }
+
+  if (!element->isContent()) { // non-content element: header line, then attributes, then children
+    printIndent(indent);
+    printf("%s", element->getTypeName());
+    if (element->getID()) {
+      printf(" <%s>", element->getID()->getCString());
+    }
+    if (element->getTitle()) {
+      printf(" \"%s\"", element->getTitle()->getCString());
+    }
+    if (element->getRevision() > 0) {
+      printf(" r%u", element->getRevision());
+    }
+    if (element->isInline() || element->isBlock()) {
+      printf(" (%s)", element->isInline() ? "inline" : "block");
+    }
+    if (element->getNumAttributes()) {
+      putchar(':');
+      for (unsigned i = 0; i < element->getNumAttributes(); i++) {
+        putchar('\n'); // each attribute starts on its own line
+        printAttribute(element->getAttribute(i), indent + 1);
+      }
+    }
+
+    putchar('\n'); // terminates the header line or the last attribute line
+    for (unsigned i = 0; i < element->getNumElements(); i++) {
+      printStruct(element->getElement(i), indent + 1);
+    }
+  }
+}
-- 
1.8.3