From fadbb3b1abf39eaf6956f1bbabed33f211668557 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Thu, 9 May 2013 19:11:26 +0300 Subject: [PATCH v8 06/15] Tagged-PDF: Modify pdfinfo to show the document structure --- utils/pdfinfo.cc | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index 22d9edd..785f460 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -3,6 +3,7 @@ // pdfinfo.cc // // Copyright 1998-2003 Glyph & Cog, LLC +// Copyright 2013 Igalia S.L. // //======================================================================== @@ -19,7 +20,7 @@ // Copyright (C) 2011 Vittal Aithal // Copyright (C) 2012, 2013 Adrian Johnson // Copyright (C) 2012 Fabio D'Urso -// Copyright (C) 2013 Adrian Perez de Castro +// Copyright (C) 2013 Igalia S.L. // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -55,11 +56,15 @@ #include "Error.h" #include "DateInfo.h" #include "JSInfo.h" +#include "StructTreeRoot.h" +#include "StructElement.h" static void printInfoString(Dict *infoDict, const char *key, const char *text, UnicodeMap *uMap); static void printInfoDate(Dict *infoDict, const char *key, const char *text); static void printBox(const char *text, PDFRectangle *box); +static void printStruct(const StructElement *element, unsigned indent = 0); /* prints a structure element and, recursively, its children; indent is the nesting depth */ +static void printIndent(unsigned level); /* writes two spaces to stdout per level */ static int firstPage = 1; static int lastPage = 0; @@ -73,6 +78,8 @@ static char userPassword[33] = "\001"; static GBool printVersion = gFalse; static GBool printHelp = gFalse; static GBool printEnc = gFalse; +static GBool printStructure = gFalse; /* set by the -struct option */ +static GBool printStructureText = gFalse; /* set by the -struct-text option; main() turns printStructure on as well when this is set */ static const ArgDesc argDesc[] = { {"-f", argInt, &firstPage, 0, @@ -85,6 +92,10 @@ static const ArgDesc argDesc[] = { "print the document metadata (XML)"}, {"-js", argFlag, &printJS, 0, 
"print all JavaScript in the PDF"}, + {"-struct", argFlag, &printStructure, 0, + "print the logical document structure (for tagged files)"}, + {"-struct-text", argFlag, &printStructureText, 0, + "print text contents along with document structure (for tagged files)"}, {"-rawdates", argFlag, &rawDates, 0, "print the undecoded date strings directly from the PDF file"}, {"-enc", argString, textEncName, sizeof(textEncName), @@ -141,6 +152,9 @@ int main(int argc, char *argv[]) { goto err0; } + if (printStructureText) + printStructure = gTrue; /* -struct-text implies -struct */ + // read config file globalParams = new GlobalParams(); @@ -394,6 +408,15 @@ int main(int argc, char *argv[]) { jsInfo.scanJS(lastPage - firstPage + 1, stdout, uMap); } + // print the structure + const StructTreeRoot *structTree; /* assigned inside the condition below; NOTE(review): presumably left uninitialized so that earlier goto exits do not cross an initialization - confirm */ + if (printStructure && (structTree = doc->getCatalog()->getStructTreeRoot())) { + fputs("Structure:\n", stdout); + for (unsigned i = 0; i < structTree->getNumElements(); i++) { + printStruct(structTree->getElement(i), 1); /* top-level elements are printed at indent level 1 */ + } + } + exitCode = 0; // clean up @@ -474,3 +497,74 @@ static void printBox(const char *text, PDFRectangle *box) { printf("%s%8.2f %8.2f %8.2f %8.2f\n", text, box->x1, box->y1, box->x2, box->y2); } + +static void printIndent(unsigned indent) { /* Writes two spaces to stdout for each indent level. */ + while (indent--) { + putchar(' '); + putchar(' '); + } +} + +static void printAttribute(const Attribute *attribute, unsigned indent) +{ /* Prints one attribute on the current line: indentation, type name, the name in parentheses for user properties, the value, then the quoted formatted value and a [hidden] marker when present. No newline is written here; printStruct emits the line breaks. */ + printIndent(indent); + printf(" /%s ", attribute->getTypeName()); + if (attribute->getType() == Attribute::UserProperty) { + printf("(%s) ", attribute->getName()); + } + attribute->getValue()->print(stdout); + if (attribute->getFormattedValue()) { + printf(" \"%s\"", attribute->getFormattedValue()); + } + if (attribute->isHidden()) { + printf(" [hidden]"); + } +} + +static void printStruct(const StructElement *element, unsigned indent) { /* Prints element at the given indent level and recurses into its children. */ + if (element->isObjectRef()) { /* object references are printed as their num and gen and are not descended into */ + printIndent(indent); + printf("Object %i %i\n", element->getObjectRef().num, element->getObjectRef().gen); + return; + } + + if 
(printStructureText && element->isContent()) { /* content items are shown only when printStructureText is set */ + GooString *text = element->getText(NULL, gFalse); + printIndent(indent); + if (text) { + printf("\"%s\"\n", text->getCString()); + } else { + printf("(No content?)\n"); + } + delete text; /* NOTE(review): assumes getText() hands back a newly allocated string owned by the caller - confirm */ + } + + if (!element->isContent()) { /* every non-content element: type name followed by optional ID, title, revision, inline or block marker and attributes */ + printIndent(indent); + printf("%s", element->getTypeName()); + if (element->getID()) { + printf(" <%s>", element->getID()->getCString()); + } + if (element->getTitle()) { + printf(" \"%s\"", element->getTitle()->getCString()); + } + if (element->getRevision() > 0) { + printf(" r%u", element->getRevision()); + } + if (element->isInline() || element->isBlock()) { + printf(" (%s)", element->isInline() ? "inline" : "block"); + } + if (element->getNumAttributes()) { + putchar(':'); /* a colon announces the attribute list; each attribute goes on its own line */ + for (unsigned i = 0; i < element->getNumAttributes(); i++) { + putchar('\n'); + printAttribute(element->getAttribute(i), indent + 1); + } + } + + putchar('\n'); /* finish the current line before printing the children one level deeper */ + for (unsigned i = 0; i < element->getNumElements(); i++) { + printStruct(element->getElement(i), indent + 1); + } + } +} -- 1.8.4