From b5690e554e6c0d0104a5cef28848706400f97f1c Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Mon, 27 May 2013 21:26:57 +0300 Subject: [PATCH v8 07/15] Tagged-PDF: Implement the utils/pdfstructtohtml tool This adds a new tool "utils/pdfstructtohtml" that uses the document structure from tagged PDFs to generate an HTML document from it, trying to preserve as much of the structure and some of the styling in the output. This serves also as a demonstration of how to use the Attribute and StructElement classes. --- utils/Makefile.am | 5 + utils/pdfstructtohtml.cc | 555 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 560 insertions(+) create mode 100644 utils/pdfstructtohtml.cc diff --git a/utils/Makefile.am b/utils/Makefile.am index 1dd9a12..609bbd8 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -58,6 +58,7 @@ bin_PROGRAMS = \ pdftohtml \ pdfseparate \ pdfunite \ + pdfstructtohtml \ $(pdftoppm_binary) \ $(pdftocairo_binary) @@ -135,6 +136,10 @@ pdfunite_SOURCES = \ pdfunite.cc \ $(common) +pdfstructtohtml_SOURCES = \ + pdfstructtohtml.cc \ + $(common) + # Yay, automake! It should be able to figure out that it has to dist # pdftoppm.1, but nooo. So we just add it here. diff --git a/utils/pdfstructtohtml.cc b/utils/pdfstructtohtml.cc new file mode 100644 index 0000000..762f9e1 --- /dev/null +++ b/utils/pdfstructtohtml.cc @@ -0,0 +1,555 @@ +//======================================================================== +// +// pdfstructtohtml.cc +// +// Copyright 2013 Igalia S.L. +// +//======================================================================== + +#include "config.h" +#include +#include "GlobalParams.h" +#include "parseargs.h" +#include "PDFDocFactory.h" +#include "PDFDoc.h" +#include "StructElement.h" +#include "StructTreeRoot.h" +#include "TextOutputDev.h" +#include "UnicodeMap.h" +#include +#include + + +static void arrayToRGBColor(Object *value, double &r, double &g, double &b) +{ + Object obj; + r = value->arrayGet(0, &obj)->getNum(); + g = value->arrayGet(1, &obj)->getNum(); + b = value->arrayGet(2, &obj)->getNum(); +} + +static const char *tagDecideL(const StructElement *elem) +{ + const Attribute *attr = elem->findAttribute(Attribute::ListNumbering, gTrue); + Object *value = attr ? attr->getValue() : Attribute::getDefaultValue(Attribute::ListNumbering); + + if (value) { + if (value->isName("Decimal") || + value->isName("UpperRoman") || + value->isName("LowerRoman") || + value->isName("UpperAlpha") || + value->isName("LowerAlphs")) + return "ol"; + } + return "ul"; +} + + +static void attrStyleListNumbering(const StructElement*, Attribute::Type, Object *value, GooString *style) +{ + if (value->isName("None")) style->append("list-style-type: none;"); + else if (value->isName("Disc")) style->append("list-style-type: disc"); + else if (value->isName("Circle")) style->append("list-style-type: circle;"); + else if (value->isName("Square")) style->append("list-style-type: square;"); + else if (value->isName("Decimal")) style->append("list-style-type: decimal;"); + else if (value->isName("UpperRoman")) style->append("list-style-type: upper-roman;"); + else if (value->isName("LowerRoman")) style->append("list-style-type: lower-roman;"); + else if (value->isName("UpperAlpha")) style->append("list-style-type: upper-latin;"); + else if (value->isName("LowerAlphs")) style->append("list-style-type: lower-latin;"); +} + +static void attrStyleTextAlign(const StructElement*, Attribute::Type, Object *value, GooString *style) +{ + if (value->isName("Justify")) style->append("text-align: justify;"); + else if (value->isName("Start")) style->append("text-align: left;"); + else if (value->isName("End")) style->append("text-align: right;"); + else if (value->isName("Center")) style->append("text-align: center;"); +} + +static void attrStyleTextDecoration(const StructElement*, Attribute::Type, Object *value, GooString *style) +{ + if (value->isName("Underline")) style->append("text-decoration: underline;"); + else if (value->isName("Overline")) style->append("text-decoration: overline;"); + else if (value->isName("LineThrough")) style->append("text-decoration: line-through;"); + else if (value->isName("None")) style->append("text-decoration: none;"); +} + +static void attrColRowSpan(const StructElement*, Attribute::Type type, Object *value, GooString *attrs) +{ + if (value->isNum() && value->getNum() > 0.0) { + attrs->appendf(" {0:s}span='{1:u}'", + (type == Attribute::RowSpan) ? "row" : "col", + (unsigned) value->getNum()); + } +} + +static void attrStyleColor(const StructElement*, Attribute::Type type, Object *value, GooString *style) +{ + double r, g, b; + arrayToRGBColor(value, r, g, b); + + if (type == Attribute::BackgroundColor) + style->append("background-"); + style->appendf("color: rgb({0:u}, {1:u}, {2:u});", + ((Guint) (r * 255)) & 0xFF, + ((Guint) (g * 255)) & 0xFF, + ((Guint) (b * 255)) & 0xFF); +} + + +typedef void (*TagAttrBuildFunc)(const StructElement*, Attribute::Type, Object*, GooString*); + +static const struct AttrMapEntry { + Attribute::Type type; + GBool style; + TagAttrBuildFunc buildAttr; +} attrBuildMap[] = { + { Attribute::Color, gTrue, attrStyleColor }, + { Attribute::BackgroundColor, gTrue, attrStyleColor }, + { Attribute::ListNumbering, gTrue, attrStyleListNumbering }, + { Attribute::TextAlign, gTrue, attrStyleTextAlign }, + { Attribute::TextDecorationType, gTrue, attrStyleTextDecoration }, + { Attribute::RowSpan, gFalse, attrColRowSpan }, + { Attribute::ColSpan, gFalse, attrColRowSpan }, +}; + + +typedef const char* (*TagDecideFunc)(const StructElement*); + +static const struct ElementMapEntry { + StructElement::Type type; + const char *tagName; + TagDecideFunc tagDecide; +} elementMap[] = { + { StructElement::P, "p", NULL }, + { StructElement::H1, "h1", NULL }, + { StructElement::H2, "h2", NULL }, + { StructElement::H3, "h3", NULL }, + { StructElement::H4, "h4", NULL }, + { StructElement::H5, "h5", NULL }, + { StructElement::H6, "h6", NULL }, + { StructElement::L, NULL, tagDecideL }, + { StructElement::LI, "li", NULL }, + { StructElement::Table, "table", NULL }, + { StructElement::TR, "tr", NULL }, + { StructElement::TH, "th", NULL }, + { StructElement::TD, "td", NULL }, + { StructElement::TBody, "tbody", NULL }, + { StructElement::THead, "thead", NULL }, + { StructElement::Link, "a", NULL }, +}; + + +static void xmlEscape(FILE *out, const char *str) +{ + while (*str) { + int ch = *str++; + switch (ch) { + case '&' : fputs("&", out); break; + case '\'': fputs("'", out); break; + case '"' : fputs(""", out); break; + case '<' : fputs("<", out); break; + case '>' : fputs(">", out); break; + default : fputc(ch, out); + } + } +} + + +class StructVisitor +{ +public: + StructVisitor(PDFDoc *docA, FILE *output): + inTable(gFalse), + doc(docA), + out(output) + { + assert(doc); + assert(out); + } + + GBool process(); + +private: + void start(); + void finish(); + void visit(const StructElement *elem); + GooString *buildTagAttributes(const StructElement *elem); + + void E(const char *str) { xmlEscape(out, str); } + void O(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + vfprintf(out, fmt, args); + va_end(args); + } + + GBool inTable; + PDFDoc *doc; + FILE *out; +}; + + +class MCOpProcessor { +public: + MCOpProcessor(FILE *output): + out(output), + map(globalParams->getTextEncoding()), + text(), + font(), + flags(0) + { + assert(output); + } + + ~MCOpProcessor() { + emitText(); + map->decRefCnt(); + } + + void process(const MCOpArray& ops) { + for (MCOpArray::const_iterator i = ops.begin(); i != ops.end(); ++i) + process(*i); + } + + void process(const MCOp& op) { + if (op.type == MCOp::Unichar) { + int n = map->mapUnicode(op.unichar, buf, sizeof(buf)); + text.append(buf, n); + return; + } + + Guint oldFlags = flags; + if (op.type == MCOp::Flags) { + if (op.flags & MCOp::FlagFontBold) + flags |= FBold; + else + flags &= ~FBold; + + if (op.flags & MCOp::FlagFontFixed) + flags |= FFixed; + else + flags &= ~FFixed; + + if (op.flags & MCOp::FlagFontItalic) + flags |= FItalic; + else + flags &= ~FItalic; + } + + if (op.type == MCOp::FontName) { + if (op.value) { + flags |= FFont; + font.append(op.value); + } else { + flags &= ~FFont; + } + } + + if (op.type == MCOp::Color && (color = op.color.rgbPixel())) + flags |= FColor; + else + flags &= ~FColor; + + if (flags != oldFlags) + emitText(); + } + +private: + enum { + FColor = (1 << 0), + FBold = (1 << 1), + FFixed = (1 << 2), + FItalic = (1 << 3), + FFont = (1 << 4), + + FUseSpan = (FColor | FFont), + }; + + void E(const char *str) { xmlEscape(out, str); } + void O(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + vfprintf(out, fmt, args); + va_end(args); + } + + void emitText() { + if (!flags) { + E(text.c_str()); + text.clear(); + return; + } + + if (flags & FBold) O(""); + if (flags & FItalic) O(""); + if (flags & FFixed) O(""); + + if (flags & FUseSpan) { + O(""); + } + + E(text.c_str()); + + if (flags & FUseSpan) + O(""); + + if (flags & FFixed) O(""); + if (flags & FItalic) O(""); + if (flags & FBold) O(""); + + font.clear(); + text.clear(); + } + + FILE *out; + UnicodeMap *map; + std::string text; + std::string font; + Guint flags; + char buf[8]; + Guint color; +}; + + +GBool StructVisitor::process() +{ + StructTreeRoot *root = doc->getStructTreeRoot(); + if (!root) + return gFalse; + + start(); + for (unsigned i = 0; i < root->getNumElements(); i++) + visit(root->getElement(i)); + finish(); + + return gTrue; +} + +void StructVisitor::start() +{ + O("\n" + "\n" + " \n" + " \n"); + + Object info; + if (doc->getDocInfo(&info)->isDict()) { + Object obj; + if (info.dictLookup("Title", &obj)->isString()) { + O(" "); E(obj.getString()->getCString()); O("\n"); + } + } + + O(" \n" + " \n" + " \n\n"); +} + +void StructVisitor::finish() +{ + O("\n" + " \n" + "\n"); +} + + +GooString *StructVisitor::buildTagAttributes(const StructElement *elem) +{ + GooString *style = NULL; + GooString *attrs = NULL; + + if (elem->getType() == StructElement::Link) { + // XXX This is crude, but gets the job done. + for (unsigned i = 0; i < elem->getNumElements(); i++) { + if (elem->getElement(i)->isObjectRef()) { + Ref ref = elem->getElement(i)->getObjectRef(); + Object value; + if (doc->getXRef()->fetch(ref.num, ref.gen, &value)->isDict("Annot")) { + Object action; + if (value.dictLookup("A", &action)->isDict("Action")) { + Object uri; + if (action.dictLookup("URI", &uri)->isString()) { + if (!attrs) attrs = new GooString(); + attrs->appendf(" href='{0:s}'", uri.getString()); + } + uri.free(); + } + action.free(); + } + value.free(); + break; + } + } + } + + for (unsigned i = 0; i < elem->getNumAttributes(); i++) { + const Attribute *attr = elem->getAttribute(i); + const AttrMapEntry *entry = NULL; + for (unsigned j = 0; j < sizeof(attrBuildMap) / sizeof(attrBuildMap[0]); j++) { + if (attrBuildMap[j].type == attr->getType()) { + entry = &attrBuildMap[j]; + break; + } + } + if (entry) { + if (entry->style) { + if (!style) style = new GooString(); + (*entry->buildAttr)(elem, attr->getType(), attr->getValue(), style); + } else { + if (!attrs) attrs = new GooString(); + (*entry->buildAttr)(elem, attr->getType(), attr->getValue(), attrs); + } + } + } + + if (style) { + if (!attrs) attrs = new GooString(); + attrs->append(" style='"); + attrs->append(style); + attrs->append("'"); + } + + return attrs; +} + + +void StructVisitor::visit(const StructElement *elem) +{ + if (elem->isContent()) { + const GooString *text; + + if ((text = elem->getAltText()) || (text = elem->getActualText())) + E(text->getCString()); + else { + const MCOpArray& ops(elem->getMCOps()); + if (ops.size()) { + MCOpProcessor processor(out); + processor.process(ops); + } else if ((text = elem->getText())) { + E(text->getCString()); + delete text; + } + } + } else { + const ElementMapEntry *entry = NULL; + for (unsigned i = 0; i < sizeof(elementMap) / sizeof(elementMap[0]); i++) + if (elementMap[i].type == elem->getType()) + entry = &elementMap[i]; + + if (entry) { + assert(entry->tagName || entry->tagDecide); + const char *tag = entry->tagName ? entry->tagName : (*entry->tagDecide)(elem); + if (!(inTable && entry->type == StructElement::P)) { + GooString *attrs = buildTagAttributes(elem); + O("<%s", tag); + if (attrs) O("%s", attrs->getCString()); + delete attrs; + O(">"); + } + if (entry->type == StructElement::Table) + inTable = gTrue; + for (unsigned i = 0; i < elem->getNumElements(); i++) + visit(elem->getElement(i)); + inTable = gFalse; + if (!(inTable && entry->type == StructElement::P)) + O("\n", tag); + } else { + for (unsigned i = 0; i < elem->getNumElements(); i++) + visit(elem->getElement(i)); + } + } +} + + +static char ownerPassword[33] = "\001"; +static char userPassword[33] = "\001"; +static GBool printHelp = gFalse; + +static const ArgDesc argDesc[] = { + { "-opw", argString, ownerPassword, sizeof(ownerPassword), + "owner password (for encrypted files)" }, + { "-upw", argString, userPassword, sizeof(userPassword), + "user password (for encrypted files)" }, + { "-h", argFlag, &printHelp, 0, + "print usage information" }, + { "-help", argFlag, &printHelp, 0, + "print usage information" }, + { "--help", argFlag, &printHelp, 0, + "print usage information" }, + { NULL } +}; + +int main(int argc, char **argv) +{ + PDFDoc *doc; + GooString *ownerPW, *userPW, *fileName; + GBool ok; + int exitCode = 99; + StructVisitor *v = NULL; + FILE *output = stdout; + + ok = parseArgs(argDesc, &argc, argv); + if (!ok || (argc < 2) || (argc > 3) || printHelp) { + fprintf(stderr, "pdfstructtohtml version " PACKAGE_VERSION "\n"); + fprintf(stderr, "%s\n", popplerCopyright); + fprintf(stderr, "%s\n", xpdfCopyright); + printUsage("pdfstructtohtml", " []", argDesc); + if (printHelp) + exitCode = EXIT_SUCCESS; + goto err0; + } + + ownerPW = (ownerPassword[0] != '\001') ? new GooString(ownerPassword) : NULL; + userPW = (userPassword[0] != '\001') ? new GooString(userPassword) : NULL; + fileName = new GooString(argv[1]); + + if (fileName->cmp("-") == 0) + fileName->Set("fd://0"); + + globalParams = new GlobalParams(); + globalParams->setTextEncoding("UTF-8"); + + doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW); + delete ownerPW; + delete userPW; + + if (!doc->isOk()) { + exitCode = 1; + goto err1; + } + + if (argc == 3) { + if (!(output = fopen(argv[2], "wb"))) { + exitCode = 2; + goto err1; + } + } + + v = new StructVisitor(doc, output); + v->process(); + + if (output != stdout) + fclose(output); + + delete v; + +err1: + delete doc; + delete fileName; + delete globalParams; + +err0: + Object::memCheck(stderr); + gMemReport(stderr); + return exitCode; +} -- 1.8.4