From cb35464ab9aa0c23650b705a33c9c5a1cf176476 Mon Sep 17 00:00:00 2001 From: Adrian Johnson Date: Sun, 26 Nov 2017 20:43:15 +1030 Subject: [PATCH 2/8] cairo: write document logical structure if output is pdf Now that cairo can create tagged pdf, make CairoOutputDev copy the logical structure from the input pdf if available. Added setLogicalStructure() to enable. Added -struct option to pdftocairo to enable. Bug #103912 --- glib/poppler-page.cc | 2 +- poppler/CairoOutputDev.cc | 415 +++++++++++++++++++++++++++++++++++++- poppler/CairoOutputDev.h | 42 +++- poppler/Gfx.cc | 6 +- poppler/MarkedContentOutputDev.cc | 4 +- poppler/MarkedContentOutputDev.h | 4 +- poppler/OutputDev.cc | 2 +- poppler/OutputDev.h | 4 +- poppler/PSOutputDev.cc | 4 +- poppler/PSOutputDev.h | 2 +- poppler/Page.cc | 12 +- poppler/Page.h | 5 + poppler/PreScanOutputDev.cc | 2 +- poppler/PreScanOutputDev.h | 2 +- poppler/SplashOutputDev.cc | 2 +- poppler/SplashOutputDev.h | 2 +- poppler/StructElement.h | 1 + poppler/TextOutputDev.cc | 4 +- poppler/TextOutputDev.h | 2 +- poppler/UTF.cc | 92 ++++++--- poppler/UTF.h | 13 +- qt4/src/ArthurOutputDev.cc | 2 +- qt4/src/ArthurOutputDev.h | 2 +- qt5/src/ArthurOutputDev.cc | 2 +- qt5/src/ArthurOutputDev.h | 2 +- test/gtk-test.cc | 6 +- utils/HtmlOutputDev.cc | 2 +- utils/HtmlOutputDev.h | 2 +- utils/pdftocairo.1 | 4 + utils/pdftocairo.cc | 14 +- 30 files changed, 584 insertions(+), 74 deletions(-) diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc index a44edac6..87d21e5a 100644 --- a/glib/poppler-page.cc +++ b/glib/poppler-page.cc @@ -274,7 +274,7 @@ poppler_page_get_text_page (PopplerPage *page) gFalse, /* printing */ NULL, NULL); page->page->display(gfx); - text_dev->endPage(); + text_dev->endPage(nullptr); page->text = text_dev->takeText(); delete gfx; diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc index cc8a161b..afdd3a21 100644 --- a/poppler/CairoOutputDev.cc +++ b/poppler/CairoOutputDev.cc @@ -46,6 +46,7 @@ #include #include 
#include +#include #include "goo/gfile.h" #include "GlobalParams.h" @@ -64,6 +65,7 @@ #include "CairoFontEngine.h" #include "CairoRescaleBox.h" #include "UnicodeMap.h" +#include "UTF.h" #include "JBIG2Stream.h" //------------------------------------------------------------------------ @@ -166,6 +168,10 @@ CairoOutputDev::CairoOutputDev() { text = NULL; actualText = NULL; + logicalStruct = false; + pdfPageNum = 0; + cairoPageNum = 0; + forwardLinkCount = 0; // the SA parameter supposedly defaults to false, but Acrobat // apparently hardwires it to true @@ -173,6 +179,7 @@ CairoOutputDev::CairoOutputDev() { align_stroke_coords = gFalse; adjusted_stroke_width = gFalse; xref = NULL; + annotations = nullptr; } CairoOutputDev::~CairoOutputDev() { @@ -193,7 +200,9 @@ CairoOutputDev::~CairoOutputDev() { if (text) text->decRefCnt(); if (actualText) - delete actualText; + delete actualText; + if (annotations) + delete annotations; } void CairoOutputDev::setCairo(cairo_t *cairo) @@ -201,7 +210,7 @@ void CairoOutputDev::setCairo(cairo_t *cairo) if (this->cairo != NULL) { cairo_status_t status = cairo_status (this->cairo); if (status) { - error(errInternal, -1, "cairo context error: {0:s}\n", cairo_status_to_string(status)); + error(errInternal, -1, "cairo context error: {0:s}", cairo_status_to_string(status)); } cairo_destroy (this->cairo); assert(!cairo_shape); @@ -218,6 +227,13 @@ void CairoOutputDev::setCairo(cairo_t *cairo) } } +GBool CairoOutputDev::isPDF() +{ + if (cairo) + return cairo_surface_get_type (cairo_get_target (cairo)) == CAIRO_SURFACE_TYPE_PDF; + return gFalse; +} + void CairoOutputDev::setTextPage(TextPage *text) { if (this->text) @@ -267,10 +283,75 @@ void CairoOutputDev::startDoc(PDFDoc *docA, fontEngine_owner = gTrue; } xref = doc->getXRef(); + + destsMap.clear(); + pdfPageRefToCairoPageNumMap.clear(); + cairoPageNum = 0; + forwardLinkCount = 0; + firstPage = gTrue; +} + +void CairoOutputDev::textStringToQuotedUtf8(GooString *text, GooString *s) +{ + char 
*utf8; + int len; + + utf8 = TextStringToUtf8(text); + len = strlen(utf8); + s->Set("'"); + for (int i = 0; i < len; i++) { + if (utf8[i] == '\\') { + s->append("\\\\"); + } else if (utf8[i] == '\'') { + s->append("\\'"); + } else { + s->append(utf8[i]); + } + } + s->append("'"); + gfree(utf8); +} + +// Initialization that needs to be performed after setCairo() is called. +void CairoOutputDev::startFirstPage(int pageNum, GfxState *state, XRef *xrefA) +{ + if (xrefA != NULL) { + xref = xrefA; + } + + if (logicalStruct && isPDF()) { + int numDests = doc->getCatalog()->numDestNameTree(); + for (int i = 0; i < numDests; i++) { + GooString *name = doc->getCatalog()->getDestNameTreeName(i); + LinkDest *dest = doc->getCatalog()->getDestNameTreeDest(i); + if (dest && dest->isPageRef()) { + destsMap[dest->getPageRef()].insert( + std::make_pair(std::unique_ptr<GooString>(name->copy()), std::unique_ptr<LinkDest>(dest))); + } else { + delete dest; + } + } + + numDests = doc->getCatalog()->numDests(); + for (int i = 0; i < numDests; i++) { + const char *name = doc->getCatalog()->getDestsName(i); + LinkDest *dest = doc->getCatalog()->getDestsDest(i); + if (dest && dest->isPageRef()) { + destsMap[dest->getPageRef()].insert( + std::make_pair(std::unique_ptr<GooString>(new GooString(name)), + std::unique_ptr<LinkDest>(dest))); + } else { delete dest; } + } + } } void CairoOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA) { - /* set up some per page defaults */ + if (firstPage) { + startFirstPage(pageNum, state, xrefA); + firstPage = gFalse; + } + +/* set up some per page defaults */ cairo_pattern_destroy(fill_pattern); cairo_pattern_destroy(stroke_pattern); @@ -281,16 +362,67 @@ void CairoOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA) { if (text) text->startPage(state); - if (xrefA != NULL) { - xref = xrefA; + pdfPageNum = pageNum; + cairoPageNum++; + + if (logicalStruct && isPDF()) { + if (annotations) + delete annotations; + Object obj = doc->getPage(pageNum)->getAnnotsObject(xref); + annotations = new Annots(doc, pageNum,
&obj); + + // remove non Links + int i = 0; + while (i < annotations->getNumAnnots()) { + if (annotations->getAnnot(i)->getType() != Annot::typeLink) + annotations->removeAnnot(annotations->getAnnot(i)); + else + i++; + } + + // emit dests + Ref *ref = doc->getCatalog()->getPageRef(pageNum); + pdfPageRefToCairoPageNumMap[*ref] = cairoPageNum; + auto pageDests = destsMap.find(*ref); + if (pageDests != destsMap.end()) { + for (auto& it: pageDests->second) { + GooString name; + textStringToQuotedUtf8(it.first.get(), &name); + + GooString attrib; + attrib.appendf("name={0:t} ", &name); + if (it.second->getChangeLeft()) + attrib.appendf("x={0:g} ", it.second->getLeft()); + if (it.second->getChangeTop()) + attrib.appendf("y={0:g} ", state->getPageHeight() - it.second->getTop()); + +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_begin (cairo, CAIRO_TAG_DEST, attrib.getCString()); + cairo_tag_end (cairo, CAIRO_TAG_DEST); +#endif + } + } } } -void CairoOutputDev::endPage() { +void CairoOutputDev::endPage(GfxState *state) { if (text) { text->endPage(); text->coalesce(gTrue, 0, gFalse); } + + if (logicalStruct && annotations && isPDF()) { + for (int i = 0; i < annotations->getNumAnnots(); i++) { + Annot *annot = annotations->getAnnot(i); + if (annot->getType() == Annot::typeLink) { + AnnotLink *linkAnnot = static_cast(annot); + beginLinkTag(state, linkAnnot); +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_end (cairo, "Link"); +#endif + } + } + } } void CairoOutputDev::saveState(GfxState *state) { @@ -399,7 +531,7 @@ void CairoOutputDev::updateCTM(GfxState *state, double m11, double m12, * instead of having to invert the matrix. 
*/ invert_matrix = matrix; if (cairo_matrix_invert(&invert_matrix)) { - error(errSyntaxWarning, -1, "matrix not invertible\n"); + error(errSyntaxWarning, -1, "matrix not invertible"); return; } @@ -1017,7 +1149,7 @@ GBool CairoOutputDev::functionShadedFill(GfxState *state, GfxFunctionShading *sh mat.x0 = matrix[4]; mat.y0 = matrix[5]; if (cairo_matrix_invert(&mat)) { - error(errSyntaxWarning, -1, "matrix not invertible\n"); + error(errSyntaxWarning, -1, "matrix not invertible"); return gFalse; } @@ -3279,6 +3411,273 @@ void CairoOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, cairo_pattern_destroy (pattern); } +void CairoOutputDev::findLinkObject(const StructElement *elem) +{ + if (elem->isObjectRef()) { + Ref ref = elem->getObjectRef(); + for (int i = 0; i < annotations->getNumAnnots(); i++) { + Annot *annot = annotations->getAnnot(i); + if (annot->getType() == Annot::typeLink && annot->match(&ref)) { + linkAnnot = static_cast<AnnotLink *>(annot); + break; + } + } + return; + } + + for (unsigned i = 0; i < elem->getNumChildren(); i++) { + findLinkObject(elem->getChild(i)); + } +} + +void CairoOutputDev::quadToCairoRect(AnnotQuadrilaterals *quads, + int idx, + double pageHeight, + cairo_rectangle_t *rect) +{ + double x1, x2, y1, y2; + x1 = x2 = quads->getX1(idx); + y1 = y2 = quads->getY1(idx); + + x1 = std::min(x1, quads->getX2(idx)); + x1 = std::min(x1, quads->getX3(idx)); + x1 = std::min(x1, quads->getX4(idx)); + + y1 = std::min(y1, quads->getY2(idx)); + y1 = std::min(y1, quads->getY3(idx)); + y1 = std::min(y1, quads->getY4(idx)); + + x2 = std::max(x2, quads->getX2(idx)); + x2 = std::max(x2, quads->getX3(idx)); + x2 = std::max(x2, quads->getX4(idx)); + + y2 = std::max(y2, quads->getY2(idx)); + y2 = std::max(y2, quads->getY3(idx)); + y2 = std::max(y2, quads->getY4(idx)); + + rect->x = x1; + rect->y = pageHeight - y2; + rect->width = x2 - x1; + rect->height = y2 - y1; +} + + +void CairoOutputDev::appendLinkDestRef(GooString *s, LinkDest *dest) +{ + Ref ref =
dest->getPageRef(); + auto pageNum = pdfPageRefToCairoPageNumMap.find(ref); + if (pageNum != pdfPageRefToCairoPageNumMap.end()) { + // Link page ref is to a page already emitted. + s->appendf("page={0:d} ", pageNum->second); + double destPageHeight = doc->getPageMediaHeight(dest->getPageNum()); + appendLinkDestXY(s, dest, destPageHeight); + } else { + // Link page ref is to a page that has not been emitted. + // Create a named destination and add the name to destsMap so destination will + // be create if/when the page is emitted. + GooString *name = new GooString(); + name->appendf("poppler-cairo-dest-{0:d}", ++forwardLinkCount); + s->appendf("dest='{0:t}' ", name); + destsMap[ref].insert(std::make_pair(std::unique_ptr(name), + std::unique_ptr(dest->copy()))); + } +} + +void CairoOutputDev::appendLinkDestXY(GooString *s, LinkDest *dest, double destPageHeight) +{ + double x = 0; + double y = 0; + + if (dest->getChangeLeft()) + x = dest->getLeft(); + + if (dest->getChangeTop()) + y = dest->getTop(); + + // if pageHeight is 0, dest is remote document, cairo uses PDF coords in this + // case. So don't flip coords when pageHeight is 0. + s->appendf("pos=[{0:g} {1:g}] ", + x, + destPageHeight ? 
destPageHeight - y : y); +} + +void CairoOutputDev::beginLinkTag(GfxState *state, AnnotLink *linkAnnot) +{ + GooString attrib; + attrib.append("rect=["); + AnnotQuadrilaterals *quads = linkAnnot->getQuadrilaterals(); + if (quads && quads->getQuadrilateralsLength() > 0) { + for (int i = 0; i < quads->getQuadrilateralsLength(); i++) { + cairo_rectangle_t rect; + quadToCairoRect(quads, i, state->getPageHeight(), &rect); + attrib.appendf("{0:g} {1:g} {2:g} {3:g} ", rect.x, rect.y, rect.width, rect.height); + } + } else { + double x1, x2, y1, y2; + linkAnnot->getRect(&x1, &y1, &x2, &y2); + attrib.appendf("{0:g} {1:g} {2:g} {3:g} ", + x1, + state->getPageHeight() - y2, + x2 - x1, + y2 - y1); + } + attrib.append("] "); + + LinkAction *action = linkAnnot->getAction(); + if (action && action->getKind() == actionGoTo) { + LinkGoTo *act = static_cast<LinkGoTo *>(action); + if (act->isOk()) { + GooString *namedDest = act->getNamedDest(); + LinkDest *linkDest = act->getDest(); + if (namedDest) { + GooString name; + textStringToQuotedUtf8(namedDest, &name); + attrib.appendf("dest={0:t} ", &name); + } else if (linkDest && linkDest->isOk() && linkDest->isPageRef()) { + appendLinkDestRef(&attrib, linkDest); + } + } + } else if (action && action->getKind() == actionGoToR) { + LinkGoToR *act = static_cast<LinkGoToR *>(action); + attrib.appendf("file='{0:t}' ", act->getFileName()); + GooString *namedDest = act->getNamedDest(); + LinkDest *linkDest = act->getDest(); + if (namedDest) { + GooString name; + textStringToQuotedUtf8(namedDest, &name); + attrib.appendf("dest={0:t} ", &name); + } else if (linkDest && linkDest->isOk() && !linkDest->isPageRef()) { + attrib.appendf("page={0:d} ", linkDest->getPageNum()); + appendLinkDestXY(&attrib, linkDest, 0.0); + } + } else if (action && action->getKind() == actionURI) { + LinkURI *act = static_cast<LinkURI *>(action); + if (act->isOk()) { + attrib.appendf("uri='{0:t}'", act->getURI()); + } + } +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_begin (cairo, "Link", attrib.getCString());
+#endif +} + +void CairoOutputDev::beginLink(GfxState *state, const StructElement *linkElem) +{ + linkAnnot = nullptr; + findLinkObject(linkElem); + if (linkAnnot) { + beginLinkTag (state, linkAnnot); + annotations->removeAnnot(linkAnnot); + } else { +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_begin (cairo, linkElem->getTypeName(), nullptr); +#endif + } +} + +void CairoOutputDev::beginMarkedContent(GfxState *state, char *name, Dict *properties) +{ + if (!logicalStruct || !isPDF()) + return; + + markedContentStack.push_back (nullptr); + + int mcid = -1; + if (properties) + properties->lookupInt("MCID", nullptr, &mcid); + + if (mcid == -1) + return; + + // find StructElement for this marked content + int structParents = doc->getPage(pdfPageNum)->getStructParents(); + if (structParents < 0) + return; + + StructTreeRoot *root; + const StructElement *contentElem; + root = doc->getStructTreeRoot(); + if (!root) + return; + + contentElem = root->findParentElement(structParents, (unsigned)mcid); + if (!contentElem) + return; + + markedContentStack.back() = contentElem; + + // Walk back up tree to build a list of parent elements that need to + // be open before contentElem. + std::deque elemStack; + const StructElement *parent = contentElem->getParent(); + while (parent) { + elemStack.push_front(parent); + parent = parent->getParent(); + } + + // Find the common ancestors on elemStack with cairoTagStack. When we + // find the first noncommon element on cairoTagStack, close all subsequent + // tags then open the noncommon elemStack elements. 
+ unsigned i = 0; + while (i < cairoTagStack.size() && i < elemStack.size()) { + if (cairoTagStack[i] != elemStack[i]) + break; + i++; + } + + // close noncommon cairo tags + while (cairoTagStack.size() > i) { +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_end (cairo, cairoTagStack.back()->getTypeName()); +#endif + cairoTagStack.pop_back(); + } + + // open noncommon elemStack tags + while (i < elemStack.size()) { + if (elemStack[i]->getType() == StructElement::Link) { + beginLink(state, elemStack[i]); + } else { +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_begin (cairo, elemStack[i]->getTypeName(), nullptr); +#endif + } + cairoTagStack.push_back(elemStack[i]); + i++; + } + + // emit contentElem + if (contentElem->getType() == StructElement::Link) { + beginLink(state, contentElem); + } else { +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_begin (cairo, contentElem->getTypeName(), nullptr); +#endif + } + cairoTagStack.push_back(contentElem); +} + +void CairoOutputDev::endMarkedContent(GfxState *state) +{ + if (!logicalStruct || !isPDF()) + return; + + if (markedContentStack.size() == 0) + return; + + const StructElement *elem = markedContentStack.back(); + markedContentStack.pop_back(); + if (elem) { + if (cairoTagStack.size() == 0 || cairoTagStack.back() != elem) { + assert(0 && "mismatch between cairoTagStack and markedContentStack\n"); + } +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + cairo_tag_end (cairo, elem->getTypeName()); +#endif + cairoTagStack.pop_back(); + } +} + //------------------------------------------------------------------------ // ImageOutputDev diff --git a/poppler/CairoOutputDev.h b/poppler/CairoOutputDev.h index 1fee8121..01ba43b8 100644 --- a/poppler/CairoOutputDev.h +++ b/poppler/CairoOutputDev.h @@ -36,11 +36,18 @@ #pragma interface #endif +#include +#include + #include "goo/gtypes.h" #include #include "OutputDev.h" #include "TextOutputDev.h" #include "GfxState.h" +#include 
"StructElement.h" +#include "StructTreeRoot.h" +#include "Annot.h" +#include "Link.h" class PDFDoc; class GfxState; @@ -133,7 +140,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. - void endPage() override; + void endPage(GfxState *state) override; //----- save/restore graphics state void saveState(GfxState *state) override; @@ -256,17 +263,22 @@ public: void type3D1(GfxState *state, double wx, double wy, double llx, double lly, double urx, double ury) override; + virtual void beginMarkedContent(GfxState *state, char *name, Dict *properties) override; + virtual void endMarkedContent(GfxState *state) override; + //----- special access // Called to indicate that a new PDF document has been loaded. void startDoc(PDFDoc *docA, CairoFontEngine *fontEngine = NULL); - + void startFirstPage(int pageNum, GfxState *state, XRef *xrefA); + GBool isReverseVideo() { return gFalse; } void setCairo (cairo_t *cr); void setTextPage (TextPage *text); void setPrinting (GBool printing) { this->printing = printing; needFontUpdate = gTrue; } void setAntialias(cairo_antialias_t antialias); + void setLogicalStructure(GBool logStruct) { this->logicalStruct = logStruct; } void setInType3Char(GBool inType3Char) { this->inType3Char = inType3Char; } void getType3GlyphWidth (double *wx, double *wy) { *wx = t3_glyph_wx; *wy = t3_glyph_wy; } @@ -286,10 +298,19 @@ protected: GfxImageColorMap *colorMap, cairo_surface_t *image); void fillToStrokePathClip(GfxState *state); void alignStrokeCoords(GfxSubpath *subpath, int i, double *x, double *y); + void findLinkObject(const StructElement *elem); + void quadToCairoRect(AnnotQuadrilaterals *quads, int idx, double destPageHeight, cairo_rectangle_t *rect); + void appendLinkDestRef(GooString *s, LinkDest *dest); + void appendLinkDestXY(GooString *s, LinkDest *dest, double destPageHeight); + void beginLinkTag(GfxState *state, AnnotLink *linkAnnot); + void beginLink(GfxState *state, const StructElement *linkElem); 
#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 14, 0) GBool setMimeDataForJBIG2Globals (Stream *str, cairo_surface_t *image); #endif static void setContextAntialias(cairo_t *cr, cairo_antialias_t antialias); + static void textStringToQuotedUtf8(GooString *text, GooString *s); + GBool isPDF(); + GfxRGB fill_color, stroke_color; cairo_pattern_t *fill_pattern, *stroke_pattern; @@ -344,6 +365,14 @@ protected: double t3_glyph_bbox[4]; cairo_antialias_t antialias; GBool prescaleImages; + GBool logicalStruct; + GBool firstPage; + int pdfPageNum; // page number of the PDF file + int cairoPageNum; // page number in cairo output + std::vector cairoTagStack; + std::vector markedContentStack; + Annots *annotations; + AnnotLink *linkAnnot; TextPage *text; // text for the current page ActualText *actualText; @@ -368,6 +397,15 @@ protected: struct MaskStack *next; } *maskStack; + struct GooStringCompare { + bool operator() (const std::unique_ptr& lhs, const std::unique_ptr& rhs) const { + return lhs->cmp(rhs.get()) < 0; + } + }; + + std::map,std::unique_ptr,GooStringCompare>, RefCompare > destsMap; + std::map pdfPageRefToCairoPageNumMap; + int forwardLinkCount; }; //------------------------------------------------------------------------ diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc index 63346a4c..52809846 100644 --- a/poppler/Gfx.cc +++ b/poppler/Gfx.cc @@ -669,7 +669,7 @@ Gfx::~Gfx() { popStateGuard(); } if (!subPage) { - out->endPage(); + out->endPage(state); } // There shouldn't be more saves, but pop them if there were any while (state->hasSaves()) { @@ -5063,9 +5063,9 @@ void Gfx::opBeginMarkedContent(Object args[], int numArgs) { ocState = !contentIsHidden(); if (numArgs == 2 && args[1].isDict()) { - out->beginMarkedContent(args[0].getName(), args[1].getDict()); + out->beginMarkedContent(state, args[0].getName(), args[1].getDict()); } else if(numArgs == 1) { - out->beginMarkedContent(args[0].getName(), NULL); + out->beginMarkedContent(state, args[0].getName(), NULL); } } 
diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc index 7fdd8f54..c5a2f5b8 100644 --- a/poppler/MarkedContentOutputDev.cc +++ b/poppler/MarkedContentOutputDev.cc @@ -63,13 +63,13 @@ void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) } -void MarkedContentOutputDev::endPage() +void MarkedContentOutputDev::endPage(GfxState *state) { pageWidth = pageHeight = 0.0; } -void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties) +void MarkedContentOutputDev::beginMarkedContent(GfxState *state, char *name, Dict *properties) { int id = -1; if (properties) diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h index 5b9de05f..03f6b55b 100644 --- a/poppler/MarkedContentOutputDev.h +++ b/poppler/MarkedContentOutputDev.h @@ -96,7 +96,7 @@ public: GBool needCharCount() override { return gFalse; } void startPage(int pageNum, GfxState *state, XRef *xref) override; - void endPage() override; + void endPage(GfxState *state) override; void drawChar(GfxState *state, double xx, double yy, @@ -105,7 +105,7 @@ public: CharCode c, int nBytes, Unicode *u, int uLen) override; - void beginMarkedContent(char *name, Dict *properties) override; +void beginMarkedContent(GfxState *state, char *name, Dict *properties) override; void endMarkedContent(GfxState *state) override; const TextSpanArray& getTextSpans() const; diff --git a/poppler/OutputDev.cc b/poppler/OutputDev.cc index 0acdcbde..656ae65f 100644 --- a/poppler/OutputDev.cc +++ b/poppler/OutputDev.cc @@ -160,7 +160,7 @@ void OutputDev::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, void OutputDev::endMarkedContent(GfxState *state) { } -void OutputDev::beginMarkedContent(char *name, Dict *properties) { +void OutputDev::beginMarkedContent(GfxState *state, char *name, Dict *properties) { } void OutputDev::markPoint(char *name) { diff --git a/poppler/OutputDev.h b/poppler/OutputDev.h index 107c94d2..b932b275 100644 --- 
a/poppler/OutputDev.h +++ b/poppler/OutputDev.h @@ -148,7 +148,7 @@ public: virtual void startPage(int pageNum, GfxState *state, XRef *xref) {} // End a page. - virtual void endPage() {} + virtual void endPage(GfxState *state) {} // Dump page contents to display. virtual void dump() {} @@ -302,8 +302,8 @@ public: //----- grouping operators + virtual void beginMarkedContent(GfxState *state, char *name, Dict *properties); virtual void endMarkedContent(GfxState *state); - virtual void beginMarkedContent(char *name, Dict *properties); virtual void markPoint(char *name); virtual void markPoint(char *name, Dict *properties); diff --git a/poppler/PSOutputDev.cc b/poppler/PSOutputDev.cc index ac4f028f..3c77d8d5 100644 --- a/poppler/PSOutputDev.cc +++ b/poppler/PSOutputDev.cc @@ -3682,7 +3682,7 @@ GBool PSOutputDev::checkPageSlice(Page *page, double /*hDPI*/, double /*vDPI*/, delete splashOut; // finish the PS page - endPage(); + endPage(state); return gFalse; @@ -3958,7 +3958,7 @@ void PSOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA) { writePS("%%EndPageSetup\n"); } -void PSOutputDev::endPage() { +void PSOutputDev::endPage(GfxState *state) { if (overlayCbk) { restoreState(NULL); (*overlayCbk)(this, overlayCbkData); diff --git a/poppler/PSOutputDev.h b/poppler/PSOutputDev.h index fd1bea17..e050e705 100644 --- a/poppler/PSOutputDev.h +++ b/poppler/PSOutputDev.h @@ -195,7 +195,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. 
- void endPage() override; + void endPage(GfxState *state) override; //----- save/restore graphics state void saveState(GfxState *state) override; diff --git a/poppler/Page.cc b/poppler/Page.cc index ca4a5a6d..c4134184 100644 --- a/poppler/Page.cc +++ b/poppler/Page.cc @@ -258,6 +258,7 @@ Page::Page(PDFDoc *docA, int numA, Object *pageDict, Ref pageRefA, PageAttrs *at num = numA; duration = -1; annots = NULL; + structParents = -1; pageObj = pageDict->copy(); pageRef = pageRefA; @@ -315,7 +316,16 @@ Page::Page(PDFDoc *docA, int numA, Object *pageDict, Ref pageRefA, PageAttrs *at num, actions.getTypeName()); actions.setToNull(); } - + + // structParents + tmp = pageDict->dictLookupNF("StructParents"); + if (!(tmp.isInt() || tmp.isNull())) { + error(errSyntaxError, -1, "Page StructParents object (page {0:d}) is wrong type ({1:s})", + num, tmp.getTypeName()); + } else if (tmp.isInt()) { + structParents = tmp.getInt(); + } + return; err2: diff --git a/poppler/Page.h b/poppler/Page.h index 97b70a0c..35858bca 100644 --- a/poppler/Page.h +++ b/poppler/Page.h @@ -223,6 +223,10 @@ public: LinkAction *getAdditionalAction(PageAdditionalActionsType type); + // Get the integer key of the page's entry in the structural parent tree. + // Returns -1 if the page dict does not contain a StructParents key. 
+ int getStructParents() { return structParents; } + Gfx *createGfx(OutputDev *out, double hDPI, double vDPI, int rotate, GBool useMediaBox, GBool crop, int sliceX, int sliceY, int sliceW, int sliceH, @@ -281,6 +285,7 @@ private: Object trans; // page transition Object actions; // page additional actions double duration; // page duration + int structParents; // integer key of page in structure parent tree GBool ok; // true if page is valid #ifdef MULTITHREADED GooMutex mutex; diff --git a/poppler/PreScanOutputDev.cc b/poppler/PreScanOutputDev.cc index b2af18d8..3bf59792 100644 --- a/poppler/PreScanOutputDev.cc +++ b/poppler/PreScanOutputDev.cc @@ -55,7 +55,7 @@ PreScanOutputDev::~PreScanOutputDev() { void PreScanOutputDev::startPage(int /*pageNum*/, GfxState * /*state*/, XRef * /*xref*/) { } -void PreScanOutputDev::endPage() { +void PreScanOutputDev::endPage(GfxState *state) { } void PreScanOutputDev::stroke(GfxState *state) { diff --git a/poppler/PreScanOutputDev.h b/poppler/PreScanOutputDev.h index a1fd6b50..706864a5 100644 --- a/poppler/PreScanOutputDev.h +++ b/poppler/PreScanOutputDev.h @@ -79,7 +79,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. 
- void endPage() override; + void endPage(GfxState *state) override; //----- path painting void stroke(GfxState *state) override; diff --git a/poppler/SplashOutputDev.cc b/poppler/SplashOutputDev.cc index 28174356..68856877 100644 --- a/poppler/SplashOutputDev.cc +++ b/poppler/SplashOutputDev.cc @@ -1556,7 +1556,7 @@ void SplashOutputDev::startPage(int pageNum, GfxState *state, XRef *xrefA) { splash->clear(paperColor, 0); } -void SplashOutputDev::endPage() { +void SplashOutputDev::endPage(GfxState *state) { if (colorMode != splashModeMono1 && !keepAlphaChannel) { splash->compositeBackground(paperColor); } diff --git a/poppler/SplashOutputDev.h b/poppler/SplashOutputDev.h index 05b7e943..3e63d482 100644 --- a/poppler/SplashOutputDev.h +++ b/poppler/SplashOutputDev.h @@ -235,7 +235,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. - void endPage() override; + void endPage(GfxState *state) override; //----- save/restore graphics state void saveState(GfxState *state) override; diff --git a/poppler/StructElement.h b/poppler/StructElement.h index cd89a970..571d7228 100644 --- a/poppler/StructElement.h +++ b/poppler/StructElement.h @@ -164,6 +164,7 @@ public: int getMCID() const { return c->mcid; } Ref getObjectRef() const { return c->ref; } Ref getParentRef() { return isContent() ? 
parent->getParentRef() : s->parentRef.getRef(); } + StructElement *getParent() const { return parent; } // returns NULL if parent is StructTreeRoot GBool hasPageRef() const; GBool getPageRef(Ref& ref) const; StructTreeRoot *getStructTreeRoot() { return treeRoot; } diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 9a77d050..70dccb5e 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -4705,7 +4705,7 @@ void TextSelectionPainter::endPage() } out->restoreState(state); - out->endPage (); + out->endPage (state); } void TextWord::visitSelection(TextSelectionVisitor *visitor, @@ -5619,7 +5619,7 @@ void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) { text->startPage(state); } -void TextOutputDev::endPage() { +void TextOutputDev::endPage(GfxState *state) { text->endPage(); text->coalesce(physLayout, fixedPitch, doHTML); if (outputStream) { diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 380301fd..0d9ef9e7 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -776,7 +776,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. - void endPage() override; + void endPage(GfxState *state) override; //----- save/restore graphics state void restoreState(GfxState *state) override; diff --git a/poppler/UTF.cc b/poppler/UTF.cc index 90771943..36b44062 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -329,35 +329,37 @@ inline uint32_t decodeUtf16(uint32_t* state, uint32_t* codePoint, uint16_t codeU // Count number of UTF-8 bytes required to convert a UTF-16 string to // UTF-8 (excluding terminating NULL). 
-int utf16CountUtf8Bytes(const uint16_t *utf16) +int utf16CountUtf8Bytes(const uint16_t *utf16, int maxUtf16) { - uint32_t codepoint; - uint32_t state = 0; - int count = 0; + uint32_t codepoint; + uint32_t state = 0; + int count = 0; + int nIn = 0; - while (*utf16) { - decodeUtf16(&state, &codepoint, *utf16); - if (state == UTF16_ACCEPT) { - if (codepoint < 0x80) - count++; - else if (codepoint < 0x800) - count += 2; - else if (codepoint < 0x10000) - count += 3; - else if (codepoint <= UCS4_MAX) - count += 4; - else - count += 3; // replace with REPLACEMENT_CHAR - } else if (state == UTF16_REJECT) { - count += 3; // replace with REPLACEMENT_CHAR - state = 0; - } - utf16++; + while (nIn < maxUtf16 && *utf16) { + decodeUtf16(&state, &codepoint, *utf16); + if (state == UTF16_ACCEPT) { + if (codepoint < 0x80) + count++; + else if (codepoint < 0x800) + count += 2; + else if (codepoint < 0x10000) + count += 3; + else if (codepoint <= UCS4_MAX) + count += 4; + else + count += 3; // replace with REPLACEMENT_CHAR + } else if (state == UTF16_REJECT) { + count += 3; // replace with REPLACEMENT_CHAR + state = 0; } - if (state != UTF8_ACCEPT && state != UTF8_REJECT) - count++; // replace with REPLACEMENT_CHAR + utf16++; + nIn++; + } + if (state != UTF16_ACCEPT && state != UTF16_REJECT) + count += 3; // replace with REPLACEMENT_CHAR - return count; + return count; } // Convert UTF-16 to UTF-8 @@ -406,12 +408,42 @@ int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16) } // Allocate utf8 string and convert utf16 into it. -char *utf16ToUtf8(const uint16_t *utf16, int *len) +// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when +// either this count is reached or a null is encountered.
+char *utf16ToUtf8(const uint16_t *utf16, int maxUtf16) { - int n = utf16CountUtf8Bytes(utf16); - if (len) - *len = n; + int n = utf16CountUtf8Bytes(utf16, maxUtf16); char *utf8 = (char*)gmalloc(n + 1); - utf16ToUtf8(utf16, utf8); + utf16ToUtf8(utf16, utf8, n + 1, maxUtf16); + return utf8; +} + +// Convert a PDF Text String to UTF-8 +// textStr - PDF text string +// returns UTF-8 string. +char *TextStringToUtf8(GooString *textStr) +{ + int i, len; + const char *s; + char *utf8; + + len = textStr->getLength(); + s = textStr->getCString(); + if (textStr->hasUnicodeMarker()) { + uint16_t *utf16; + len = len/2 - 1; + utf16 = new uint16_t[len]; + for (i = 0 ; i < len; i++) { + utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff); + } + utf8 = utf16ToUtf8(utf16, len); + delete[] utf16; + } else { + utf8 = (char*)gmalloc(len + 1); + for (i = 0 ; i < len; i++) { + utf8[i] = pdfDocEncoding[s[i] & 0xff]; + } + utf8[i] = 0; + } return utf8; } diff --git a/poppler/UTF.h b/poppler/UTF.h index bddb926d..3f62c528 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -63,7 +63,9 @@ uint16_t *utf8ToUtf16(const char *utf8, int *len = nullptr); // Count number of UTF-8 bytes required to convert a UTF-16 string to // UTF-8 (excluding terminating NULL). -int utf16CountUtf8Bytes(const uint16_t *utf16); +// maxUtf16 - maximum number of UTF-16 code units to count. Conversion stops when +// either this count is reached or a null is encountered. +int utf16CountUtf8Bytes(const uint16_t *utf16, int maxUtf16 = INT_MAX); // Convert UTF-16 to UTF-8 // utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num @@ -76,6 +78,13 @@ int utf16CountUtf8Bytes(const uint16_t *utf16); int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX); // Allocate utf8 string and convert utf16 into it. -char *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr); +// maxUtf16 - maximum number of UTF-16 code units to convert. 
Conversion stops when +// either this count is reached or a null is encountered. +char *utf16ToUtf8(const uint16_t *utf16, int maxUtf16 = INT_MAX); + +// Convert a PDF Text String to UTF-8 +// textStr - PDF text string +// returns UTF-8 string. +char *TextStringToUtf8(GooString *textStr); #endif diff --git a/qt4/src/ArthurOutputDev.cc b/qt4/src/ArthurOutputDev.cc index f2fa6f17..55ef49a8 100644 --- a/qt4/src/ArthurOutputDev.cc +++ b/qt4/src/ArthurOutputDev.cc @@ -134,7 +134,7 @@ void ArthurOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) m_painter->restore(); } -void ArthurOutputDev::endPage() { +void ArthurOutputDev::endPage(GfxState *state) { } void ArthurOutputDev::saveState(GfxState *state) diff --git a/qt4/src/ArthurOutputDev.h b/qt4/src/ArthurOutputDev.h index 9d5e8679..4fb883e4 100644 --- a/qt4/src/ArthurOutputDev.h +++ b/qt4/src/ArthurOutputDev.h @@ -90,7 +90,7 @@ public: virtual void startPage(int pageNum, GfxState *state, XRef *xref); // End a page. - virtual void endPage(); + virtual void endPage(GfxState *state); //----- save/restore graphics state virtual void saveState(GfxState *state); diff --git a/qt5/src/ArthurOutputDev.cc b/qt5/src/ArthurOutputDev.cc index 0ec029f2..aa44d3ab 100644 --- a/qt5/src/ArthurOutputDev.cc +++ b/qt5/src/ArthurOutputDev.cc @@ -133,7 +133,7 @@ void ArthurOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) m_painter->restore(); } -void ArthurOutputDev::endPage() { +void ArthurOutputDev::endPage(GfxState *state) { } void ArthurOutputDev::saveState(GfxState *state) diff --git a/qt5/src/ArthurOutputDev.h b/qt5/src/ArthurOutputDev.h index 480c7827..b9e2086c 100644 --- a/qt5/src/ArthurOutputDev.h +++ b/qt5/src/ArthurOutputDev.h @@ -96,7 +96,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. 
- void endPage() override; + void endPage(GfxState *state) override; //----- save/restore graphics state void saveState(GfxState *state) override; diff --git a/test/gtk-test.cc b/test/gtk-test.cc index 6f5e6216..68fcef2d 100644 --- a/test/gtk-test.cc +++ b/test/gtk-test.cc @@ -55,7 +55,7 @@ public: //----- initialization and control // End a page. - void endPage() override; + void endPage(GfxState *state) override; // Dump page contents to display. void dump() override; @@ -119,8 +119,8 @@ void GDKSplashOutputDev::clear() { startPage(0, NULL, NULL); } -void GDKSplashOutputDev::endPage() { - SplashOutputDev::endPage(); +void GDKSplashOutputDev::endPage(GfxState *state) { + SplashOutputDev::endPage(state); if (!incrementalUpdate) { (*redrawCbk)(redrawCbkData); } diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 7f933f0c..9f3f933b 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -1277,7 +1277,7 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) { } -void HtmlOutputDev::endPage() { +void HtmlOutputDev::endPage(GfxState *state) { Links *linksList = docPage->getLinks(); for (int i = 0; i < linksList->getNumLinks(); ++i) { diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h index 704c16bb..a6419f51 100644 --- a/utils/HtmlOutputDev.h +++ b/utils/HtmlOutputDev.h @@ -283,7 +283,7 @@ public: void startPage(int pageNum, GfxState *state, XRef *xref) override; // End a page. - void endPage() override; + void endPage(GfxState *state) override; //----- update text state void updateFont(GfxState *state) override; diff --git a/utils/pdftocairo.1 b/utils/pdftocairo.1 index 3a60b73e..356111c8 100644 --- a/utils/pdftocairo.1 +++ b/utils/pdftocairo.1 @@ -211,6 +211,10 @@ Generate Level 2 PostScript (PS only). Generate Level 3 PostScript (PS only). This enables all Level 2 features plus shading patterns and masked images. This is the default setting.
.TP +.B \-struct +If the input file contains structural information about the document's content, +write this information to the output file (PDF only). +.TP .B \-origpagesizes This option is the same as "\-paper match". .TP diff --git a/utils/pdftocairo.cc b/utils/pdftocairo.cc index 7a5ef981..9f1c3ca4 100644 --- a/utils/pdftocairo.cc +++ b/utils/pdftocairo.cc @@ -122,6 +122,7 @@ static GBool noShrink = gFalse; static GBool noCenter = gFalse; static GBool duplex = gFalse; static char tiffCompressionStr[16] = ""; +static GBool docStruct = gFalse; static char ownerPassword[33] = ""; static char userPassword[33] = ""; @@ -256,6 +257,11 @@ static const ArgDesc argDesc[] = { {"-duplex", argFlag, &duplex, 0, "enable duplex printing"}, +#if CAIRO_VERSION >= CAIRO_VERSION_ENCODE(1, 15, 8) + {"-struct", argFlag, &docStruct, 0, + "enable logical document structure (PDF)"}, +#endif + {"-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), @@ -1064,6 +1070,11 @@ int main(int argc, char *argv[]) { if (!level2 && !level3) level3 = gTrue; + if (docStruct && !pdf) { + fprintf(stderr, "Error: -struct may only be used with pdf output.\n"); + exit(99); + } + if (eps && (origPageSizes || paperSize[0] || paperWidth > 0 || paperHeight > 0)) { fprintf(stderr, "Error: page size options may not be used with eps output.\n"); exit(99); @@ -1214,6 +1225,7 @@ int main(int argc, char *argv[]) { firstPage++; cairoOut = new CairoOutputDev(); + cairoOut->setLogicalStructure(docStruct); cairoOut->startDoc(doc); if (sz != 0) crop_w = crop_h = sz; @@ -1270,10 +1282,10 @@ int main(int argc, char *argv[]) { renderPage(doc, cairoOut, pg, pg_w, pg_h, output_w, output_h); endPage(imageFileName); } + delete cairoOut; endDocument(); // clean up - delete cairoOut; delete doc; delete globalParams; if (fileName) -- 2.11.0