From 2145df39a90cf76b84b7ab2b92006d19d8787b71 Mon Sep 17 00:00:00 2001
From: Adrian Perez de Castro
Date: Tue, 18 Jun 2013 00:35:51 +0300
Subject: [PATCH v10 02/12] Tagged-PDF: Text content extraction from structure elements

Implement StructElement::getText(), by using MarkedContentOutputDev. This
output device captures a sequence of MarkedContentOp structures representing
the text drawing operations for a particular marked content text object from
the page stream. Those are then used to convert the individual Unicode
characters to the returned string.
---
Review notes (not part of the commit message):

 - updateFont(): fixed-width fonts now set FlagFontFixed (was FlagFontItalic),
   and font families are compared without dereferencing a NULL name.
 - MarkedContentOutputDev: releases lastFont on destruction and keeps
   recording across nested marked content sequences.
 - MarkedContentOp: deep-copying assignment operator, NULL-safe name copy.
 - StructElement::getMarkedContentOps(): object references are rejected.
 - Hunk line counts and the diffstat were updated by hand; the "index" blob
   hashes were not regenerated. Whitespace of context lines may need
   "git apply --ignore-whitespace".

 poppler/Makefile.am               |   2 +
 poppler/MarkedContentOutputDev.cc | 181 +++++++++++++++++++++++++++++++++++++
 poppler/MarkedContentOutputDev.h  | 144 +++++++++++++++++++++++++++++
 poppler/StructElement.cc          |  74 +++++++++++++++
 poppler/StructElement.h           |  25 +++++
 5 files changed, 426 insertions(+)
 create mode 100644 poppler/MarkedContentOutputDev.cc
 create mode 100644 poppler/MarkedContentOutputDev.h

diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 9f90c9d..5f0c795 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -232,6 +232,7 @@ poppler_include_HEADERS = \
 	NameToUnicodeTable.h \
 	PSOutputDev.h \
 	TextOutputDev.h \
+	MarkedContentOutputDev.h \
 	SecurityHandler.h \
 	UTF.h \
 	UTF8.h \
@@ -306,6 +307,7 @@ libpoppler_la_SOURCES = \
 	XRef.cc \
 	PSOutputDev.cc \
 	TextOutputDev.cc \
+	MarkedContentOutputDev.cc \
 	PageLabelInfo.h \
 	PageLabelInfo.cc \
 	SecurityHandler.cc \
diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc
new file mode 100644
index 0000000..e3173db
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.cc
@@ -0,0 +1,181 @@
+//========================================================================
+//
+// MarkedContentOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MarkedContentOutputDev.h"
+#include "GfxFont.h"
+#include "GfxState.h"
+#include "Annot.h"
+#include "Link.h"
+#include <vector>
+
+
+// Creates a device that records the text drawing operations of the
+// marked content sequence whose MCID is mcidA.
+MarkedContentOutputDev::MarkedContentOutputDev(int mcidA):
+  inMarkedContent(false),
+  nestingDepth(0),
+  mcid(mcidA),
+  lastFont(0),
+  lastFlags(0),
+  pageWidth(0.0),
+  pageHeight(0.0)
+{
+}
+
+
+// Drops the reference that updateFont() holds on the last font.
+MarkedContentOutputDev::~MarkedContentOutputDev()
+{
+  if (lastFont)
+    lastFont->decRefCnt();
+}
+
+
+// Remembers the page size, used by drawChar() to discard off-page text.
+void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  if (state) {
+    pageWidth = state->getPageWidth();
+    pageHeight = state->getPageHeight();
+  } else {
+    pageWidth = pageHeight = 0.0;
+  }
+}
+
+
+void MarkedContentOutputDev::endPage()
+{
+  pageWidth = pageHeight = 0.0;
+}
+
+
+// Starts recording when the sequence tagged with the wanted MCID opens.
+// Sequences opened while recording are nested in it: they are only
+// counted, so that their end markers do not stop the recording early.
+void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  if (inMarkedContent) {
+    nestingDepth++;
+    return;
+  }
+
+  int id = -1;
+  if (properties && properties->lookupInt("MCID", NULL, &id) && id == mcid)
+    inMarkedContent = true;
+}
+
+
+// Closes the innermost open sequence; recording stops only when the
+// sequence that carries the wanted MCID itself is closed.
+void MarkedContentOutputDev::endMarkedContent(GfxState *state)
+{
+  if (!inMarkedContent)
+    return;
+
+  if (nestingDepth > 0)
+    nestingDepth--;
+  else
+    inMarkedContent = false;
+}
+
+
+// Records the Unicode values of a glyph drawn inside the wanted
+// sequence, provided it has a sane position inside the page.
+void MarkedContentOutputDev::drawChar(GfxState *state,
+                                      double xx, double yy,
+                                      double dx, double dy,
+                                      double ox, double oy,
+                                      CharCode c, int nBytes,
+                                      Unicode *u, int uLen)
+{
+  if (!inMarkedContent || !uLen)
+    return;
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  for (int i = 0; i < uLen; i++) {
+    // Soft hyphen markers are skipped, as they are invisible unless
+    // rendering is done to an actual device and the hyphenation hint
+    // used. MarkedContentOutputDev extracts the *visible* text content.
+    if (u[i] != 0x00AD)
+      markedContentOps.push_back(MarkedContentOp(u[i]));
+  }
+}
+
+
+// Tracks font changes; while recording, emits a FontName operation when
+// the font family changes and a Flags operation when the style changes.
+void MarkedContentOutputDev::updateFont(GfxState *state)
+{
+  GfxFont *font = state->getFont();
+  if (!font || font == lastFont) return;
+
+  // getFamily() may return NULL, so names are compared only when both
+  // fonts have one; otherwise the family changed if just one is named.
+  GooString *family = font->getFamily();
+  GooString *lastFamily = lastFont ? lastFont->getFamily() : NULL;
+  bool familyChanged;
+  if (family && lastFamily)
+    familyChanged = lastFamily->cmp(family) != 0;
+  else
+    familyChanged = (family != lastFamily);
+
+  if (!lastFont || familyChanged) {
+    if (inMarkedContent && family)
+      markedContentOps.push_back(MarkedContentOp(family->getCString()));
+    if (lastFont) lastFont->decRefCnt();
+    lastFont = font;
+    font->incRefCnt();
+  }
+
+  Guint flags = 0;
+
+  if (font->isFixedWidth()) flags |= MarkedContentOp::FlagFontFixed;
+  if (font->isItalic())     flags |= MarkedContentOp::FlagFontItalic;
+  if (font->isBold())       flags |= MarkedContentOp::FlagFontBold;
+  else {
+    switch (font->getWeight()) {
+      case GfxFont::W700: // Font weights over 600 are bold
+      case GfxFont::W800:
+      case GfxFont::W900:
+        flags |= MarkedContentOp::FlagFontBold;
+      default:
+        break;
+    }
+  }
+
+  if (lastFlags != flags) {
+    if (inMarkedContent)
+      markedContentOps.push_back(MarkedContentOp(MarkedContentOp::Flags, flags));
+    lastFlags = flags;
+  }
+}
+
+
+const MarkedContentOpArray& MarkedContentOutputDev::getMarkedContentOps() const
+{
+  return markedContentOps;
+}
diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h
new file mode 100644
index 0000000..313ba9a
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.h
@@ -0,0 +1,144 @@
+//========================================================================
+//
+// MarkedContentOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MARKEDCONTENTOUTPUTDEV_H
+#define MARKEDCONTENTOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include "GfxState.h"
+#include <string.h>
+#include <vector>
+
+class GfxFont;
+class GfxState;
+class Dict;
+
+// One recorded operation: a Unicode character, a font family name, a set
+// of font style flags or a color. "type" tells which union member is valid.
+struct MarkedContentOp {
+  enum Type {
+    Unichar,
+    FontName,
+    Flags,
+    Color,
+  };
+
+  enum Flags {
+    FlagFontBold   = (1 << 0),
+    FlagFontItalic = (1 << 1),
+    FlagFontFixed  = (1 << 2),
+  };
+
+  Type type;
+
+  union {
+    Unicode unichar;
+    char   *value;  // Owned copy of the font name (FontName), may be NULL
+    Guint   flags;
+    GfxRGB  color;
+  };
+
+  MarkedContentOp(const MarkedContentOp &op): type(op.type) {
+    copyPayload(op);
+  }
+  // Assignment must deep-copy too, otherwise two operations would free
+  // the same font name.
+  MarkedContentOp& operator=(const MarkedContentOp &op) {
+    if (this != &op) {
+      if (type == FontName) gfree(value);
+      type = op.type;
+      copyPayload(op);
+    }
+    return *this;
+  }
+  ~MarkedContentOp() {
+    if (type == FontName) gfree(value);
+  }
+  MarkedContentOp(): type(FontName), value(NULL) {}
+  MarkedContentOp(Unicode u): type(Unichar), unichar(u) {}
+  MarkedContentOp(const char *s): type(FontName), value(dupString(s)) {}
+  MarkedContentOp(GfxRGB c): type(Color) { memcpy(&color, &c, sizeof(GfxRGB)); }
+  MarkedContentOp(const GfxRGB* c): type(Color) { memcpy(&color, c, sizeof(GfxRGB)); }
+  MarkedContentOp(Type t, Guint f = 0): type(t) { if (t == FontName) value = NULL; else flags = f; }
+
+  // Packs the color as (r << 16) | (g << 8) | b, using colToByte() on
+  // each component. Only meaningful for Color operations.
+  Guint rgbPixel() const {
+    return (colToByte(color.r) << 16)
+         | (colToByte(color.g) << 8)
+         |  colToByte(color.b);
+  }
+
+private:
+  // NULL-safe copy made with copyString(), to be released with gfree().
+  static char *dupString(const char *s) { return s ? copyString(s) : NULL; }
+
+  // Copies the union member selected by op.type; type must be set already.
+  void copyPayload(const MarkedContentOp &op) {
+    switch (op.type) {
+      case Flags:    flags = op.flags; break;
+      case Unichar:  unichar = op.unichar; break;
+      case Color:    memcpy(&color, &op.color, sizeof(GfxRGB)); break;
+      case FontName: value = dupString(op.value); break;
+    }
+  }
+};
+
+
+typedef std::vector<MarkedContentOp> MarkedContentOpArray;
+
+
+// Output device that records, as MarkedContentOp items, the text drawn
+// inside the marked content sequence with a given MCID.
+class MarkedContentOutputDev: public OutputDev {
+public:
+  MarkedContentOutputDev(int mcidA);
+  virtual ~MarkedContentOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  virtual void restoreState(GfxState *state) { updateFont(state); }
+  virtual void updateFont(GfxState *state);
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  const MarkedContentOpArray& getMarkedContentOps() const;
+
+private:
+  // Not copyable: the device owns a reference to lastFont.
+  MarkedContentOutputDev(const MarkedContentOutputDev &);
+  MarkedContentOutputDev& operator=(const MarkedContentOutputDev &);
+
+  MarkedContentOpArray markedContentOps;
+  bool                 inMarkedContent;
+  int                  nestingDepth;  // Sequences open inside the wanted one
+  int                  mcid;
+  GfxFont             *lastFont;
+  Guint                lastFlags;
+  double               pageWidth;
+  double               pageHeight;
+};
+
+#endif /* !MARKEDCONTENTOUTPUTDEV_H */
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 7d893df..67c0622 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@
 
 #include "StructElement.h"
 #include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
 #include "PDFDoc.h"
 #include "Dict.h"
 
@@ -981,6 +983,78 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
   return NULL;
 }
 
+// Appends to "string" (created on demand) the text of this element. For
+// MCID content elements the characters recorded from the page stream are
+// converted using the configured text encoding, falling back to UTF-8.
+// For other elements the children are visited when "recursive" is set;
+// otherwise NULL is returned.
+GooString* StructElement::appendSubTreeText(GooString *string, GBool recursive) const
+{
+  if (isContent() && !isObjectRef()) {
+    const MarkedContentOpArray& ops(getMarkedContentOps());
+
+    UnicodeMap *map = globalParams->getTextEncoding();
+    if (!map) {
+      GooString mapName("UTF-8");
+      map = UnicodeMap::parse(&mapName);
+    }
+    assert(map);
+
+    if (!string)
+      string = new GooString();
+
+    char buf[9];
+    int n;
+
+    for (MarkedContentOpArray::const_iterator i = ops.begin(); i != ops.end(); ++i) {
+      if (i->type == MarkedContentOp::Unichar) {
+        n = map->mapUnicode(i->unichar, buf, sizeof(buf));
+        string->append(buf, n);
+      }
+    }
+    map->decRefCnt();
+    return string;
+  }
+
+  if (!recursive)
+    return NULL;
+
+  // Do a depth-first traversal, to get elements in logical order
+  if (!string)
+    string = new GooString();
+
+  for (unsigned i = 0; i < getNumElements(); i++)
+    getElement(i)->appendSubTreeText(string, recursive);
+
+  return string;
+}
+
+// Renders the page of this MCID content element (or every page, when the
+// page cannot be determined) through a MarkedContentOutputDev and returns
+// the operations recorded for the element's MCID.
+const MarkedContentOpArray StructElement::getMarkedContentOps() const
+{
+  // Object references are content items as well, but have no text to extract.
+  if (!isContent() || isObjectRef())
+    return MarkedContentOpArray(); // Empty array
+
+  MarkedContentOutputDev mcdev(getMCID());
+  int startPage = 0, endPage = 0;
+
+  Ref ref;
+  if (getPageRef(ref)) {
+    startPage = endPage = treeRoot->getDoc()->findPage(ref.num, ref.gen);
+  }
+
+  if (!(startPage && endPage)) {
+    startPage = 1;
+    endPage = treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&mcdev, startPage, endPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getMarkedContentOps();
+}
+
 static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
 {
   // Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 00deef4..5a2c90c 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@
 
 #include "goo/gtypes.h"
 #include "goo/GooString.h"
+#include "MarkedContentOutputDev.h"
 #include "Object.h"
 #include <vector>
 #include <set>
@@ -218,9 +219,33 @@ public:
   const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
   GooString *getActualText() { return isContent() ? NULL : s->actualText; }
 
+  // Content text referenced by the element:
+  //
+  // - For MCID reference elements, this is just the text of the
+  //   corresponding marked content object in the page stream, regardless
+  //   of the setting of the "recursive" flag.
+  // - For other elements, if the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree
+  //   is returned. The text is assembled by traversing the leaf MCID
+  //   reference elements in logical order.
+  // - In any other case, the function returns NULL.
+  //
+  // A new string is returned, and the ownership passed to the caller.
+  //
+  GooString *getText(GBool recursive = gTrue) const {
+    return appendSubTreeText(NULL, recursive);
+  }
+
+  // Text operations recorded for the marked content sequence of an MCID
+  // content element; an empty array is returned for any other element.
+  const MarkedContentOpArray getMarkedContentOps() const;
+
   ~StructElement();
 
 private:
+  // Implements getText(): appends the text to "string", creating it when NULL.
+  GooString* appendSubTreeText(GooString *string, GBool recursive) const;
+
   typedef std::vector<Attribute*> AttrPtrArray;
   typedef std::vector<StructElement*> ElemPtrArray;
 
-- 
1.8.4.2