From ea8e27376b21c6c6c25f4f073e90b08cb17fc3c9 Mon Sep 17 00:00:00 2001
From: Adrian Perez de Castro
Date: Tue, 18 Jun 2013 00:35:51 +0300
Subject: [PATCH v11 01/11] Tagged-PDF: Text content extraction from structure elements

Implement StructElement::getText(), by using MarkedContentOutputDev. This
output device captures pieces of text (aka "spans") which have the same
attributes into a list of TextSpan objects.
---
Notes on this copy of the patch:

 - It was restored from a rendering of the mail in which line breaks and
   indentation were collapsed and text in angle brackets was stripped.
   Indentation, blank context lines and the bracketed text (standard
   includes, std::vector arguments) are a best-effort reconstruction:
   apply with "git am --ignore-whitespace" and review the context lines.
   The sender address could not be recovered.
 - The blob ids in the "index" lines are those of the original posting.

Changes on top of the posted v11:

 - TextSpan::operator=() releases the payload being replaced.
 - Marked content sequences nested inside the requested one no longer
   stop the text collection when they end.
 - drawChar() copes with a missing text encoding map.
 - The span color is set by plain assignment instead of memcpy().

 poppler/Makefile.am               |   2 +
 poppler/MarkedContentOutputDev.cc | 208 ++++++++++++++++++++++++++++++++++++++
 poppler/MarkedContentOutputDev.h  | 156 ++++++++++++++++++++++++++++
 poppler/StructElement.cc          |  56 ++++++++++
 poppler/StructElement.h           |  30 +++++
 5 files changed, 452 insertions(+)
 create mode 100644 poppler/MarkedContentOutputDev.cc
 create mode 100644 poppler/MarkedContentOutputDev.h

diff --git a/poppler/Makefile.am b/poppler/Makefile.am
index 9f90c9d..5f0c795 100644
--- a/poppler/Makefile.am
+++ b/poppler/Makefile.am
@@ -232,6 +232,7 @@ poppler_include_HEADERS =	\
 	NameToUnicodeTable.h	\
 	PSOutputDev.h		\
 	TextOutputDev.h		\
+	MarkedContentOutputDev.h	\
 	SecurityHandler.h	\
 	UTF.h			\
 	UTF8.h			\
@@ -306,6 +307,7 @@ libpoppler_la_SOURCES =		\
 	XRef.cc			\
 	PSOutputDev.cc		\
 	TextOutputDev.cc	\
+	MarkedContentOutputDev.cc	\
 	PageLabelInfo.h		\
 	PageLabelInfo.cc	\
 	SecurityHandler.cc	\
diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc
new file mode 100644
index 0000000..96b5322
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.cc
@@ -0,0 +1,208 @@
+//========================================================================
+//
+// MarkedContentOutputDev.cc
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#include "MarkedContentOutputDev.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include "Annot.h"
+#include "Link.h"
+#include <vector>
+
+
+// Creates an output device which collects the text drawn inside the
+// marked content sequence identified by mcidA.
+MarkedContentOutputDev::MarkedContentOutputDev(int mcidA):
+  currentFont(NULL),
+  currentLink(NULL),
+  currentText(NULL),
+  mcDepth(0),
+  mcid(mcidA),
+  pageWidth(0.0),
+  pageHeight(0.0),
+  unicodeMap(NULL)
+{
+  currentColor.r = currentColor.g = currentColor.b = 0;
+}
+
+
+// Drops the references held on the text encoding map and on the current
+// font, and frees the text and link not yet handed over to a TextSpan.
+MarkedContentOutputDev::~MarkedContentOutputDev()
+{
+  if (unicodeMap)
+    unicodeMap->decRefCnt();
+  if (currentFont)
+    currentFont->decRefCnt();
+  delete currentText;
+  delete currentLink;
+}
+
+
+// Finishes the span being accumulated. Non-empty text is moved into a new
+// TextSpan, which takes ownership of the strings, references the current
+// font and is tagged with the current fill color.
+void MarkedContentOutputDev::endSpan()
+{
+  if (currentText && currentText->getLength()) {
+    TextSpan span(currentText, currentFont, currentLink);
+    span.data->color = currentColor;
+    textSpans.push_back(span);
+  }
+
+  currentText = currentLink = NULL;
+}
+
+
+// Records the page size, used by drawChar() to discard off-page characters.
+void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  if (state) {
+    pageWidth = state->getPageWidth();
+    pageHeight = state->getPageHeight();
+  } else {
+    pageWidth = pageHeight = 0.0;
+  }
+}
+
+
+// Forgets the page size recorded by startPage().
+void MarkedContentOutputDev::endPage()
+{
+  pageWidth = pageHeight = 0.0;
+}
+
+
+// Starts collecting text when the sequence carrying the requested MCID is
+// opened. Sequences opened while already collecting are nested inside the
+// requested one: they are only counted, so that closing them does not stop
+// the collection before the requested sequence itself ends.
+void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  if (mcDepth > 0) {
+    mcDepth++;
+    return;
+  }
+
+  int id = -1;
+  if (properties && properties->lookupInt("MCID", NULL, &id) && id == mcid)
+    mcDepth = 1;
+}
+
+
+// Leaves one nesting level. The pending span is flushed when the sequence
+// that started the collection is the one being closed.
+void MarkedContentOutputDev::endMarkedContent(GfxState *state)
+{
+  if (mcDepth > 0 && --mcDepth == 0)
+    endSpan();
+}
+
+
+// Appends the Unicode values of a drawn character to the current span,
+// provided that it is drawn inside the requested marked content sequence
+// and within the page boundaries.
+void MarkedContentOutputDev::drawChar(GfxState *state,
+                                      double xx, double yy,
+                                      double dx, double dy,
+                                      double ox, double oy,
+                                      CharCode c, int nBytes,
+                                      Unicode *u, int uLen)
+{
+  if (mcDepth == 0 || !uLen)
+    return;
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  // The text encoding map is looked up once, on first use. Text cannot be
+  // encoded without it, so characters are dropped when there is none.
+  if (!unicodeMap) {
+    unicodeMap = globalParams->getTextEncoding();
+    if (!unicodeMap)
+      return;
+  }
+
+  for (int i = 0; i < uLen; i++) {
+    // Soft hyphen markers are skipped, as they are invisible unless
+    // rendering is done to an actual device and the hyphenation hint
+    // used. MarkedContentOutputDev extracts the *visible* text content.
+    if (u[i] != 0x00AD) {
+      // Add the UTF-8 sequence to the current text span.
+      char buf[8];
+      int n = unicodeMap->mapUnicode(u[i], buf, sizeof(buf));
+      if (n > 0) {
+        if (currentText == NULL)
+          currentText = new GooString();
+        currentText->append(buf, n);
+      }
+    }
+  }
+}
+
+
+// Tracks font changes: a different font ends the current span, and the
+// reference held is moved from the previous font to the new one.
+void MarkedContentOutputDev::updateFont(GfxState *state)
+{
+  GfxFont *font = state->getFont();
+
+  if (font == currentFont)
+    return;
+
+  endSpan();
+
+  if (currentFont != NULL)
+    currentFont->decRefCnt();
+
+  currentFont = font;
+
+  if (currentFont != NULL)
+    currentFont->incRefCnt();
+}
+
+
+// Tracks fill color changes: a different RGB color ends the current span.
+void MarkedContentOutputDev::updateFillColor(GfxState *state)
+{
+  GfxRGB color;
+  state->getFillRGB(&color);
+
+  if (color.r == currentColor.r &&
+      color.g == currentColor.g &&
+      color.b == currentColor.b)
+    return;
+
+  endSpan();
+  currentColor = color;
+}
+
+
+// Returns the spans collected so far.
+const TextSpanArray& MarkedContentOutputDev::getTextSpans() const
+{
+  return textSpans;
+}
diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h
new file mode 100644
index 0000000..0f99e62
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.h
@@ -0,0 +1,156 @@
+//========================================================================
+//
+// MarkedContentOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MARKEDCONTENTOUTPUTDEV_H
+#define MARKEDCONTENTOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include <assert.h>
+#include <vector>
+
+class Dict;
+class UnicodeMap;
+
+
+// A piece of text along with the font, fill color and link it was found
+// with. Copies of a TextSpan share one reference-counted Data payload.
+class TextSpan {
+public:
+  enum {
+    Italic = 1 << 0,
+    Bold = 1 << 1,
+    Fixed = 1 << 2,
+  };
+
+  TextSpan(const TextSpan& other): data(other.data) {
+    data->refcount++;
+  }
+
+  TextSpan& operator=(const TextSpan& other) {
+    // Comparing payloads also covers self-assignment.
+    if (data != other.data) {
+      // Drop the reference to the payload being replaced.
+      if (data && --data->refcount == 0)
+        delete data;
+      data = other.data;
+      data->refcount++;
+    }
+    return *this;
+  }
+
+  ~TextSpan() {
+    if (data && --data->refcount == 0)
+      delete data;
+  }
+
+  GfxFont* getFont() const { return data->font; }
+  GooString* getText() const { return data->text; }
+  GooString* getLink() const { return data->link; }
+  GfxRGB getColor() const { return data->color; }
+
+private:
+  // NOTE: Takes ownership of strings, increases refcount for font.
+  TextSpan(GooString *text, GfxFont *font = NULL, GooString *link = NULL)
+    : data(new Data) {
+    data->text = text;
+    data->link = link;
+    data->font = font;
+    if (data->font)
+      data->font->incRefCnt();
+  }
+
+  // Payload shared by the copies of a span. It starts with a single
+  // reference, and must only be destroyed once no references are left.
+  struct Data {
+    GfxFont *font;
+    GooString *text;
+    GooString *link;
+    GfxRGB color;
+    unsigned refcount;
+
+    Data(): refcount(1) {}
+
+    ~Data() {
+      assert(refcount == 0);
+      if (font)
+        font->decRefCnt();
+      delete text;
+      delete link;
+    }
+  };
+
+  Data *data;
+
+  friend class MarkedContentOutputDev;
+};
+
+
+typedef std::vector<TextSpan> TextSpanArray;
+
+
+// Output device which collects, as a list of TextSpan objects, the text
+// drawn inside the marked content sequence with a given MCID.
+class MarkedContentOutputDev: public OutputDev {
+public:
+  MarkedContentOutputDev(int mcidA);
+  virtual ~MarkedContentOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  // The restored state may carry a different fill color and font.
+  virtual void restoreState(GfxState *state) {
+    updateFillColor(state);
+    updateFont(state);
+  }
+
+  virtual void updateFont(GfxState *state);
+  virtual void updateFillColor(GfxState *state);
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  const TextSpanArray& getTextSpans() const;
+
+private:
+
+  void endSpan();
+
+  GfxFont *currentFont;
+  GooString *currentLink;
+  GooString *currentText;
+  GfxRGB currentColor;
+  TextSpanArray textSpans;
+  // Nesting level of marked content sequences, counted from the one
+  // with the requested MCID; zero when outside of it.
+  int mcDepth;
+  int mcid;
+  double pageWidth;
+  double pageHeight;
+  UnicodeMap *unicodeMap;
+};
+
+#endif /* !MARKEDCONTENTOUTPUTDEV_H */
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 7d893df..3a98658 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@
 
 #include "StructElement.h"
 #include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
 #include "PDFDoc.h"
 #include "Dict.h"
 
@@ -981,6 +983,60 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
   return NULL;
 }
 
+// Appends the text of the element to "string", which is created if NULL
+// is passed. For elements other than MCID references the text comes from
+// the subtree when "recursive" is set; otherwise NULL is returned.
+GooString* StructElement::appendSubTreeText(GooString *string, GBool recursive) const
+{
+  if (isContent() && !isObjectRef()) {
+    MarkedContentOutputDev mcdev(getMCID());
+    const TextSpanArray& spans(getTextSpansInternal(mcdev));
+
+    if (!string)
+      string = new GooString();
+
+    for (TextSpanArray::const_iterator i = spans.begin(); i != spans.end(); ++i)
+      string->append(i->getText());
+
+    return string;
+  }
+
+  if (!recursive)
+    return NULL;
+
+  // Do a depth-first traversal, to get elements in logical order
+  if (!string)
+    string = new GooString();
+
+  for (unsigned i = 0; i < getNumElements(); i++)
+    getElement(i)->appendSubTreeText(string, recursive);
+
+  return string;
+}
+
+// Runs the page of the element through "mcdev", or every page of the
+// document when the page cannot be determined, and returns the spans
+// collected. The array returned belongs to "mcdev".
+const TextSpanArray& StructElement::getTextSpansInternal(MarkedContentOutputDev& mcdev) const
+{
+  assert(isContent());
+
+  int startPage = 0, endPage = 0;
+
+  Ref ref;
+  if (getPageRef(ref)) {
+    startPage = endPage = treeRoot->getDoc()->findPage(ref.num, ref.gen);
+  }
+
+  if (!(startPage && endPage)) {
+    startPage = 1;
+    endPage = treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&mcdev, startPage, endPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getTextSpans();
+}
+
 static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
 {
   // Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 00deef4..b9eef8a 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@
 
 #include "goo/gtypes.h"
 #include "goo/GooString.h"
+#include "MarkedContentOutputDev.h"
 #include "Object.h"
 #include <vector>
 #include <set>
@@ -218,9 +219,38 @@ public:
   const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
   GooString *getActualText() { return isContent() ? NULL : s->actualText; }
 
+  // Content text referenced by the element:
+  //
+  // - For MCID reference elements, this is just the text of the
+  //   corresponding marked content object in the page stream, regardless
+  //   of the setting of the "recursive" flag.
+  // - For other elements, if the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree
+  //   is returned. The text is assembled by traversing the leaf MCID
+  //   reference elements in logical order.
+  // - In any other case, the function returns NULL.
+  //
+  // A new string is returned, and the ownership passed to the caller.
+  //
+  GooString *getText(GBool recursive = gTrue) const {
+    return appendSubTreeText(NULL, recursive);
+  }
+
+  // Text spans of the marked content referenced by the element. An empty
+  // array is returned for elements which are not content elements.
+  const TextSpanArray getTextSpans() const {
+    if (!isContent())
+      return TextSpanArray();
+    MarkedContentOutputDev mcdev(getMCID());
+    return getTextSpansInternal(mcdev);
+  }
+
   ~StructElement();
 
 private:
+  GooString* appendSubTreeText(GooString *string, GBool recursive) const;
+  const TextSpanArray& getTextSpansInternal(MarkedContentOutputDev& mcdev) const;
+
   typedef std::vector<Attribute*> AttrPtrArray;
   typedef std::vector<StructElement*> ElemPtrArray;
 
-- 
1.8.4.2