From 237385aac41c26322201c691271ecfde86f51696 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Tue, 18 Jun 2013 00:35:51 +0300 Subject: [PATCH v12 01/11] Tagged-PDF: Text content extraction from structure elements Implement StructElement::getText() by using MarkedContentOutputDev. This output device captures pieces of text (aka "spans") which have the same attributes into a list of TextSpan objects. --- poppler/Makefile.am | 2 + poppler/MarkedContentOutputDev.cc | 216 ++++++++++++++++++++++++++++++++++++++ poppler/MarkedContentOutputDev.h | 128 ++++++++++++++++++++++ poppler/StructElement.cc | 50 +++++++++ poppler/StructElement.h | 28 +++++ 5 files changed, 424 insertions(+) create mode 100644 poppler/MarkedContentOutputDev.cc create mode 100644 poppler/MarkedContentOutputDev.h diff --git a/poppler/Makefile.am b/poppler/Makefile.am index 9f90c9d..5f0c795 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -232,6 +232,7 @@ poppler_include_HEADERS = \ NameToUnicodeTable.h \ PSOutputDev.h \ TextOutputDev.h \ + MarkedContentOutputDev.h \ SecurityHandler.h \ UTF.h \ UTF8.h \ @@ -306,6 +307,7 @@ libpoppler_la_SOURCES = \ XRef.cc \ PSOutputDev.cc \ TextOutputDev.cc \ + MarkedContentOutputDev.cc \ PageLabelInfo.h \ PageLabelInfo.cc \ SecurityHandler.cc \ diff --git a/poppler/MarkedContentOutputDev.cc b/poppler/MarkedContentOutputDev.cc new file mode 100644 index 0000000..ab27fcd --- /dev/null +++ b/poppler/MarkedContentOutputDev.cc @@ -0,0 +1,216 @@ +//======================================================================== +// +// MarkedContentOutputDev.cc +// +// Copyright 2013 Igalia S.L. 
+//
+//========================================================================
+
+#include "MarkedContentOutputDev.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include "Annot.h"
+#include <vector>
+
+
+// Creates an output device which collects the text drawn inside the
+// marked content sequence whose MCID equals mcidA.
+MarkedContentOutputDev::MarkedContentOutputDev(int mcidA):
+  currentFont(NULL),
+  currentText(NULL),
+  mcid(mcidA),
+  pageWidth(0.0),
+  pageHeight(0.0),
+  unicodeMap(NULL)
+{
+  currentColor.r = currentColor.g = currentColor.b = 0;
+}
+
+
+// Drops the references held on the encoding map and on the current font,
+// and frees any text which was never turned into a TextSpan.
+MarkedContentOutputDev::~MarkedContentOutputDev()
+{
+  if (unicodeMap)
+    unicodeMap->decRefCnt();
+  if (currentFont)
+    currentFont->decRefCnt();
+  delete currentText;
+}
+
+
+// Closes the span being collected: non-empty text is appended to textSpans
+// together with the current font and color, then a new span is started.
+void MarkedContentOutputDev::endSpan()
+{
+  if (currentText && currentText->getLength()) {
+    // The TextSpan takes ownership of currentText and
+    // increases the reference count for currentFont.
+    textSpans.push_back(TextSpan(currentText,
+                                 currentFont,
+                                 currentColor));
+  } else {
+    // Nothing worth keeping: free the buffer instead of dropping the
+    // pointer (deleting NULL is a no-op).
+    delete currentText;
+  }
+  currentText = NULL;
+}
+
+
+// Remembers the page size, which drawChar() uses to discard characters
+// placed outside of the page. A NULL state resets the size to zero.
+void MarkedContentOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
+{
+  if (state) {
+    pageWidth  = state->getPageWidth();
+    pageHeight = state->getPageHeight();
+  } else {
+    pageWidth = pageHeight = 0.0;
+  }
+}
+
+
+// Forgets the page size recorded by startPage().
+void MarkedContentOutputDev::endPage()
+{
+  pageWidth = pageHeight = 0.0;
+}
+
+
+// Tracks the start of a marked content sequence. Text collection begins
+// when the "MCID" entry of the properties matches the requested MCID.
+void MarkedContentOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  int id = -1;
+  if (properties)
+    properties->lookupInt("MCID", NULL, &id);
+
+  // The stack keeps track of MCIDs of nested marked content. Once inside
+  // the requested sequence, nested sequences are pushed even when they
+  // carry no MCID (recorded as -1): endMarkedContent() pops one entry per
+  // call, so skipping them would end the collection when the first nested
+  // sequence is closed and lose the text that follows it.
+  // NOTE(review): assumes the caller pairs every beginMarkedContent() call
+  // with exactly one endMarkedContent() call -- confirm against Gfx.
+  if (inMarkedContent()) {
+    mcidStack.push_back(id);
+    return;
+  }
+
+  if (id != -1 && id == mcid)
+    mcidStack.push_back(id);
+}
+
+
+// Tracks the end of a marked content sequence; leaving the outermost
+// tracked sequence flushes the pending text into a TextSpan.
+void MarkedContentOutputDev::endMarkedContent(GfxState *state)
+{
+  if (inMarkedContent()) {
+    mcidStack.pop_back();
+    // The outer marked content sequence MCID was popped, ensure
+    // that the last piece of text collected ends up in a TextSpan.
+    if (!inMarkedContent())
+      endSpan();
+  }
+}
+
+
+// Tells whether "font" differs from currentFont, in which case a new span
+// has to be started. Fonts are compared by pointer, then by object
+// reference, then by tag and finally by name.
+bool MarkedContentOutputDev::needFontChange(GfxFont* font) const
+{
+  if (currentFont == font)
+    return gFalse;
+
+  if (!currentFont)
+    return font != NULL && font->isOk();
+
+  if (font == NULL)
+    return gTrue;
+
+  // Two non-null valid fonts are the same if they point to the same Ref
+  if (currentFont->getID()->num == font->getID()->num &&
+      currentFont->getID()->gen == font->getID()->gen)
+    return gFalse;
+
+  // As a last quick comparison option before falling back on comparing
+  // the members, check the font tags.
+  if (currentFont->getTag() &&
+      font->matches(currentFont->getTag()->getCString()))
+    return gFalse;
+
+  // As a last resort, compare the font names. A font may have no name at
+  // all; in that case nothing is left to prove both fonts are the same,
+  // so report a change instead of dereferencing a NULL name.
+  if (!font->getName() || !currentFont->getName())
+    return gTrue;
+
+  return font->getName()->cmp(currentFont->getName()) != 0;
+}
+
+
+// Collects one character of the tracked marked content sequence. A change
+// of color or font closes the current span first, so each TextSpan holds
+// text with uniform attributes. Characters outside of the page, with
+// invalid geometry, or which cannot be encoded are dropped.
+void MarkedContentOutputDev::drawChar(GfxState *state,
+                                      double xx, double yy,
+                                      double dx, double dy,
+                                      double ox, double oy,
+                                      CharCode c, int nBytes,
+                                      Unicode *u, int uLen)
+{
+  if (!inMarkedContent() || !uLen)
+    return;
+
+
+  // Color changes are tracked here so the color can be chosen depending on
+  // the render mode (for mode 1 stroke color is used), so there is no need
+  // to implement both updateFillColor() and updateStrokeColor().
+  GfxRGB color;
+  if (state->getRender() == 1)
+    state->getStrokeRGB(&color);
+  else
+    state->getFillRGB(&color);
+
+  GBool colorChange = (color.r != currentColor.r ||
+                       color.g != currentColor.g ||
+                       color.b != currentColor.b);
+
+  // Check also for font changes.
+  GBool fontChange = needFontChange(state->getFont());
+
+  // Save a span with the current changes.
+  if (colorChange || fontChange) {
+    endSpan();
+  }
+
+  // Perform the color/font changes.
+  if (colorChange)
+    currentColor = color;
+
+  if (fontChange) {
+    if (currentFont != NULL) {
+      currentFont->decRefCnt();
+      currentFont = NULL;
+    }
+    if (state->getFont() != NULL) {
+      currentFont = state->getFont();
+      currentFont->incRefCnt();
+    }
+  }
+
+
+  double sp, dx2, dy2, w1, h1, x1, y1;
+
+  // Subtract char and word spacing from the (dx,dy) values
+  sp = state->getCharSpace();
+  if (c == (CharCode) 0x20)
+    sp += state->getWordSpace();
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+  state->transform(xx, yy, &x1, &y1);
+
+  // Throw away characters that are not inside the page boundaries.
+  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight)
+    return;
+
+  // Make a sanity check on character size. Note: (x != x) <-> isnan(x)
+  if (x1 != x1 || y1 != y1 || w1 != w1 || h1 != h1)
+    return;
+
+  for (int i = 0; i < uLen; i++) {
+    // Soft hyphen markers are skipped, as they are invisible unless
+    // rendering is done to an actual device and the hyphenation hint
+    // used. MarkedContentOutputDev extracts the *visible* text content.
+    if (u[i] == 0x00AD)
+      continue;
+
+    // The text encoding map is fetched on first use. Without one the
+    // characters cannot be converted, so stop instead of dereferencing
+    // a NULL map.
+    if (!unicodeMap && !(unicodeMap = globalParams->getTextEncoding()))
+      return;
+
+    // Add the encoded sequence to the current text span.
+    char buf[8];
+    int n = unicodeMap->mapUnicode(u[i], buf, sizeof(buf));
+    if (n > 0) {
+      if (currentText == NULL)
+        currentText = new GooString();
+      currentText->append(buf, n);
+    }
+  }
+}
+
+
+// Spans collected so far. The reference points into this output device
+// and must not be used after the device is destroyed.
+const TextSpanArray& MarkedContentOutputDev::getTextSpans() const
+{
+  return textSpans;
+}
diff --git a/poppler/MarkedContentOutputDev.h b/poppler/MarkedContentOutputDev.h
new file mode 100644
index 0000000..6decc9b
--- /dev/null
+++ b/poppler/MarkedContentOutputDev.h
@@ -0,0 +1,128 @@
+//========================================================================
+//
+// MarkedContentOutputDev.h
+//
+// Copyright 2013 Igalia S.L.
+//
+//========================================================================
+
+#ifndef MARKEDCONTENTOUTPUTDEV_H
+#define MARKEDCONTENTOUTPUTDEV_H
+
+#include "goo/gtypes.h"
+#include "goo/gmem.h"
+#include "OutputDev.h"
+#include "GfxState.h"
+#include "GfxFont.h"
+#include <assert.h>
+#include <vector>
+
+class Dict;
+class UnicodeMap;
+
+
+// A piece of text together with the font and color used to draw it.
+// Copies of a TextSpan share the same reference counted data, so copying
+// is cheap and the text and font are released with the last copy.
+class TextSpan {
+public:
+  TextSpan(const TextSpan& other): data(other.data) {
+    data->refcount++;
+  }
+
+  TextSpan& operator=(const TextSpan& other) {
+    // Nothing to do when both spans already share the same data (this
+    // also covers self-assignment).
+    if (data != other.data) {
+      // Take the new reference before releasing the one held until now,
+      // which would otherwise be leaked.
+      other.data->refcount++;
+      if (data && --data->refcount == 0)
+        delete data;
+      data = other.data;
+    }
+    return *this;
+  }
+
+  ~TextSpan() {
+    if (data && --data->refcount == 0)
+      delete data;
+  }
+
+  // The returned objects are owned by the span (and shared by its copies).
+  GfxFont* getFont() const { return data->font; }
+  GooString* getText() const { return data->text; }
+  GfxRGB& getColor() const { return data->color; }
+
+private:
+  // Note: Takes ownership of strings, increases refcount for font.
+  TextSpan(GooString *text,
+           GfxFont *font,
+           const GfxRGB& color)
+      : data(new Data) {
+    data->text = text;
+    data->font = font;
+    data->color = color;
+    if (data->font)
+      data->font->incRefCnt();
+  }
+
+  // Payload shared by all the copies of a span. It is created with a
+  // reference count of one and deleted when the count drops to zero.
+  struct Data {
+    GfxFont   *font;
+    GooString *text;
+    GfxRGB     color;
+    unsigned   refcount;
+
+    Data(): refcount(1) {}
+
+    ~Data() {
+      assert(refcount == 0);
+      if (font)
+        font->decRefCnt();
+      delete text;
+    }
+  };
+
+  Data *data;
+
+  friend class MarkedContentOutputDev;
+};
+
+
+typedef std::vector<TextSpan> TextSpanArray;
+
+
+// Output device which collects, as a list of TextSpan objects, the text
+// drawn inside the marked content sequence with a given MCID.
+class MarkedContentOutputDev: public OutputDev {
+public:
+  MarkedContentOutputDev(int mcidA);
+  virtual ~MarkedContentOutputDev();
+
+  virtual GBool isOk() { return gTrue; }
+  virtual GBool upsideDown() { return gTrue; }
+  virtual GBool useDrawChar() { return gTrue; }
+  virtual GBool interpretType3Chars() { return gFalse; }
+  virtual GBool needNonText() { return gFalse; }
+  virtual GBool needCharCount() { return gFalse; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xref);
+  virtual void endPage();
+
+  virtual void drawChar(GfxState *state,
+                        double xx, double yy,
+                        double dx, double dy,
+                        double ox, double oy,
+                        CharCode c, int nBytes,
+                        Unicode *u, int uLen);
+
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
+  // Spans collected so far; the reference is valid while the device lives.
+  const TextSpanArray& getTextSpans() const;
+
+private:
+
+  void endSpan();
+  bool inMarkedContent() const { return mcidStack.size() > 0; }
+  bool needFontChange(GfxFont* font) const;
+
+  GfxFont         *currentFont;   // Font of the span being collected
+  GooString       *currentText;   // Text of the span being collected
+  GfxRGB           currentColor;  // Color of the span being collected
+  TextSpanArray    textSpans;     // Finished spans
+  int              mcid;          // MCID of the content to extract
+  std::vector<int> mcidStack;     // Non-empty while inside that content
+  double           pageWidth;
+  double           pageHeight;
+  UnicodeMap      *unicodeMap;    // Text encoding, fetched on first use
+};
+
+#endif /* !MARKEDCONTENTOUTPUTDEV_H */
diff --git a/poppler/StructElement.cc b/poppler/StructElement.cc
index 7d893df..3a98658 100644
--- a/poppler/StructElement.cc
+++ b/poppler/StructElement.cc
@@ -14,6 +14,8 @@

 #include "StructElement.h"
 #include "StructTreeRoot.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
 #include "PDFDoc.h"
 #include "Dict.h"

@@ -981,6 +983,60 @@ const Attribute *StructElement::findAttribute(Attribute::Type attributeType, GBo
   return NULL;
 }
 
+// Appends to "string" (allocated on demand) the text of an MCID reference
+// element, or of the whole subtree when "recursive" is set. Returns the
+// string, or NULL for any other element when "recursive" is not set.
+GooString* StructElement::appendSubTreeText(GooString *string, GBool recursive) const
+{
+  if (isContent() && !isObjectRef()) {
+    MarkedContentOutputDev mcdev(getMCID());
+    const TextSpanArray& spans(getTextSpansInternal(mcdev));
+
+    if (!string)
+      string = new GooString();
+
+    for (TextSpanArray::const_iterator i = spans.begin(); i != spans.end(); ++i)
+      string->append(i->getText());
+
+    return string;
+  }
+
+  if (!recursive)
+    return NULL;
+
+  // Do a depth-first traversal, to get elements in logical order
+  if (!string)
+    string = new GooString();
+
+  for (unsigned i = 0; i < getNumElements(); i++)
+    getElement(i)->appendSubTreeText(string, recursive);
+
+  return string;
+}
+
+// Renders the page of the element (or every page, when the page cannot be
+// determined) through "mcdev" and returns the spans collected by it. The
+// reference points into "mcdev" and is valid only while "mcdev" is alive.
+const TextSpanArray& StructElement::getTextSpansInternal(MarkedContentOutputDev& mcdev) const
+{
+  assert(isContent());
+
+  int startPage = 0, endPage = 0;
+
+  Ref ref;
+  if (getPageRef(ref)) {
+    startPage = endPage = treeRoot->getDoc()->findPage(ref.num, ref.gen);
+  }
+
+  if (!(startPage && endPage)) {
+    startPage = 1;
+    endPage = treeRoot->getDoc()->getNumPages();
+  }
+
+  treeRoot->getDoc()->displayPages(&mcdev, startPage, endPage, 72.0, 72.0, 0, gTrue, gFalse, gFalse);
+  return mcdev.getTextSpans();
+}
+
 static StructElement::Type roleMapResolve(Dict *roleMap, const char *name, const char *curName, Object *resolved)
 {
   // Circular reference
diff --git a/poppler/StructElement.h b/poppler/StructElement.h
index 00deef4..b9eef8a 100644
--- a/poppler/StructElement.h
+++ b/poppler/StructElement.h
@@ -17,6 +17,7 @@

 #include "goo/gtypes.h"
 #include "goo/GooString.h"
+#include "MarkedContentOutputDev.h"
 #include "Object.h"
 #include <vector>
 #include <set>
@@ -218,9 +219,36 @@ public:
   const GooString *getActualText() const { return isContent() ? NULL : s->actualText; }
   GooString *getActualText() { return isContent() ? NULL : s->actualText; }
 
+  // Content text referenced by the element:
+  //
+  // - For MCID reference elements, this is just the text of the
+  //   corresponding marked content object in the page stream, regardless
+  //   of the setting of the "recursive" flag.
+  // - For other elements, if the "recursive" flag is set, the text
+  //   enclosed by *all* the child MCID reference elements of the subtree
+  //   is returned. The text is assembled by traversing the leaf MCID
+  //   reference elements in logical order.
+  // - In any other case, the function returns NULL.
+  //
+  // A new string is returned, and the ownership passed to the caller.
+  //
+  GooString *getText(GBool recursive = gTrue) const {
+    return appendSubTreeText(NULL, recursive);
+  }
+
+  // Text spans (text plus font and color) of an MCID reference element.
+  // An empty array is returned for any other element, including object
+  // references: those carry no MCID, and getText() skips them as well.
+  const TextSpanArray getTextSpans() const {
+    if (!isContent() || isObjectRef())
+      return TextSpanArray();
+    MarkedContentOutputDev mcdev(getMCID());
+    return getTextSpansInternal(mcdev);
+  }
+
   ~StructElement();
 
 private:
+  GooString* appendSubTreeText(GooString *string, GBool recursive) const;
+  const TextSpanArray& getTextSpansInternal(MarkedContentOutputDev& mcdev) const;
+
   typedef std::vector<Attribute*>     AttrPtrArray;
   typedef std::vector<StructElement*> ElemPtrArray;
 
-- 
1.8.4.2