From 06cbea7482f3296979613ffc7d1eac7805bff8ab Mon Sep 17 00:00:00 2001
From: Adrian Johnson
Date: Sun, 9 Dec 2007 20:37:49 +1030
Subject: [PATCH] Add support for ActualText attached to span tags

ActualText can be either part of a tagged pdf or in marked content
sequences that are not part of tagged pdf by using the span tag.
This patch implements ActualText support for non tagged pdfs.

Add TextOutputDev::beginMarkedContent and TextOutputDev::endMarkedContent

In beginMarkedContent check if we are in an ActualText span.

TextOutputDev::drawChar has been modified to disable the call to
addChar when inside an ActualText span and instead find the extent of
the span from the extent of each glyph.

endMarkedContent calls addChar with the replacement text and the
extent of the span.
---
 poppler/Form.cc          |    2 +-
 poppler/Gfx.cc           |    2 +-
 poppler/OutputDev.cc     |    2 +-
 poppler/OutputDev.h      |    2 +-
 poppler/PDFDocEncoding.h |    3 +
 poppler/TextOutputDev.cc |  119 +++++++++++++++++++++++++++++++++++++++++++++-
 poppler/TextOutputDev.h  |   11 +++++
 7 files changed, 136 insertions(+), 5 deletions(-)

diff --git a/poppler/Form.cc b/poppler/Form.cc
index 334e45c..5cb4b87 100644
--- a/poppler/Form.cc
+++ b/poppler/Form.cc
@@ -27,7 +27,7 @@
 #include "Catalog.h"
 
 //return a newly allocated char* containing an UTF16BE string of size length
-static char* pdfDocEncodingToUTF16 (GooString* orig, int* length)
+char* pdfDocEncodingToUTF16 (GooString* orig, int* length)
 {
   //double size, a unicode char takes 2 char, add 2 for the unicode marker
   *length = 2+2*orig->getLength();
diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index 163b340..d2b3cb8 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -4036,7 +4036,7 @@ void Gfx::opBeginMarkedContent(Object args[], int numArgs) {
 }
 
 void Gfx::opEndMarkedContent(Object args[], int numArgs) {
-  out->endMarkedContent();
+  out->endMarkedContent(state);
 }
 
 void Gfx::opMarkPoint(Object args[], int numArgs) {
diff --git a/poppler/OutputDev.cc b/poppler/OutputDev.cc
index 59184a9..dedffd3 100644
--- a/poppler/OutputDev.cc
+++ b/poppler/OutputDev.cc
@@ -123,7 +123,7 @@ void OutputDev::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
   drawImage(state, ref, str, width, height, colorMap, NULL, gFalse);
 }
 
-void OutputDev::endMarkedContent() {
+void OutputDev::endMarkedContent(GfxState *state) {
 }
 
 void OutputDev::beginMarkedContent(char *name) {
diff --git a/poppler/OutputDev.h b/poppler/OutputDev.h
index 1e92b16..af042c6 100644
--- a/poppler/OutputDev.h
+++ b/poppler/OutputDev.h
@@ -210,7 +210,7 @@ public:
 
   //----- grouping operators
 
-  virtual void endMarkedContent();
+  virtual void endMarkedContent(GfxState *state);
   virtual void beginMarkedContent(char *name);
   virtual void beginMarkedContent(char *name, Dict *properties);
   virtual void markPoint(char *name);
diff --git a/poppler/PDFDocEncoding.h b/poppler/PDFDocEncoding.h
index 3259d3e..da238c9 100644
--- a/poppler/PDFDocEncoding.h
+++ b/poppler/PDFDocEncoding.h
@@ -10,7 +10,10 @@
 #define PDFDOCENCODING_H
 
 #include "CharTypes.h"
+#include "goo/GooString.h"
 
 extern Unicode pdfDocEncoding[256];
 
+char* pdfDocEncodingToUTF16 (GooString* orig, int* length);
+
 #endif
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index e2aaa43..44019dc 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -32,6 +32,7 @@
 #include "Link.h"
 #include "TextOutputDev.h"
 #include "Page.h"
+#include "PDFDocEncoding.h"
 
 #ifdef MACOS
 // needed for setting type/creator of MacOS files
@@ -4484,6 +4485,7 @@ TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
 
   // set up text object
   text = new TextPage(rawOrderA);
+  actualTextBMCLevel = 0;
 }
 
 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
@@ -4496,6 +4498,7 @@ TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
   doHTML = gFalse;
   text = new TextPage(rawOrderA);
   ok = gTrue;
+  actualTextBMCLevel = 0;
 }
 
 TextOutputDev::~TextOutputDev() {
@@ -4536,7 +4539,121 @@ void TextOutputDev::drawChar(GfxState *state, double x, double y,
                              double dx, double dy,
                              double originX, double originY,
                              CharCode c, int nBytes, Unicode *u, int uLen) {
-  text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
+  if (actualTextBMCLevel == 0) {
+    text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
+  } else {
+    // Inside ActualText span: do not emit the glyph, only grow the
+    // span extent so that it covers this glyph as well.
+    if (newActualTextSpan) {
+      actualText_x = x;
+      actualText_y = y;
+      actualText_dx = dx;
+      actualText_dy = dy;
+      newActualTextSpan = gFalse;
+    } else {
+      // When the origin moves to a smaller coordinate, widen the extent
+      // by the same amount so the far edge recorded so far is not lost.
+      if (x < actualText_x) {
+        actualText_dx += actualText_x - x;
+        actualText_x = x;
+      }
+      if (y < actualText_y) {
+        actualText_dy += actualText_y - y;
+        actualText_y = y;
+      }
+      if (x + dx > actualText_x + actualText_dx)
+        actualText_dx = x + dx - actualText_x;
+      if (y + dy > actualText_y + actualText_dy)
+        actualText_dy = y + dy - actualText_y;
+    }
+  }
+}
+
+// Begin of marked content that carries a property dictionary.  Opens an
+// ActualText span if the dictionary has a string ActualText entry; while
+// a span is already open, only the nesting depth is counted.
+void TextOutputDev::beginMarkedContent(char *name, Dict *properties)
+{
+  Object obj;
+
+  if (actualTextBMCLevel > 0) {
+    // Already inside an ActualText span.
+    actualTextBMCLevel++;
+    return;
+  }
+
+  if (properties->lookup("ActualText", &obj)) {
+    if (obj.isString()) {
+      // Keep a private copy: obj is released before returning.
+      actualText = obj.getString()->copy();
+      actualTextBMCLevel = 1;
+      newActualTextSpan = gTrue;
+    }
+  }
+  obj.free();
+}
+
+// End of marked content.  When this closes the outermost level of an
+// ActualText span, the replacement text is passed to addChar in one call
+// together with the extent collected by drawChar.
+void TextOutputDev::endMarkedContent(GfxState *state)
+{
+  char *uniString = NULL;
+  Unicode *uni;
+  int length, i;
+
+  if (actualTextBMCLevel > 0) {
+    actualTextBMCLevel--;
+    if (actualTextBMCLevel == 0) {
+      // ActualText span closed. Output the span text and the
+      // extents of all the glyphs inside the span
+
+      if (newActualTextSpan) {
+        // No content inside span.
+        actualText_x = state->getCurX();
+        actualText_y = state->getCurY();
+        actualText_dx = 0;
+        actualText_dy = 0;
+      }
+
+      if (!actualText->hasUnicodeMarker()) {
+        if (actualText->getLength() > 0) {
+          //non-unicode string -- assume pdfDocEncoding and
+          //try to convert to UTF16BE
+          uniString = pdfDocEncodingToUTF16(actualText, &length);
+        } else {
+          length = 0;
+        }
+      } else {
+        uniString = actualText->getCString();
+        length = actualText->getLength();
+      }
+
+      // Skip the 2 byte marker; each following byte pair is one code unit.
+      if (length < 2)
+        length = 0;
+      else
+        length = length/2 - 1;
+      uni = new Unicode[length];
+      // Mask each byte: plain char may be signed, and a byte >= 0x80
+      // would otherwise sign-extend and corrupt the code unit.
+      for (i = 0 ; i < length; i++)
+        uni[i] = ((uniString[2 + i*2] & 0xff) << 8) |
+                 (uniString[2 + i*2+1] & 0xff);
+
+      text->addChar(state,
+                    actualText_x, actualText_y,
+                    actualText_dx, actualText_dy,
+                    0, 1, uni, length);
+
+      delete [] uni;
+      if (!actualText->hasUnicodeMarker())
+        delete [] uniString;
+      // Release the copy made in beginMarkedContent.
+      delete actualText;
+      actualText = NULL;
+    }
+  }
 }
 
 void TextOutputDev::stroke(GfxState *state) {
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index 2808a9d..db40a44 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -651,6 +651,10 @@ public:
                         double originX, double originY,
                         CharCode c, int nBytes, Unicode *u, int uLen);
 
+  //----- grouping operators
+  virtual void beginMarkedContent(char *name, Dict *properties);
+  virtual void endMarkedContent(GfxState *state);
+
   //----- path painting
   virtual void stroke(GfxState *state);
   virtual void fill(GfxState *state);
@@ -725,6 +729,13 @@ private:
   GBool rawOrder;               // keep text in content stream order
   GBool doHTML;                 // extra processing for HTML conversion
   GBool ok;                     // set up ok?
+
+  int actualTextBMCLevel;       // > 0 when inside ActualText span. Incremented
+                                // for each nested BMC inside the span.
+  GooString *actualText;        // replacement text for the span (owned copy)
+  GBool newActualTextSpan;      // true at start of span. used to init the extent
+  double actualText_x, actualText_y; // extent of the text inside the span
+  double actualText_dx, actualText_dy;
 };
 
 #endif
-- 
1.5.2.4