From 0e0804cf6ca707189282b3dbe0753f631c556d8b Mon Sep 17 00:00:00 2001 From: Adrian Johnson Date: Wed, 16 Aug 2017 21:01:07 +0930 Subject: [PATCH] pdfimages: support listing/extracting inline images The difficulty with extracting inline images is that inline images do not provide any way of determining the length or end of image data without decoding the image. We can get the length by using ImageStream to decode the data and then checking the stream position. But then we are still unable to extract the undecoded image data because embedded streams can only be read once. Since inline images tend to be small, the solution implemented is to modify EmbedStream to keep a copy of the data read from it in memory and then allow the data to be read again. Two new functions have been added to EmbedStream. rewind() will cause EmbedStream.getChar() to stop recording data and switch to replaying the saved data, returning EOF when the end of the saved data is reached. The restore() function will make getChar() switch back to reading from the parent stream. ImageOutputDev can now extract an inline image or get its size by first using ImageStream to read data from the embedded stream. After calling rewind(), the undecoded image data can be read from the embedded stream until EOF is returned. Then restore() is called so that Gfx can read the 'EI' from the end of the embedded stream. 
--- poppler/Gfx.cc | 2 +- poppler/Stream.cc | 96 +++++++++++++++++++++++++++++++++++++++++++------ poppler/Stream.h | 14 ++++++-- utils/ImageOutputDev.cc | 67 ++++++++++++++++++++++++++++++---- utils/ImageOutputDev.h | 1 + 5 files changed, 159 insertions(+), 21 deletions(-) diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc index be9810e1..2bfc1ecd 100644 --- a/poppler/Gfx.cc +++ b/poppler/Gfx.cc @@ -4901,7 +4901,7 @@ Stream *Gfx::buildImageStream() { // make stream if (parser->getStream()) { - str = new EmbedStream(parser->getStream(), std::move(dict), gFalse, 0); + str = new EmbedStream(parser->getStream(), std::move(dict), gFalse, 0, gTrue); str = str->addFilters(str->getDict()); } else { str = NULL; diff --git a/poppler/Stream.cc b/poppler/Stream.cc index df767652..9cb48abc 100644 --- a/poppler/Stream.cc +++ b/poppler/Stream.cc @@ -1033,14 +1033,25 @@ void MemStream::moveStart(Goffset delta) { //------------------------------------------------------------------------ EmbedStream::EmbedStream(Stream *strA, Object &&dictA, - GBool limitedA, Goffset lengthA): + GBool limitedA, Goffset lengthA, GBool reusableA): BaseStream(std::move(dictA), lengthA) { str = strA; limited = limitedA; length = lengthA; + reusable = reusableA; + record = gFalse; + replay = gFalse; + if (reusable) { + bufData = (unsigned char*)gmalloc(16384); + bufMax = 16384; + bufLen = 0; + record = gTrue; + } } EmbedStream::~EmbedStream() { + if (reusable) + gfree(bufData); } BaseStream *EmbedStream::copy() { @@ -1054,31 +1065,94 @@ Stream *EmbedStream::makeSubStream(Goffset start, GBool limitedA, return NULL; } +void EmbedStream::rewind() { + record = gFalse; + replay = gTrue; + bufPos = 0; +} + +void EmbedStream::restore() { + replay = gFalse; +} + +Goffset EmbedStream::getPos() { + if (replay) + return bufPos; + else + return str->getPos(); +} + int EmbedStream::getChar() { - if (limited && !length) { - return EOF; + if (replay) { + if (bufPos < bufLen) + return bufData[bufPos++]; + else + return EOF; 
+ } else { + if (limited && !length) { + return EOF; + } + int c = str->getChar(); + --length; + if (record) { + bufData[bufLen] = c; + bufLen++; + if (bufLen >= bufMax) { + bufMax *= 2; + bufData = (unsigned char *)grealloc(bufData, bufMax); + } + } + return c; } - --length; - return str->getChar(); } int EmbedStream::lookChar() { - if (limited && !length) { - return EOF; + if (replay) { + if (bufPos < bufLen) + return bufData[bufPos]; + else + return EOF; + } else { + if (limited && !length) { + return EOF; + } + return str->lookChar(); } - return str->lookChar(); } int EmbedStream::getChars(int nChars, Guchar *buffer) { + int len; + if (nChars <= 0) { return 0; } - if (limited && length < nChars) { - nChars = length; + if (replay) { + if (bufPos >= bufLen) + return EOF; + len = bufLen - bufPos; + if (nChars > len) + nChars = len; + memcpy(buffer, bufData, len); + return len; + } else { + if (limited && length < nChars) { + nChars = length; + } + len = str->doGetChars(nChars, buffer); + if (record) { + if (bufLen + len >= bufMax) { + while (bufLen + len >= bufMax) + bufMax *= 2; + bufData = (unsigned char *)grealloc(bufData, bufMax); + } + memcpy(bufData+bufLen, buffer, len); + bufLen += len; + } } - return str->doGetChars(nChars, buffer); + return len; } + void EmbedStream::setPos(Goffset pos, int dir) { error(errInternal, -1, "Internal: called setPos() on EmbedStream"); } diff --git a/poppler/Stream.h b/poppler/Stream.h index 2317080e..7e67697c 100644 --- a/poppler/Stream.h +++ b/poppler/Stream.h @@ -607,7 +607,7 @@ private: class EmbedStream: public BaseStream { public: - EmbedStream(Stream *strA, Object &&dictA, GBool limitedA, Goffset lengthA); + EmbedStream(Stream *strA, Object &&dictA, GBool limitedA, Goffset lengthA, GBool reusableA = gFalse); ~EmbedStream(); BaseStream *copy() override; Stream *makeSubStream(Goffset start, GBool limitedA, @@ -616,7 +616,7 @@ public: void reset() override {} int getChar() override; int lookChar() override; - Goffset 
getPos() override { return str->getPos(); } + Goffset getPos() override; void setPos(Goffset pos, int dir = 0) override; Goffset getStart() override; void moveStart(Goffset delta) override; @@ -624,6 +624,8 @@ public: int getUnfilteredChar () override { return str->getUnfilteredChar(); } void unfilteredReset () override { str->unfilteredReset(); } + void rewind(); + void restore(); private: @@ -632,6 +634,14 @@ private: Stream *str; GBool limited; + GBool reusable; + GBool record; + GBool replay; + unsigned char *bufData; + long bufMax; + long bufLen; + long bufPos; + }; //------------------------------------------------------------------------ diff --git a/utils/ImageOutputDev.cc b/utils/ImageOutputDev.cc index f6fb35dd..33cbb714 100644 --- a/utils/ImageOutputDev.cc +++ b/utils/ImageOutputDev.cc @@ -246,7 +246,9 @@ void ImageOutputDev::listImage(GfxState *state, Object *ref, Stream *str, printf("%5.0f ", yppi); Goffset embedSize = -1; - if (!inlineImg) + if (inlineImg) + embedSize = getInlineImageLength(str, width, height, colorMap); + else embedSize = str->getBaseStream()->getLength(); long long imageSize = 0; @@ -311,6 +313,43 @@ void ImageOutputDev::listImage(GfxState *state, Object *ref, Stream *str, } } +long ImageOutputDev::getInlineImageLength(Stream *str, int width, int height, + GfxImageColorMap *colorMap) { + long len; + + if (colorMap) { + ImageStream *imgStr = new ImageStream(str, width, colorMap->getNumPixelComps(), + colorMap->getBits()); + imgStr->reset(); + for (int y = 0; y < height; y++) + imgStr->getLine(); + + imgStr->close(); + delete imgStr; + } else { + str->reset(); + for (int y = 0; y < height; y++) { + int size = (width + 7)/8; + for (int x = 0; x < size; x++) + str->getChar(); + } + } + + EmbedStream *embedStr = (EmbedStream *) (str->getBaseStream()); + embedStr->rewind(); + if (str->getKind() == strDCT || str->getKind() == strCCITTFax) + str = str->getNextStream(); + len = 0; + str->reset(); + while (str->getChar() != EOF) + len++; + + 
embedStr->restore(); + + + return len; +} + void ImageOutputDev::writeRawImage(Stream *str, const char *ext) { FILE *f; int c; @@ -498,15 +537,21 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, GBool inlineImg) { ImageFormat format; + EmbedStream *embedStr; - if (dumpJPEG && str->getKind() == strDCT && - (colorMap->getNumPixelComps() == 1 || - colorMap->getNumPixelComps() == 3) && - !inlineImg) { + if (dumpJPEG && str->getKind() == strDCT) { + if (inlineImg) { + embedStr = (EmbedStream *) (str->getBaseStream()); + getInlineImageLength(str, width, height, colorMap); // record the stream + embedStr->rewind(); + } // dump JPEG file writeRawImage(str, "jpg"); + if (inlineImg) + embedStr->restore(); + } else if (dumpJP2 && str->getKind() == strJPX && !inlineImg) { // dump JPEG2000 file writeRawImage(str, "jp2"); @@ -535,7 +580,7 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str, // dump JBIG2 embedded file writeRawImage(str, "jb2e"); - } else if (dumpCCITT && str->getKind() == strCCITTFax && !inlineImg) { + } else if (dumpCCITT && str->getKind() == strCCITTFax) { // write CCITT parameters CCITTFaxStream *ccittStr = static_cast<CCITTFaxStream *>(str); FILE *f; @@ -567,14 +612,22 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str, fclose(f); + if (inlineImg) { + embedStr = (EmbedStream *) (str->getBaseStream()); + getInlineImageLength(str, width, height, colorMap); // record the stream + embedStr->rewind(); + } + // dump CCITT file writeRawImage(str, "ccitt"); + if (inlineImg) + embedStr->restore(); + } else if (outputPNG && !(outputTiff && colorMap && (colorMap->getColorSpace()->getMode() == csDeviceCMYK || (colorMap->getColorSpace()->getMode() == csICCBased && colorMap->getNumPixelComps() == 4)))) { - // output in PNG format #if ENABLE_LIBPNG diff --git a/utils/ImageOutputDev.h b/utils/ImageOutputDev.h index 22954cf0..baccd8ef 100644 --- 
a/utils/ImageOutputDev.h +++ b/utils/ImageOutputDev.h @@ -160,6 +160,7 @@ private: void writeRawImage(Stream *str, const char *ext); void writeImageFile(ImgWriter *writer, ImageFormat format, const char *ext, Stream *str, int width, int height, GfxImageColorMap *colorMap); + long getInlineImageLength(Stream *str, int width, int height, GfxImageColorMap *colorMap); char *fileRoot; // root of output file names char *fileName; // buffer for output file names -- 2.11.0