From bd78218f342eb3697ddf4e625382e8f140763dab Mon Sep 17 00:00:00 2001
From: Adrian Johnson
Date: Sun, 25 Aug 2013 09:37:46 +0930
Subject: [PATCH] pdfimages: support listing/extracting inline images

The difficulty with extracting inline images is that inline images do
not provide any way of determining the length or end of image data
without decoding the image.

We can get the length by using ImageStream to decode the data then
check the stream position. But then we are still unable to extract the
undecoded image data because embedded streams can only be read once.

Since inline images tend to be small, the solution implemented is to
modify EmbedStream to keep a copy of the data read from it in memory
and then allow the data to be read again.

Two new functions have been added to EmbedStream. rewind() will cause
EmbedStream.getChar() to stop recording data and switch to replaying
the saved data, returning EOF when the end of the saved data is
reached. The restore() function will make getChar() switch back to
reading from the parent stream.

ImageOutputDev can now extract or get the image size by first using
ImageStream to read data from the embedded stream. After calling
rewind() the undecoded image data can be read from the embedded stream
until EOF is returned. Then restore() is called so that Gfx can read
the 'EI' from the end of the embedded stream.
---
Review notes (ignored by git am):
- EmbedStream::getChars() replay path now copies from the current replay
  position, honours nChars, advances bufPos and returns a byte count.
- EmbedStream::getChar() no longer records the EOF sentinel.
- The replay buffer members are always initialised in the constructor.
- Leading whitespace in this patch was reconstructed with spaces; use
  "git apply --ignore-whitespace" if context lines fail to match.

 poppler/Gfx.cc          |   2 +-
 poppler/Stream.cc       | 108 ++++++++++++++++++++++++++++++++++++++++++-----
 poppler/Stream.h        |  14 ++++++--
 utils/ImageOutputDev.cc |  68 +++++++++++++++++++++++++++++----
 utils/ImageOutputDev.h  |   1 +
 5 files changed, 173 insertions(+), 20 deletions(-)

diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc
index f2971d6..62850e7 100644
--- a/poppler/Gfx.cc
+++ b/poppler/Gfx.cc
@@ -4956,7 +4956,7 @@ Stream *Gfx::buildImageStream() {
 
   // make stream
   if (parser->getStream()) {
-    str = new EmbedStream(parser->getStream(), &dict, gFalse, 0);
+    str = new EmbedStream(parser->getStream(), &dict, gFalse, 0, gTrue);
     str = str->addFilters(&dict);
   } else {
     str = NULL;
diff --git a/poppler/Stream.cc b/poppler/Stream.cc
index 41cb8c1..98ff1ea 100644
--- a/poppler/Stream.cc
+++ b/poppler/Stream.cc
@@ -1040,14 +1040,30 @@ void MemStream::moveStart(Goffset delta) {
 //------------------------------------------------------------------------
 
 EmbedStream::EmbedStream(Stream *strA, Object *dictA,
-                         GBool limitedA, Goffset lengthA):
+                         GBool limitedA, Goffset lengthA, GBool reusableA):
     BaseStream(dictA, lengthA) {
   str = strA;
   limited = limitedA;
   length = lengthA;
+  reusable = reusableA;
+  record = gFalse;
+  replay = gFalse;
+  // Give the replay buffer a defined empty state even when the stream is not
+  // reusable, so rewind() then replays nothing instead of reading garbage.
+  bufData = NULL;
+  bufMax = 0;
+  bufLen = 0;
+  bufPos = 0;
+  if (reusable) {
+    bufData = (unsigned char*)gmalloc(16384);
+    bufMax = 16384;
+    record = gTrue;
+  }
 }
 
 EmbedStream::~EmbedStream() {
+  if (reusable)
+    gfree(bufData);
 }
 
 BaseStream *EmbedStream::copy() {
@@ -1061,31 +1077,101 @@ Stream *EmbedStream::makeSubStream(Goffset start, GBool limitedA,
   return NULL;
 }
 
+// Stop recording and replay the recorded bytes from the beginning.
+void EmbedStream::rewind() {
+  record = gFalse;
+  replay = gTrue;
+  bufPos = 0;
+}
+
+// Leave replay mode; reads go to the parent stream again.
+void EmbedStream::restore() {
+  replay = gFalse;
+}
+
+// Position within the recording while replaying, else the parent's position.
+Goffset EmbedStream::getPos() {
+  if (replay)
+    return bufPos;
+  else
+    return str->getPos();
+}
+
 int EmbedStream::getChar() {
-  if (limited && !length) {
-    return EOF;
+  if (replay) {
+    if (bufPos < bufLen)
+      return bufData[bufPos++];
+    else
+      return EOF;
+  } else {
+    if (limited && !length) {
+      return EOF;
+    }
+    int c = str->getChar();
+    --length;
+    // EOF is not stream data; recording it would add a bogus byte.
+    if (record && c != EOF) {
+      bufData[bufLen] = c;
+      bufLen++;
+      if (bufLen >= bufMax) {
+        bufMax *= 2;
+        bufData = (unsigned char *)grealloc(bufData, bufMax);
+      }
+    }
+    return c;
   }
-  --length;
-  return str->getChar();
 }
 
 int EmbedStream::lookChar() {
-  if (limited && !length) {
-    return EOF;
+  if (replay) {
+    if (bufPos < bufLen)
+      return bufData[bufPos];
+    else
+      return EOF;
+  } else {
+    if (limited && !length) {
+      return EOF;
+    }
+    return str->lookChar();
   }
-  return str->lookChar();
 }
 
 int EmbedStream::getChars(int nChars, Guchar *buffer) {
+  int len;
+
   if (nChars <= 0) {
     return 0;
   }
-  if (limited && length < nChars) {
-    nChars = length;
+  if (replay) {
+    // The result is a byte count, so report 0 rather than EOF when exhausted.
+    if (bufPos >= bufLen)
+      return 0;
+    // Copy from the current replay position, never more than the caller
+    // asked for, and advance past the bytes handed out.
+    if (nChars > bufLen - bufPos)
+      nChars = (int)(bufLen - bufPos);
+    memcpy(buffer, bufData + bufPos, nChars);
+    bufPos += nChars;
+    return nChars;
+  } else {
+    if (limited && length < nChars) {
+      nChars = length;
+    }
+    len = str->doGetChars(nChars, buffer);
+    if (record) {
+      if (bufLen + len >= bufMax) {
+        while (bufLen + len >= bufMax)
+          bufMax *= 2;
+        bufData = (unsigned char *)grealloc(bufData, bufMax);
+      }
+      memcpy(bufData+bufLen, buffer, len);
+      bufLen += len;
+    }
   }
-  return str->doGetChars(nChars, buffer);
+  return len;
 }
 
+
 void EmbedStream::setPos(Goffset pos, int dir) {
   error(errInternal, -1, "Internal: called setPos() on EmbedStream");
 }
diff --git a/poppler/Stream.h b/poppler/Stream.h
index 00b2925..87e4141 100644
--- a/poppler/Stream.h
+++ b/poppler/Stream.h
@@ -608,7 +608,7 @@ private:
 class EmbedStream: public BaseStream {
 public:
 
-  EmbedStream(Stream *strA, Object *dictA, GBool limitedA, Goffset lengthA);
+  EmbedStream(Stream *strA, Object *dictA, GBool limitedA, Goffset lengthA, GBool reusableA = gFalse);
   virtual ~EmbedStream();
   virtual BaseStream *copy();
   virtual Stream *makeSubStream(Goffset start, GBool limitedA,
@@ -617,7 +617,7 @@ public:
   virtual void reset() {}
   virtual int getChar();
   virtual int lookChar();
-  virtual Goffset getPos() { return str->getPos(); }
+  virtual Goffset getPos();
   virtual void setPos(Goffset pos, int dir = 0);
   virtual Goffset getStart();
   virtual void moveStart(Goffset delta);
@@ -625,6 +625,8 @@ public:
   virtual int getUnfilteredChar () { return str->getUnfilteredChar(); }
   virtual void unfilteredReset () { str->unfilteredReset(); }
 
+  void rewind();
+  void restore();
 
 private:
 
@@ -633,6 +635,14 @@ private:
 
   Stream *str;
   GBool limited;
+  GBool reusable;
+  GBool record;
+  GBool replay;
+  unsigned char *bufData;
+  long bufMax;
+  long bufLen;
+  long bufPos;
+
 };
 
 //------------------------------------------------------------------------
diff --git a/utils/ImageOutputDev.cc b/utils/ImageOutputDev.cc
index ae7d309..3ec3074 100644
--- a/utils/ImageOutputDev.cc
+++ b/utils/ImageOutputDev.cc
@@ -241,7 +241,9 @@ void ImageOutputDev::listImage(GfxState *state, Object *ref, Stream *str,
   printf("%5.0f ", yppi);
 
   Goffset embedSize = -1;
-  if (!inlineImg)
+  if (inlineImg)
+    embedSize = getInlineImageLength(str, width, height, colorMap);
+  else
     embedSize = str->getBaseStream()->getLength();
 
   long long imageSize = 0;
@@ -286,6 +288,45 @@ void ImageOutputDev::listImage(GfxState *state, Object *ref, Stream *str,
   ++imgNum;
 }
 
+// Read the whole inline image once so that the EmbedStream records it, then
+// rewind the recording and count the bytes obtained by re-reading until EOF.
+long ImageOutputDev::getInlineImageLength(Stream *str, int width, int height,
+                                          GfxImageColorMap *colorMap) {
+  long len;
+
+  if (colorMap) {
+    ImageStream *imgStr = new ImageStream(str, width, colorMap->getNumPixelComps(),
+                                          colorMap->getBits());
+    imgStr->reset();
+    for (int y = 0; y < height; y++)
+      imgStr->getLine();
+
+    imgStr->close();
+    delete imgStr;
+  } else {
+    str->reset();
+    for (int y = 0; y < height; y++) {
+      int size = (width + 7)/8;
+      for (int x = 0; x < size; x++)
+        str->getChar();
+    }
+  }
+
+  EmbedStream *embedStr = (EmbedStream *) (str->getBaseStream());
+  embedStr->rewind();
+  if (str->getKind() == strDCT || str->getKind() == strCCITTFax)
+    str = str->getNextStream();
+  len = 0;
+  str->reset();
+  while (str->getChar() != EOF)
+    len++;
+
+  embedStr->restore();
+
+
+  return len;
+}
+
 void ImageOutputDev::writeRawImage(Stream *str, const char *ext) {
   FILE *f;
   int c;
@@ -417,15 +458,21 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str,
                                 int width, int height,
                                 GfxImageColorMap *colorMap, GBool inlineImg) {
   ImageFormat format;
+  EmbedStream *embedStr;
 
-  if (dumpJPEG && str->getKind() == strDCT &&
-      (colorMap->getNumPixelComps() == 1 ||
-       colorMap->getNumPixelComps() == 3) &&
-      !inlineImg) {
+  if (dumpJPEG && str->getKind() == strDCT) {
+    if (inlineImg) {
+      embedStr = (EmbedStream *) (str->getBaseStream());
+      getInlineImageLength(str, width, height, colorMap); // record the stream
+      embedStr->rewind();
+    }
 
     // dump JPEG file
     writeRawImage(str, "jpg");
 
+    if (inlineImg)
+      embedStr->restore();
+
   } else if (dumpJP2 && str->getKind() == strJPX && !inlineImg) {
     // dump JPEG2000 file
     writeRawImage(str, "jp2");
@@ -454,7 +501,7 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str,
     // dump JBIG2 embedded file
     writeRawImage(str, "jb2e");
 
-  } else if (dumpCCITT && str->getKind() == strCCITTFax && !inlineImg) {
+  } else if (dumpCCITT && str->getKind() == strCCITTFax) {
     // write CCITT parameters
     CCITTFaxStream *ccittStr = static_cast<CCITTFaxStream *>(str);
     FILE *f;
@@ -486,9 +533,18 @@ void ImageOutputDev::writeImage(GfxState *state, Object *ref, Stream *str,
 
     fclose(f);
 
+    if (inlineImg) {
+      embedStr = (EmbedStream *) (str->getBaseStream());
+      getInlineImageLength(str, width, height, colorMap); // record the stream
+      embedStr->rewind();
+    }
+
     // dump CCITT file
     writeRawImage(str, "ccitt");
 
+    if (inlineImg)
+      embedStr->restore();
+
   } else if (outputPNG) {
 
     // output in PNG format
diff --git a/utils/ImageOutputDev.h b/utils/ImageOutputDev.h
index 8d0785c..33002e4 100644
--- a/utils/ImageOutputDev.h
+++ b/utils/ImageOutputDev.h
@@ -158,6 +158,7 @@ private:
   void writeRawImage(Stream *str, const char *ext);
   void writeImageFile(ImgWriter *writer, ImageFormat format, const char *ext,
                       Stream *str, int width, int height, GfxImageColorMap *colorMap);
+  long getInlineImageLength(Stream *str, int width, int height, GfxImageColorMap *colorMap);
 
   char *fileRoot;               // root of output file names
   char *fileName;               // buffer for output file names
-- 
1.8.1.2