From 1106d7f2f1f86392f2183c89668edee50bbe2a5c Mon Sep 17 00:00:00 2001 From: Joshua Richardson Date: Fri, 10 Jun 2011 15:17:54 -0700 Subject: [PATCH 2/3] Support text rotation for pdftohtml. --- poppler/Gfx.cc | 1 + poppler/GlobalParams.h | 1 + poppler/popplerUtils.h | 70 +++++++++++++++++++++++++++++++++++++++ utils/HtmlFonts.cc | 37 +++++++++++++++++++-- utils/HtmlFonts.h | 11 +++++- utils/HtmlOutputDev.cc | 86 ++++++++++++++++++++++++++++++++++++++++++++--- utils/HtmlOutputDev.h | 4 ++ utils/pdftohtml.cc | 4 ++ 8 files changed, 204 insertions(+), 10 deletions(-) create mode 100644 poppler/popplerUtils.h diff --git a/poppler/Gfx.cc b/poppler/Gfx.cc index dc5f8e3..cdb2fba 100644 --- a/poppler/Gfx.cc +++ b/poppler/Gfx.cc @@ -3826,6 +3826,7 @@ void Gfx::doShowText(GooString *s) { parser = oldParser; } else if (out->useDrawChar()) { + // it appears text rise is shift in Y only irrespective of writing mode! state->textTransformDelta(0, state->getRise(), &riseX, &riseY); p = s->getCString(); len = s->getLength(); diff --git a/poppler/GlobalParams.h b/poppler/GlobalParams.h index 5fff49d..4c9f5a3 100644 --- a/poppler/GlobalParams.h +++ b/poppler/GlobalParams.h @@ -157,6 +157,7 @@ enum ScreenType { //------------------------------------------------------------------------ +// Parameters that need to be accessible across all components of Poppler class GlobalParams { public: diff --git a/poppler/popplerUtils.h b/poppler/popplerUtils.h new file mode 100644 index 0000000..65bd9a8 --- /dev/null +++ b/poppler/popplerUtils.h @@ -0,0 +1,70 @@ +// +// popplerUtils.h +// +// Created on: Jun 8, 2011 +// Author: Joshua Richardson +// Copyright 2011 +// +// All changes made under the Poppler project to this file are licensed +// under GPL version 2 or later +// +// Copyright (C) 2011 Joshua Richardson +// +// To see a description of the changes please see the Changelog file that +// came with your tarball or type make ChangeLog if you are building from git +// 
+//========================================================================
+
+#ifndef UTILS_H_
+#define UTILS_H_
+
+#include <math.h> // fabs
+#include "goo/gtypes.h" // GBool
+
+namespace poppler {
+
+// Returns true iff the difference between a and b is less than the threshold
+// We always use fuzzy math when comparing decimal numbers due to imprecision
+inline GBool is_within(double a, double thresh, double b) {
+  return fabs(a-b) < thresh;
+}
+
+// Returns true iff the first four entries of mat0 and mat1 each differ by less than .1
+inline GBool rot_matrices_equal(const double * const mat0, const double * const mat1) {
+  return is_within(mat0[0], .1, mat1[0]) && is_within(mat0[1], .1, mat1[1]) &&
+    is_within(mat0[2], .1, mat1[2]) && is_within(mat0[3], .1, mat1[3]);
+}
+
+// rotation is (cos q, sin q, -sin q, cos q, 0, 0)
+// sin q is zero iff there is no rotation, or 180 deg. rotation;
+// for 180 rotation, cos q will be negative
+inline GBool isMatRotOrSkew(const double * const mat) {
+  return mat[0] < 0 || !is_within(mat[1], .1, 0);
+}
+
+// Stores the 2x2 product mat0 x mat1 in mat2; mat2 may alias mat0 or mat1
+inline void multRotMat(const double * const mat0, const double * const mat1, double * const mat2) {
+  double tmp[4]; // scratch result, used only when mat2 aliases an input
+  double *dest = ((mat0 == mat2 || mat1 == mat2) ? tmp : mat2);
+
+  // [ a b  X  [ e f  ==  [ ae+bg af+bh
+  //   c d ]     g h ]      ce+dg cf+dh ]
+  dest[0] = mat0[0] * mat1[0] + mat0[1] * mat1[2];
+  dest[1] = mat0[0] * mat1[1] + mat0[1] * mat1[3];
+  dest[2] = mat0[2] * mat1[0] + mat0[3] * mat1[2];
+  dest[3] = mat0[2] * mat1[1] + mat0[3] * mat1[3];
+  if (dest == tmp) for (int i = 0; i < 4; i++) mat2[i] = tmp[i];
+}
+
+// Alters the matrix so that it does not scale a vector's x component;
+// If the matrix does not skew, then that will also normalize the y
+// component, keeping any rotation, but removing scaling.
+inline void normalizeRotMat(double *mat) {
+  double scale = sqrt(mat[0] * mat[0] + mat[1] * mat[1]); // length of the image of the x unit vector
+  if (!scale) return; // degenerate matrix: nothing sensible to normalize by
+  for (int i = 0; i < 4; i++) mat[i] /= scale;
+}
+
+} // namespace poppler
+
+#endif /* UTILS_H_ */
diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc index 4ada32b..e4af947 100644 --- a/utils/HtmlFonts.cc +++ b/utils/HtmlFonts.cc @@ -21,6 +21,7 @@ // Copyright (C) 2008 Boris Toloknov // Copyright (C) 2008 Tomas Are Haavet // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in) +// Copyright (C) 2011 Joshua Richardson // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -31,6 +32,9 @@ #include "GlobalParams.h" #include "UnicodeMap.h" #include +#include "popplerUtils.h" + +using namespace poppler; struct Fonts{ char *Fontname; @@ -119,6 +123,7 @@ HtmlFont::HtmlFont(GooString* ftname,int _size, GfxRGB rgb){ size=(_size-1); italic = gFalse; bold = gFalse; + rotOrSkewed = gFalse; if (fontname){ if (strstr(fontname->lowerCase()->getCString(),"bold")) bold=gTrue; @@ -147,6 +152,8 @@ HtmlFont::HtmlFont(const HtmlFont& x){ pos=x.pos; color=x.color; if (x.FontName) FontName=new GooString(x.FontName); + rotOrSkewed = x.rotOrSkewed; + bcopy(x.rotSkewMat, rotSkewMat, sizeof(rotSkewMat)); } @@ -175,14 +182,15 @@ void HtmlFont::clear(){ /* - This function is used to compare font uniquily for insertion into + This function is used to compare font uniquely for insertion into the list of all encountered fonts */ GBool HtmlFont::isEqual(const HtmlFont& x) const{ - return ((size==x.size) && + return (size==x.size) && (lineSize==x.lineSize) && (pos==x.pos) && (bold==x.bold) && (italic==x.italic) && - (color.isEqual(x.getColor()))); + (color.isEqual(x.getColor())) && isRotOrSkewed() == x.isRotOrSkewed() && + (!isRotOrSkewed() || rot_matrices_equal(getRotMat(), x.getRotMat())); } /* @@ -342,6 +350,29 @@ GooString* 
HtmlFontAccu::CSStyle(int i, int j){ tmp->append(fontName); //font.getFontName()); tmp->append(";color:"); tmp->append(colorStr); + // if there is rotation or skew, include the matrix + if (font.isRotOrSkewed()) { + const double * const text_mat = font.getRotMat(); + GooString matrix_str(" matrix("); + matrix_str.appendf("{0:10.10g}, {1:10.10g}, {2:10.10g}, {3:10.10g}, 0, 0)", + text_mat[0], text_mat[1], text_mat[2], text_mat[3]); + tmp->append(";-moz-transform:"); + tmp->append(&matrix_str); + tmp->append(";-webkit-transform:"); + tmp->append(&matrix_str); + tmp->append(";-o-transform:"); + tmp->append(&matrix_str); + tmp->append(";-ms-transform:"); + tmp->append(&matrix_str); + // Todo: 75% is a wild guess that seems to work pretty well; + // We probably need to calculate the real percentage + // Based on the characteristic baseline and bounding box of current font + // PDF origin is at baseline + tmp->append(";-moz-transform-origin: left 75%"); + tmp->append(";-webkit-transform-origin: left 75%"); + tmp->append(";-o-transform-origin: left 75%"); + tmp->append(";-ms-transform-origin: left 75%"); + } tmp->append(";}"); } if (xml) { diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h index a0ca78a..607979c 100644 --- a/utils/HtmlFonts.h +++ b/utils/HtmlFonts.h @@ -19,6 +19,7 @@ // // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in) // Copyright (C) 2010 Albert Astals Cid +// Copyright (C) 2011 Joshua Richardson // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -31,6 +32,7 @@ #include "GfxState.h" #include "CharTypes.h" #include +#include // bcopy class HtmlFontColor{ private: @@ -61,14 +63,16 @@ class HtmlFont{ int lineSize; GBool italic; GBool bold; + GBool rotOrSkewed; int pos; // position of the font name in the fonts array static GooString *DefaultFont; GooString *FontName; HtmlFontColor 
color; static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); + double rotSkewMat[4]; // only four values needed for rotation and skew public: - HtmlFont(){FontName=NULL;}; + HtmlFont(){FontName=NULL; rotOrSkewed = gFalse; }; HtmlFont(GooString* fontname,int _size, GfxRGB rgb); HtmlFont(const HtmlFont& x); HtmlFont& operator=(const HtmlFont& x); @@ -78,9 +82,13 @@ public: GooString* getFullName(); GBool isItalic() const {return italic;} GBool isBold() const {return bold;} + GBool isRotOrSkewed() const { return rotOrSkewed; } unsigned int getSize() const {return size;} int getLineSize() const {return lineSize;} void setLineSize(int _lineSize) { lineSize = _lineSize; } + void setRotMat(const double * const mat) + { rotOrSkewed = gTrue; bcopy(mat, rotSkewMat, sizeof(rotSkewMat)); } + const double *getRotMat() const { return rotSkewMat; } GooString* getFontName(); static GooString* getDefaultFont(); static void setDefaultFont(GooString* defaultFont); @@ -90,6 +98,7 @@ public: void print() const {printf("font: %s %d %s%spos: %d\n", FontName->getCString(), size, bold ? "bold " : "", italic ? 
"italic " : "", pos);}; }; +// Fonts accumulated class HtmlFontAccu{ private: std::vector *accu; diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 67765a1..5571d0d 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -28,6 +28,7 @@ // Copyright (C) 2010 Adrian Johnson // Copyright (C) 2010 Hib Eris // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in) +// Copyright (C) 2011 Joshua Richardson // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -44,6 +45,7 @@ #include #include #include +#include // for memcpy() #include #include #include "goo/GooString.h" @@ -61,11 +63,18 @@ #include "GlobalParams.h" #include "HtmlOutputDev.h" #include "HtmlFonts.h" +#include "popplerUtils.h" -#define DEBUG(stuff) if (debug) { std::cerr << __FILE__ << ": " << __LINE__ << ": DEBUG: " << stuff << std::endl; } +#define DEBUG __FILE__ << ": " << __LINE__ << ": DEBUG: " +#if 0 +#define DEBUG(stuff) if (debug) { std::cerr << __FILE__ << ": " << __LINE__ << ": DEBUG: " << stuff << std::endl; } else {} +#endif // returns true iff x is closer to y than x is to z #define IS_CLOSER(x,y,z) (fabs((x)-(y)) getCString(); +} + +static const char *print_uni_str(const Unicode *u, const unsigned uLen) { + GooString *gstr_buff1 = NULL; + + if (gstr_buff0) delete gstr_buff0; + + if (!uLen) return ""; + gstr_buff0 = GooString::format("{0:c}", (*u < 0x7F ? *u & 0xFF : '?')); + for (unsigned i = 1; i < uLen; i++) + if (u[i] < 0x7F) { + gstr_buff1 = gstr_buff0->append(u[i] < 0x7F ? 
static_cast(u[i]) & 0xFF : '?');
+      // append() returns the string it was called on; deleting it here was a use-after-free
+      gstr_buff0 = gstr_buff1;
+    }
+
+  return gstr_buff0->getCString();
+}
+
//------------------------------------------------------------------------ // HtmlString //------------------------------------------------------------------------ -HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts) { +HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* _fonts) : + fonts(_fonts) { GfxFont *font; double x, y; @@ -132,6 +169,21 @@ HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts) { GooString *name = state->getFont()->getName(); if (!name) name = HtmlFont::getDefaultFont(); //new GooString("default"); HtmlFont hfont=HtmlFont(name, static_cast(fontSize-1), rgb); + if (isMatRotOrSkew(state->getTextMat())) { + double normalizedMatrix[4]; + bcopy(state->getTextMat(), normalizedMatrix, sizeof(normalizedMatrix)); + // browser rotates the opposite way + // so flip the sign of the angle -> sin() components change sign + if (debug) + cerr << DEBUG << "before transform: " << print_matrix(normalizedMatrix) << endl; + normalizedMatrix[1] *= -1; normalizedMatrix[2] *= -1; + if (debug) + cerr << DEBUG << "after reflecting angle: " << print_matrix(normalizedMatrix) << endl; + normalizeRotMat(normalizedMatrix); + if (debug) + cerr << DEBUG << "after norm: " << print_matrix(normalizedMatrix) << endl; + hfont.setRotMat(normalizedMatrix); + } fontpos = fonts->AddFont(hfont); } else { // this means that the PDF file draws text without a current font, @@ -310,9 +362,27 @@ void HtmlPage::addChar(GfxState *state, double x, double y, // and is not too far away from it before adding //if ((UnicodeMap::getDirection(u[0]) != curStr->dir) || // XXX - if ( - (n > 0 && - fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin))) { + if (debug) { + double *text_mat = state->getTextMat(); + // rotation is (cos q, sin q, -sin q, cos q, 0, 0) + // sin q is zero iff there is 
no rotation, or 180 deg. rotation; + // for 180 rotation, cos q will be negative + if (text_mat[0] < 0 || !is_within(text_mat[1], .1, 0)) { + cerr << DEBUG << "rotation matrix for \"" << print_uni_str(u, uLen) << '"' << endl; + cerr << "text " << print_matrix(state->getTextMat()); + } + } + if (n > 0 && // don't start a new string, unless there is already a string + // TODO: the following line assumes that text is flowing left to + // right, which will not necessarily be the case, e.g. if rotated; + // It assesses whether or not two characters are close enough to + // be part of the same string + fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) && + // rotation is (cos q, sin q, -sin q, cos q, 0, 0) + // sin q is zero iff there is no rotation, or 180 deg. rotation; + // for 180 rotation, cos q will be negative + !rot_matrices_equal(curStr->getFont().getRotMat(), state->getTextMat()) + ) { endString(); beginString(state, NULL); } @@ -544,7 +614,7 @@ void HtmlPage::coalesce() { ) { // YES! Combine the strings! 
// printf("yes\n"); - DEBUG("coalescing \"" << *str1 << "\" and \"" << *str2 << "\""); + if (debug) cerr << DEBUG << "coalescing \"" << *str1 << "\" and \"" << *str2 << "\"" << endl; n = str1->len + str2->len; if ((addSpace = horSpace > 0.1 * space)) { ++n; @@ -1548,6 +1618,10 @@ GBool HtmlOutputDev::dumpDocOutline(Catalog* catalog) return done; } +void HtmlOutputDev::setDebug(GBool val) { + debug = val; +} + GBool HtmlOutputDev::newOutlineLevel(FILE *output, Object *node, Catalog* catalog, int level) { Object curr, next; diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h index e6f2270..47d274f 100644 --- a/utils/HtmlOutputDev.h +++ b/utils/HtmlOutputDev.h @@ -19,6 +19,7 @@ // Copyright (C) 2009, 2011 Carlos Garcia Campos // Copyright (C) 2009 Kovid Goyal // Copyright (C) 2010 Hib Eris +// Copyright (C) 2011 Joshua Richardson // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -84,6 +85,7 @@ public: double dx, double dy, Unicode u); HtmlLink* getLink() { return link; } + const HtmlFont &getFont() const { return *fonts->Get(fontpos); } void endString(); // postprocessing // Serialization @@ -103,6 +105,7 @@ private: int len; // length of text and xRight int size; // size of text and xRight arrays UnicodeTextDirection dir; // direction (left to right/right to left) + HtmlFontAccu *fonts; friend class HtmlPage; @@ -300,6 +303,7 @@ public: GBool dumpDocOutline(Catalog* catalog); + static void setDebug(GBool debugValue = gTrue); private: // convert encoding into a HTML standard, or encoding->getCString if not // recognized diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index b46bf1b..27478ba 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -77,6 +77,7 @@ GBool noframes=gFalse; GBool stout=gFalse; GBool xml=gFalse; static GBool errQuiet=gFalse; +static GBool debug=gFalse; static GBool noDrm=gFalse; GBool showHidden = gFalse; @@ -100,6 +101,8 
@@ static const ArgDesc argDesc[] = { "keep strings in content stream order"},*/ {"-q", argFlag, &errQuiet, 0, "don't print any messages or errors"}, + {"-d", argFlag, &debug, 0, + "display debugging information"}, {"-h", argFlag, &printHelp, 0, "print usage information"}, {"-help", argFlag, &printHelp, 0, @@ -368,6 +371,7 @@ int main(int argc, char *argv[]) { rawOrder = singleHtml; // write text file + HtmlOutputDev::setDebug(debug); htmlOut = new HtmlOutputDev(htmlFileName->getCString(), docTitle->getCString(), author ? author->getCString() : NULL, -- 1.7.4.1