diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 4247f51..b38af4d 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -94,10 +94,12 @@ extern GBool noframes; extern GBool stout; extern GBool xml; extern GBool showHidden; extern GBool noMerge; +extern double wordBreakThreshold; + static GBool debug = gFalse; static GooString *gstr_buff0 = NULL; // a workspace in which I format strings static GooString* basename(GooString* str){ @@ -349,11 +351,10 @@ void HtmlPage::conv(){ } } } - void HtmlPage::addChar(GfxState *state, double x, double y, double dx, double dy, double ox, double oy, Unicode *u, int uLen) { double x1, y1, w1, h1, dx2, dy2; int n, i; @@ -377,11 +378,11 @@ void HtmlPage::addChar(GfxState *state, double x, double y, if (n > 0 && // don't start a new string, unless there is already a string // TODO: the following line assumes that text is flowing left to // right, which will not necessarily be the case, e.g. if rotated; // It assesses whether or not two characters are close enough to // be part of the same string - fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) && + fabs(x1 - curStr->xRight[n-1]) > wordBreakThreshold * (curStr->yMax - curStr->yMin) && // rotation is (cos q, sin q, -sin q, cos q, 0, 0) // sin q is zero iff there is no rotation, or 180 deg. 
rotation; // for 180 rotation, cos q will be negative !rot_matrices_equal(curStr->getFont().getRotMat(), state->getTextMat())) { @@ -623,11 +624,11 @@ void HtmlPage::coalesce() { str1->dir == str2->dir // text direction the same ) { // printf("yes\n"); n = str1->len + str2->len; - if ((addSpace = horSpace > 0.1 * space)) { + if ((addSpace = horSpace > wordBreakThreshold * space)) { ++n; } if (addLineBreak) { ++n; } diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1 index 6763bbe..44137e4 100644 --- a/utils/pdftohtml.1 +++ b/utils/pdftohtml.1 @@ -82,10 +82,15 @@ If complex is selected, but neither \-fmt or \-dev are specified, .B \-nomerge do not merge paragraphs .TP .B \-nodrm override document DRM settings +.TP +.B \-wbt +adjust the word break threshold, in percent. Default is 10. +A word break occurs when the distance between two adjacent characters is +greater than this percentage of the character height. .SH AUTHOR Pdftohtml was developed by Gueorgui Ovtcharov and Rainer Dorsch. It is based and benefits a lot from Derek Noonburg's xpdf package. 
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 7347161..4b07989 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -80,10 +80,11 @@ static double scale=1.5; GBool noframes=gFalse; GBool stout=gFalse; GBool xml=gFalse; static GBool errQuiet=gFalse; static GBool noDrm=gFalse; +double wordBreakThreshold=10; // percent (default 10%); main() converts it into a coefficient (0.1) GBool showHidden = gFalse; GBool noMerge = gFalse; static char ownerPassword[33] = ""; static char userPassword[33] = ""; @@ -140,10 +141,12 @@ static const ArgDesc argDesc[] = { "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)"}, {"-nodrm", argFlag, &noDrm, 0, "override document DRM settings"}, + {"-wbt", argFP, &wordBreakThreshold, 0, + "word break threshold (default 10 percent)"}, {NULL} }; #ifdef HAVE_SPLASH class SplashOutputDevNoText : public SplashOutputDev { @@ -219,10 +222,13 @@ int main(int argc, char *argv[]) { if( !globalParams->getTextEncoding() ) { goto error; } } + // convert from user-friendly percents into a coefficient + wordBreakThreshold /= 100.0; + // open PDF file if (ownerPassword[0]) { ownerPW = new GooString(ownerPassword); } else { ownerPW = NULL;