diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 4247f51..b38af4d 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -94,10 +94,12 @@ extern GBool noframes;
extern GBool stout;
extern GBool xml;
extern GBool showHidden;
extern GBool noMerge;
+extern double wordBreakThreshold; // a coefficient, not a percent: main() in pdftohtml.cc divides the -wbt value by 100
+
static GBool debug = gFalse;
static GooString *gstr_buff0 = NULL; // a workspace in which I format strings
static GooString* basename(GooString* str){
@@ -349,11 +351,10 @@ void HtmlPage::conv(){
}
}
}
-
void HtmlPage::addChar(GfxState *state, double x, double y,
double dx, double dy,
double ox, double oy, Unicode *u, int uLen) {
double x1, y1, w1, h1, dx2, dy2;
int n, i;
@@ -377,11 +378,11 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
if (n > 0 && // don't start a new string, unless there is already a string
// TODO: the following line assumes that text is flowing left to
// right, which will not necessarily be the case, e.g. if rotated;
// It assesses whether or not two characters are close enough to
// be part of the same string
- fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) &&
+ fabs(x1 - curStr->xRight[n-1]) > wordBreakThreshold * (curStr->yMax - curStr->yMin) &&
// rotation is (cos q, sin q, -sin q, cos q, 0, 0)
// sin q is zero iff there is no rotation, or 180 deg. rotation;
// for 180 rotation, cos q will be negative
!rot_matrices_equal(curStr->getFont().getRotMat(), state->getTextMat()))
{
@@ -623,11 +624,11 @@ void HtmlPage::coalesce() {
str1->dir == str2->dir // text direction the same
)
{
// printf("yes\n");
n = str1->len + str2->len;
- if ((addSpace = horSpace > 0.1 * space)) {
+ if ((addSpace = horSpace > wordBreakThreshold * space)) {
++n;
}
if (addLineBreak) {
++n;
}
diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1
index 6763bbe..44137e4 100644
--- a/utils/pdftohtml.1
+++ b/utils/pdftohtml.1
@@ -82,10 +82,15 @@ If complex is selected, but neither \-fmt or \-dev are specified,
.B \-nomerge
do not merge paragraphs
.TP
.B \-nodrm
override document DRM settings
+.TP
+.B \-wbt <fp>
+adjust the word break threshold, given as a percentage. Default is 10.
+A word break occurs when the distance between two adjacent characters is
+greater than this percentage of the character height.
.SH AUTHOR
Pdftohtml was developed by Gueorgui Ovtcharov and Rainer Dorsch. It is
based and benefits a lot from Derek Noonburg's xpdf package.
diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc
index 7347161..4b07989 100644
--- a/utils/pdftohtml.cc
+++ b/utils/pdftohtml.cc
@@ -80,10 +80,11 @@ static double scale=1.5;
GBool noframes=gFalse;
GBool stout=gFalse;
GBool xml=gFalse;
static GBool errQuiet=gFalse;
static GBool noDrm=gFalse;
+double wordBreakThreshold=10; // -wbt value in percent (default 10%); main() divides it by 100 to get the 0.1 coefficient
GBool showHidden = gFalse;
GBool noMerge = gFalse;
static char ownerPassword[33] = "";
static char userPassword[33] = "";
@@ -140,10 +141,12 @@ static const ArgDesc argDesc[] = {
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-nodrm", argFlag, &noDrm, 0,
"override document DRM settings"},
+ {"-wbt", argFP, &wordBreakThreshold, 0,
+ "word break threshold (default 10 percent)"},
{NULL}
};
#ifdef HAVE_SPLASH
class SplashOutputDevNoText : public SplashOutputDev {
@@ -219,10 +222,13 @@ int main(int argc, char *argv[]) {
if( !globalParams->getTextEncoding() ) {
goto error;
}
}
+ // convert the user-supplied percentage into a coefficient (e.g. 10 -> 0.1)
+ wordBreakThreshold /= 100.0;
+
// open PDF file
if (ownerPassword[0]) {
ownerPW = new GooString(ownerPassword);
} else {
ownerPW = NULL;