Reflow paragraphs when converting to html. From: Erik Hovland Based on the patch posted here: http://lists.freedesktop.org/archives/poppler/2008-September/004126.html It adds the capability to reflow paragraphs when converting to html. --- utils/HtmlFonts.cc | 3 + utils/HtmlOutputDev.cc | 157 ++++++++++++++++++++++++++---------------------- utils/pdftohtml.1 | 5 ++ utils/pdftohtml.cc | 6 +- 4 files changed, 97 insertions(+), 74 deletions(-) diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc index d2cbfd5..8570fe2 100644 --- a/utils/HtmlFonts.cc +++ b/utils/HtmlFonts.cc @@ -57,6 +57,7 @@ static Fonts fonts[font_num+1]={ #define xoutRound(x) ((int)(x + 0.5)) extern GBool xml; +extern GBool reFlow; GooString* HtmlFont::DefaultFont=new GooString("Times"); // Arial,Helvetica,sans-serif @@ -234,7 +235,7 @@ GooString* HtmlFont::HtmlFilter(Unicode* u, int uLen) { case '&': tmp->append("&"); break; case '<': tmp->append("<"); break; case '>': tmp->append(">"); break; - case ' ': tmp->append( !xml && ( i+1 >= uLen || !tmp->getLength() || tmp->getChar( tmp->getLength()-1 ) == ' ' ) ? " " : " " ); + case ' ': tmp->append( !xml && !reFlow && ( i+1 >= uLen || !tmp->getLength() || tmp->getChar( tmp->getLength()-1 ) == ' ' ) ? 
" " : " " ); break; default: { diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 008090b..78439bf 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -72,6 +72,7 @@ extern GBool stout; extern GBool xml; extern GBool showHidden; extern GBool noMerge; +extern GBool reFlow; static GooString* basename(GooString* str){ @@ -401,9 +402,9 @@ void HtmlPage::coalesce() { HtmlString *str1, *str2; HtmlFont *hfont1, *hfont2; double space, horSpace, vertSpace; - GBool addSpace, addLineBreak; + GBool addSpace, addLineBreak, nextLine; int n, i; - double curX, curY; + double curX, curY, lineStartX = 0.0; #if 0 //~ for debugging for (str1 = yxStrings; str1; str1 = str1->yxNext) { @@ -471,53 +472,78 @@ void HtmlPage::coalesce() { str1->htext->insert(0, ls); delete ls; } - curX = str1->xMin; curY = str1->yMin; + curX = str1->xMin; curY = str1->yMin; lineStartX = curX; while (str1 && (str2 = str1->yxNext)) { hfont2 = getFont(str2); space = str1->yMax - str1->yMin; horSpace = str2->xMin - str1->xMax; - addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4); + addLineBreak = (nextLine = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4)); vertSpace = str2->yMin - str1->yMax; //printf("coalesce %d %d %f? ", str1->dir, str2->dir, d); - if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) + // Heuristic: If the last character in str1 is a hyphen then + // turn off addLineBreak. This will re-merge hyphenated words + // that have been split over multiple lines. 
+ if (reFlow && str1->text[str1->len - 1] == '-') { - vertOverlap = str1->yMax - str2->yMin; - } else - if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax) - { - vertOverlap = str2->yMax - str1->yMin; - } else - { - vertOverlap = 0; - } - - if ( - ( - ( - ( - (rawOrder && vertOverlap > 0.5 * space) - || - (!rawOrder && str2->yMin < str1->yMax) - ) && - (horSpace > -0.5 * space && horSpace < space) - ) || - (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak) - ) && - (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter - str1->dir == str2->dir // text direction the same - ) + addLineBreak = false; + str1->len--; + str1->htext->del(str1->htext->getLength() -1, 1); + } + + // Is str2 a new paragraph? + if (nextLine + && + ((str2->xMin > lineStartX + 3.0) + || (vertSpace > 0.5 * space) + || xml + ) + ) { -// printf("yes\n"); +// printf("new paragraph\n"); + GBool finishA = str1->getLink() != NULL; + GBool finishBold = hfont1->isBold(); + GBool finishItalic = hfont1->isItalic(); + CloseTags( str1->htext, finishA, finishItalic, finishBold ); + + str1->xMin = curX; str1->yMin = curY; + str1 = str2; + curX = str1->xMin; curY = str1->yMin; + lineStartX= str1->xMin; + hfont1 = hfont2; + if( hfont1->isBold() ) + str1->htext->insert(0,"",3); + if( hfont1->isItalic() ) + str1->htext->insert(0,"",3); + if( str1->getLink() != NULL ) { + GooString *ls = str1->getLink()->getLinkStart(); + str1->htext->insert(0, ls); + delete ls; + } + } else { +// printf("same paragraph\n"); + n = str1->len + str2->len; - if ((addSpace = horSpace > 0.1 * space)) { - ++n; + if ((addSpace = + // if reflow is on then if horSpace is positive do regular check + (reFlow && ((horSpace > 0.0 && horSpace > 0.1 * space) + || + // but if horSpace is negative, check the Y directions + (horSpace < 0.0 && str1->yMax < str2->yMax) + ) + ) + || + // otherwise reflow is off so do the usual check + horSpace > 0.1 * space 
+ ) + ) + { + ++n; } - if (addLineBreak) { + if (nextLine) ++n; - } str1->size = (n + 15) & ~15; str1->text = (Unicode *)grealloc(str1->text, @@ -525,19 +551,23 @@ void HtmlPage::coalesce() { str1->xRight = (double *)grealloc(str1->xRight, str1->size * sizeof(double)); if (addSpace) { - str1->text[str1->len] = 0x20; - str1->htext->append(xml?" ":" "); - str1->xRight[str1->len] = str2->xMin; - ++str1->len; + str1->text[str1->len] = 0x20; + str1->htext->append((xml || reFlow) ? " " : " "); + str1->xRight[str1->len] = str2->xMin; + ++str1->len; } - if (addLineBreak) { - str1->text[str1->len] = '\n'; - str1->htext->append("
"); - str1->xRight[str1->len] = str2->xMin; - ++str1->len; + + if (nextLine) { + if (addLineBreak) { + str1->text[str1->len] = '\n'; + str1->htext->append(reFlow ? "\n" : "
"); + str1->xRight[str1->len] = str2->xMin; + ++str1->len; + } str1->yMin = str2->yMin; str1->yMax = str2->yMax; str1->xMax = str2->xMax; + lineStartX = str2->xMin; int fontLineSize = hfont1->getLineSize(); int curLineSize = (int)(vertSpace + space); if( curLineSize != fontLineSize ) @@ -589,26 +619,6 @@ void HtmlPage::coalesce() { } str1->yxNext = str2->yxNext; delete str2; - } else { // keep strings separate -// printf("no\n"); - GBool finish_a = str1->getLink() != NULL; - GBool finish_bold = hfont1->isBold(); - GBool finish_italic = hfont1->isItalic(); - CloseTags( str1->htext, finish_a, finish_italic, finish_bold ); - - str1->xMin = curX; str1->yMin = curY; - str1 = str2; - curX = str1->xMin; curY = str1->yMin; - hfont1 = hfont2; - if( hfont1->isBold() ) - str1->htext->insert(0,"",3); - if( hfont1->isItalic() ) - str1->htext->insert(0,"",3); - if( str1->getLink() != NULL ) { - GooString *ls = str1->getLink()->getLinkStart(); - str1->htext->insert(0, ls); - delete ls; - } } } str1->xMin = curX; str1->yMin = curY; @@ -777,13 +787,18 @@ void HtmlPage::dump(FILE *f, int pageNum) GooString* str; for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){ if (tmp->htext){ - str=new GooString(tmp->htext); - fputs(str->getCString(),f); - delete str; - fputs("
\n",f); + str=new GooString(tmp->htext); + if (reFlow) + fputs("

\n", f); + fputs(str->getCString(),f); + delete str; + if (reFlow) + fputs("

\n", f); + else + fputs("
\n", f); } } - fputs("
\n",f); + if (!reFlow) fputs("
\n",f); } } @@ -1593,7 +1608,7 @@ GBool HtmlOutputDev::dumpDocOutline(Catalog* catalog) if (noframes) { output = page; - fputs("
\n", output); + if (!reFlow) fputs("
\n", output); } else { @@ -1610,7 +1625,7 @@ GBool HtmlOutputDev::dumpDocOutline(Catalog* catalog) GBool done = newOutlineLevel(output, outlines, catalog); if (done && !complexMode) - fputs("
\n", output); + if (!reFlow) fputs("
\n", output); if (bClose) { diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1 index 850aa84..0d30504 100644 --- a/utils/pdftohtml.1 +++ b/utils/pdftohtml.1 @@ -52,6 +52,11 @@ use standard output .B \-zoom zoom the pdf document (default 1.5) .TP +.B \-reflow +join paragraph lines together and separate paragraphs with a

<p> tag. With +this flag off, paragraph lines are separated by <br> tags and paragraphs are +also separated by <br>
tags. +.TP .B \-xml output for XML post-processing .TP diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 41312de..8be7974 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -69,6 +69,7 @@ static GBool errQuiet=gFalse; static GBool noDrm=gFalse; GBool showHidden = gFalse; +GBool reFlow = gFalse; GBool noMerge = gFalse; static char ownerPassword[33] = ""; static char userPassword[33] = ""; @@ -107,12 +108,14 @@ static const ArgDesc argDesc[] = { "zoom the pdf document (default 1.5)"}, {"-xml", argFlag, &xml, 0, "output for XML post-processing"}, + {"-reflow", argFlag, &reFlow, 0, + "output reflow paragraphs"}, {"-hidden", argFlag, &showHidden, 0, "output hidden text"}, {"-nomerge", argFlag, &noMerge, 0, "do not merge paragraphs"}, {"-enc", argString, textEncName, sizeof(textEncName), - "output text encoding name"}, + "output text encoding name (UTF-8, Latin1 etc"}, {"-dev", argString, gsDevice, sizeof(gsDevice), "output device name for Ghostscript (png16m, jpeg etc)"}, {"-v", argFlag, &printVersion, 0, @@ -250,7 +253,6 @@ int main(int argc, char *argv[]) { { complexMode = gTrue; noframes = gTrue; - noMerge = gTrue; } // get page range