From 3d53024aa95d5a439b46768548420b45aec33e65 Mon Sep 17 00:00:00 2001 From: Joshua Richardson Date: Mon, 6 Jun 2011 22:17:43 -0700 Subject: [PATCH 1/2] Bug https://bugs.freedesktop.org/show_bug.cgi?id=38019 Fix vertical spacing issues in pdftohtml output. --- utils/HtmlFonts.cc | 2 +- utils/HtmlOutputDev.cc | 41 +++++++++++++++++++++++++++++++++++------ utils/HtmlOutputDev.h | 4 ++++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc index e2839e3..4ada32b 100644 --- a/utils/HtmlFonts.cc +++ b/utils/HtmlFonts.cc @@ -331,7 +331,7 @@ GooString* HtmlFontAccu::CSStyle(int i, int j){ tmp->append(iStr); tmp->append("{font-size:"); tmp->append(Size); - if( font.getLineSize() != -1 ) + if( font.getLineSize() != -1 && font.getLineSize() != 0 ) { lSize = GooString::fromInt(font.getLineSize()); tmp->append("px;line-height:"); diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 0c707da..01662a1 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include "goo/GooString.h" #include "goo/GooList.h" #include "UnicodeMap.h" @@ -61,6 +63,10 @@ #include "HtmlOutputDev.h" #include "HtmlFonts.h" +#define DEBUG(stuff) (std::cerr << __FILE__ << ": " << __LINE__ << ": DEBUG: " << stuff << std::endl ) +// returns true iff x is closer to y than x is to z +#define IS_CLOSER(x,y,z) (fabs((x)-(y)) getCString(); + + return o; +} + //------------------------------------------------------------------------ // HtmlPage //------------------------------------------------------------------------ @@ -400,6 +412,9 @@ static void CloseTags( GooString *htext, GBool &finish_a, GBool &finish_italic, htext->append(""); } +// Strings are lines of text; +// This function aims to combine strings into lines and paragraphs if !noMerge +// It may also strip out duplicate strings (if they are on top of each other); sometimes they are to create a font effect void 
HtmlPage::coalesce() { HtmlString *str1, *str2; HtmlFont *hfont1, *hfont2; @@ -408,7 +423,7 @@ void HtmlPage::coalesce() { int n, i; double curX, curY; -#if 0 //~ for debugging +#if 1 //~ for debugging for (str1 = yxStrings; str1; str1 = str1->yxNext) { printf("x=%f..%f y=%f..%f size=%2d '", str1->xMin, str1->xMax, str1->yMin, str1->yMax, @@ -416,7 +431,10 @@ void HtmlPage::coalesce() { for (i = 0; i < str1->len; ++i) { fputc(str1->text[i] & 0xff, stdout); } - printf("'\n"); + printf("'"); + for (i = 0; i < str1->len; ++i) + printf("%u.", str1->text[i]); + printf("\n"); } printf("\n------------------------------------------------------------\n\n"); #endif @@ -478,9 +496,10 @@ void HtmlPage::coalesce() { while (str1 && (str2 = str1->yxNext)) { hfont2 = getFont(str2); - space = str1->yMax - str1->yMin; + space = str1->yMax - str1->yMin; // the height of the font's bounding box horSpace = str2->xMin - str1->xMax; - addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4); + // if strings line up on left-hand side AND they are on subsequent lines, we need a line break + addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4) && IS_CLOSER(str2->yMax, str1->yMax + space, str1->yMax); vertSpace = str2->yMin - str1->yMax; //printf("coalesce %d %d %f? ", str1->dir, str2->dir, d); @@ -497,6 +516,15 @@ void HtmlPage::coalesce() { vertOverlap = 0; } + // Combine strings if: + // They appear to be the same font (complex mode only) && going in the same direction AND at least one of the following: + // 1. They appear to be part of the same line of text + // 2. 
They appear to be subsequent lines of a paragraph + // We assume (1) or (2) above, respectively, based on: + // (1) strings overlap vertically AND + // horizontal space between end of str1 and start of str2 is consistent with a single space or less; + // when rawOrder, the strings have to overlap vertically by at least 50% + // (2) Strings flow down the page, but the space between them is not too great, and they are lined up on the left if ( ( ( @@ -512,8 +540,9 @@ void HtmlPage::coalesce() { (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter str1->dir == str2->dir // text direction the same ) - { + { // YES! Combine the strings! // printf("yes\n"); + DEBUG("coalescing \"" << *str1 << "\" and \"" << *str2 << "\""); n = str1->len + str2->len; if ((addSpace = horSpace > 0.1 * space)) { ++n; @@ -535,7 +564,7 @@ void HtmlPage::coalesce() { } if (addLineBreak) { str1->text[str1->len] = '\n'; - str1->htext->append("
<br>"); + str1->htext->append("<br/>
"); str1->xRight[str1->len] = str2->xMin; ++str1->len; str1->yMin = str2->yMin; diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h index c268ce7..e6f2270 100644 --- a/utils/HtmlOutputDev.h +++ b/utils/HtmlOutputDev.h @@ -33,6 +33,7 @@ #endif #include +#include #include "goo/gtypes.h" #include "goo/GooList.h" #include "GfxFont.h" @@ -85,6 +86,8 @@ public: HtmlLink* getLink() { return link; } void endString(); // postprocessing + // Serialization + friend std::ostream &operator << (std::ostream &o, const HtmlString &hstr); private: // aender die text variable HtmlLink *link; @@ -156,6 +159,7 @@ public: void clear(); void conv(); + private: HtmlFont* getFont(HtmlString *hStr) { return fonts->Get(hStr->fontpos); } -- 1.7.4.1