# Patch summary (descriptive header; text before the first "diff" line is
# commentary and is not part of any hunk).
#
# Unified diff from poppler-0.8.4.orig to poppler-0.8.4.simx touching four files:
#
#  * poppler/Catalog.cc
#      Catalog::findDest() returns NULL immediately when `name` is null.
#
#  * utils/HtmlFonts.cc
#      HtmlFont::HtmlFilter() gains a `case ' '` that appends one of two
#      literals.  The first is chosen when `xml` is false and the space is the
#      last input character (i+1 >= uLen), or the output is still empty, or
#      the previous output character is already a space.
#      NOTE(review): both literals read as " " in this copy; presumably one of
#      them was an HTML entity that got lost -- confirm against the original.
#
#  * utils/HtmlOutputDev.cc
#      - HtmlString constructor: the font ascent is capped at 1.05 and the
#        descent is floored at -0.4 before yMin/yMax are computed.
#      - New file-static helper strrstr(s, ss): returns a pointer to the last
#        occurrence of `ss` in `s`, or NULL when strstr() finds none.
#      - HtmlPage::coalesce(): link start/end markup is only emitted when
#        `xml` is false; the separator appended when two strings are merged
#        depends on `xml`; closing of bold/italic runs is reworked around the
#        finish_bold / finish_italic flags, with a comparison of two strrstr()
#        results deciding which closing literal is appended first; bold is
#        re-opened when it was closed and the following font is bold.
#      NOTE(review): the literals passed to insert()/append() and to strrstr()
#      read as "" here while the explicit lengths are 3 and 4 -- presumably
#      markup tags were stripped from this copy; verify.  As written, an empty
#      needle would make the loop in strrstr() never terminate, because
#      strstr() returns its first argument for an empty needle.
#      NOTE(review): the two strrstr() results are compared with `>` and either
#      may be NULL -- confirm that this ordering is what is intended.
#
#  * utils/pdftohtml.cc
#      - New helper MinMax(min_v, v, max_v): clamps v into [min_v, max_v].
#      - getInfoDate(): every field parsed by sscanf is clamped before being
#        stored in tmStruct: year-1900 to [0,200], mon-1 to [0,11],
#        day to [1,31], hour to [0,23], min to [0,59], sec to [0,59].
#
# NOTE(review): in this copy the diff's line breaks appear to be collapsed
# (file headers and hunk headers sit in the middle of long lines); it will
# presumably need its original line structure restored before it can be applied.
#
diff -urp poppler-0.8.4.orig/poppler/Catalog.cc poppler-0.8.4.simx/poppler/Catalog.cc --- poppler-0.8.4.orig/poppler/Catalog.cc 2008-03-26 15:38:52.000000000 -0400 +++ poppler-0.8.4.simx/poppler/Catalog.cc 2008-07-02 23:07:01.000000000 -0400 @@ -338,6 +338,7 @@ int Catalog::findPage(int num, int gen) } LinkDest *Catalog::findDest(GooString *name) { + if( !name ) return NULL; LinkDest *dest; Object obj1, obj2; GBool found; diff -urp poppler-0.8.4.orig/utils/HtmlFonts.cc poppler-0.8.4.simx/utils/HtmlFonts.cc --- poppler-0.8.4.orig/utils/HtmlFonts.cc 2008-03-26 15:38:52.000000000 -0400 +++ poppler-0.8.4.simx/utils/HtmlFonts.cc 2008-07-02 23:07:01.000000000 -0400 @@ -205,6 +205,8 @@ GooString* HtmlFont::HtmlFilter(Unicode* case '&': tmp->append("&"); break; case '<': tmp->append("<"); break; case '>': tmp->append(">"); break; + case ' ': tmp->append( !xml && ( i+1 >= uLen || !tmp->getLength() || tmp->getChar( tmp->getLength()-1 ) == ' ' ) ? " " : " " ); + break; default: { // convert unicode to string diff -urp poppler-0.8.4.orig/utils/HtmlOutputDev.cc poppler-0.8.4.simx/utils/HtmlOutputDev.cc --- poppler-0.8.4.orig/utils/HtmlOutputDev.cc 2008-03-26 15:38:52.000000000 -0400 +++ poppler-0.8.4.simx/utils/HtmlOutputDev.cc 2008-07-02 23:07:01.000000000 -0400 @@ -81,8 +81,18 @@ HtmlString::HtmlString(GfxState *state, state->transform(state->getCurX(), state->getCurY(), &x, &y); if ((font = state->getFont())) { - yMin = y - font->getAscent() * fontSize; - yMax = y - font->getDescent() * fontSize; + double ascent = font->getAscent(); + double descent = font->getDescent(); + if( ascent > 1.05 ){ + //printf( "ascent=%.15g is too high, descent=%.15g\n", ascent, descent ); + ascent = 1.05; + } + if( descent < -0.4 ){ + //printf( "descent %.15g is too low, ascent=%.15g\n", descent, ascent ); + descent = -0.4; + } + yMin = y - ascent * fontSize; + yMax = y - descent * fontSize; GfxRGB rgb; state->getFillRGB(&rgb); GooString *name = state->getFont()->getName(); @@ -333,6 +343,15 @@ 
void HtmlPage::endString() { curStr = NULL; } +static const char *strrstr( const char *s, const char *ss ) +{ + const char *p = strstr( s, ss ); + for( const char *pp = p; pp != NULL; pp = strstr( p+1, ss ) ){ + p = pp; + } + return p; +} + void HtmlPage::coalesce() { HtmlString *str1, *str2; HtmlFont *hfont1, *hfont2; @@ -402,7 +421,7 @@ void HtmlPage::coalesce() { str1->htext->insert(0,"",3); if( hfont1->isItalic() ) str1->htext->insert(0,"",3); - if( str1->getLink() != NULL ) { + if( str1->getLink() != NULL && !xml ) { GooString *ls = str1->getLink()->getLinkStart(); str1->htext->insert(0, ls); delete ls; @@ -462,7 +481,7 @@ void HtmlPage::coalesce() { str1->size * sizeof(double)); if (addSpace) { str1->text[str1->len] = 0x20; - str1->htext->append(" "); + str1->htext->append(xml?" ":" "); str1->xRight[str1->len] = str2->xMin; ++str1->len; } @@ -495,19 +514,25 @@ void HtmlPage::coalesce() { } /* fix and if str1 and str2 differ */ - if( hfont1->isBold() && !hfont2->isBold() ) - str1->htext->append("", 4); - if( hfont1->isItalic() && !hfont2->isItalic() ) - str1->htext->append("", 4); - if( !hfont1->isBold() && hfont2->isBold() ) - str1->htext->append("", 3); + bool finish_italic = hfont1->isItalic() && !hfont2->isItalic(); + bool finish_bold = hfont1->isBold() && ( !hfont2->isBold() || finish_italic ); + if( finish_bold && finish_italic && strrstr( str1->htext->getCString(), "" ) > strrstr( str1->htext->getCString(), "" ) ){ + str1->htext->append("", 4); + finish_italic = false; + } + if( finish_bold ) + str1->htext->append("", 4); + if( finish_italic ) + str1->htext->append("", 4); if( !hfont1->isItalic() && hfont2->isItalic() ) str1->htext->append("", 3); + if( ( !hfont1->isBold() || finish_bold ) && hfont2->isBold() ) + str1->htext->append("", 3); /* now handle switch of links */ HtmlLink *hlink1 = str1->getLink(); HtmlLink *hlink2 = str2->getLink(); - if( !hlink1 || !hlink2 || !hlink1->isEqualDest(*hlink2) ) { + if( !xml && ( !hlink1 || !hlink2 || 
!hlink1->isEqualDest(*hlink2) ) ) { if(hlink1 != NULL ) str1->htext->append(""); if(hlink2 != NULL ) { @@ -531,11 +556,17 @@ void HtmlPage::coalesce() { delete str2; } else { // keep strings separate // printf("no\n"); - if( hfont1->isBold() ) - str1->htext->append("",4); - if( hfont1->isItalic() ) - str1->htext->append("",4); - if(str1->getLink() != NULL ) + GBool finish_bold = hfont1->isBold(); + GBool finish_italic = hfont1->isItalic(); + if( finish_bold && finish_italic && strrstr( str1->htext->getCString(), "" ) > strrstr( str1->htext->getCString(), "" ) ){ + str1->htext->append("", 4); + finish_italic = false; + } + if( finish_bold ) + str1->htext->append("", 4); + if( finish_italic ) + str1->htext->append("", 4); + if(str1->getLink() != NULL && !xml ) str1->htext->append(""); str1->xMin = curX; str1->yMin = curY; @@ -546,7 +577,7 @@ void HtmlPage::coalesce() { str1->htext->insert(0,"",3); if( hfont1->isItalic() ) str1->htext->insert(0,"",3); - if( str1->getLink() != NULL ) { + if( str1->getLink() != NULL && !xml ) { GooString *ls = str1->getLink()->getLinkStart(); str1->htext->insert(0, ls); delete ls; @@ -554,11 +585,17 @@ void HtmlPage::coalesce() { } } str1->xMin = curX; str1->yMin = curY; - if( hfont1->isBold() ) - str1->htext->append("",4); - if( hfont1->isItalic() ) - str1->htext->append("",4); - if(str1->getLink() != NULL ) + GBool finish_bold = hfont1->isBold(); + GBool finish_italic = hfont1->isItalic(); + if( finish_bold && finish_italic && strrstr( str1->htext->getCString(), "" ) > strrstr( str1->htext->getCString(), "" ) ){ + str1->htext->append("", 4); + finish_italic = false; + } + if( finish_bold ) + str1->htext->append("", 4); + if( finish_italic ) + str1->htext->append("", 4); + if(str1->getLink() != NULL && !xml ) str1->htext->append(""); #if 0 //~ for debugging diff -urp poppler-0.8.4.orig/utils/pdftohtml.cc poppler-0.8.4.simx/utils/pdftohtml.cc --- poppler-0.8.4.orig/utils/pdftohtml.cc 2008-04-29 16:42:05.000000000 -0400 +++ 
poppler-0.8.4.simx/utils/pdftohtml.cc 2008-07-02 23:07:01.000000000 -0400 @@ -388,6 +388,13 @@ static GooString* getInfoString(Dict *in return s1; } +static inline int MinMax( int min_v, int v, int max_v ) +{ + if( v < min_v ) v = min_v; + else if( v > max_v ) v = max_v; + return v; +} + static GooString* getInfoDate(Dict *infoDict, char *key) { Object obj; char *s; @@ -403,12 +410,12 @@ static GooString* getInfoDate(Dict *info } if (sscanf(s, "%4d%2d%2d%2d%2d%2d", &year, &mon, &day, &hour, &min, &sec) == 6) { - tmStruct.tm_year = year - 1900; - tmStruct.tm_mon = mon - 1; - tmStruct.tm_mday = day; - tmStruct.tm_hour = hour; - tmStruct.tm_min = min; - tmStruct.tm_sec = sec; + tmStruct.tm_year = MinMax( 0, year - 1900, 200 ); + tmStruct.tm_mon = MinMax( 0, mon - 1, 11 ); + tmStruct.tm_mday = MinMax( 1, day, 31 ); + tmStruct.tm_hour = MinMax( 0, hour, 23 ); + tmStruct.tm_min = MinMax( 0, min, 59 ); + tmStruct.tm_sec = MinMax( 0, sec, 59 ); tmStruct.tm_wday = -1; tmStruct.tm_yday = -1; tmStruct.tm_isdst = -1;