diff -urp poppler-0.8.4.orig/poppler/Catalog.cc poppler-0.8.4.simx/poppler/Catalog.cc
--- poppler-0.8.4.orig/poppler/Catalog.cc 2008-03-26 15:38:52.000000000 -0400
+++ poppler-0.8.4.simx/poppler/Catalog.cc 2008-07-02 23:07:01.000000000 -0400
@@ -338,6 +338,7 @@ int Catalog::findPage(int num, int gen)
}
LinkDest *Catalog::findDest(GooString *name) {
+ if( !name ) return NULL;
LinkDest *dest;
Object obj1, obj2;
GBool found;
diff -urp poppler-0.8.4.orig/utils/HtmlFonts.cc poppler-0.8.4.simx/utils/HtmlFonts.cc
--- poppler-0.8.4.orig/utils/HtmlFonts.cc 2008-03-26 15:38:52.000000000 -0400
+++ poppler-0.8.4.simx/utils/HtmlFonts.cc 2008-07-02 23:07:01.000000000 -0400
@@ -205,6 +205,8 @@ GooString* HtmlFont::HtmlFilter(Unicode*
case '&': tmp->append("&"); break;
case '<': tmp->append("<"); break;
case '>': tmp->append(">"); break;
+ case ' ': tmp->append( !xml && ( i+1 >= uLen || !tmp->getLength() || tmp->getChar( tmp->getLength()-1 ) == ' ' ) ? " " : " " );
+ break;
default:
{
// convert unicode to string
diff -urp poppler-0.8.4.orig/utils/HtmlOutputDev.cc poppler-0.8.4.simx/utils/HtmlOutputDev.cc
--- poppler-0.8.4.orig/utils/HtmlOutputDev.cc 2008-03-26 15:38:52.000000000 -0400
+++ poppler-0.8.4.simx/utils/HtmlOutputDev.cc 2008-07-02 23:07:01.000000000 -0400
@@ -81,8 +81,18 @@ HtmlString::HtmlString(GfxState *state,
state->transform(state->getCurX(), state->getCurY(), &x, &y);
if ((font = state->getFont())) {
- yMin = y - font->getAscent() * fontSize;
- yMax = y - font->getDescent() * fontSize;
+ double ascent = font->getAscent();
+ double descent = font->getDescent();
+ if( ascent > 1.05 ){
+ //printf( "ascent=%.15g is too high, descent=%.15g\n", ascent, descent );
+ ascent = 1.05;
+ }
+ if( descent < -0.4 ){
+ //printf( "descent %.15g is too low, ascent=%.15g\n", descent, ascent );
+ descent = -0.4;
+ }
+ yMin = y - ascent * fontSize;
+ yMax = y - descent * fontSize;
GfxRGB rgb;
state->getFillRGB(&rgb);
GooString *name = state->getFont()->getName();
@@ -333,6 +343,15 @@ void HtmlPage::endString() {
curStr = NULL;
}
+// Return a pointer to the last occurrence of ss in s, or NULL when ss does not occur.
+static const char *strrstr( const char *s, const char *ss )
+{
+  if( !*ss ) return s + strlen( s ); // empty needle matches everywhere; stepping p+1 would run past the NUL
+  const char *last = NULL;
+  for( const char *p = strstr( s, ss ); p != NULL; p = strstr( p+1, ss ) ) last = p;
+  return last;
+}
+
void HtmlPage::coalesce() {
HtmlString *str1, *str2;
HtmlFont *hfont1, *hfont2;
@@ -402,7 +421,7 @@ void HtmlPage::coalesce() {
str1->htext->insert(0,"",3);
if( hfont1->isItalic() )
str1->htext->insert(0,"",3);
- if( str1->getLink() != NULL ) {
+ if( str1->getLink() != NULL && !xml ) {
GooString *ls = str1->getLink()->getLinkStart();
str1->htext->insert(0, ls);
delete ls;
@@ -462,7 +481,7 @@ void HtmlPage::coalesce() {
str1->size * sizeof(double));
if (addSpace) {
str1->text[str1->len] = 0x20;
- str1->htext->append(" ");
+ str1->htext->append(xml?" ":" ");
str1->xRight[str1->len] = str2->xMin;
++str1->len;
}
@@ -495,19 +514,25 @@ void HtmlPage::coalesce() {
}
/* fix and if str1 and str2 differ */
- if( hfont1->isBold() && !hfont2->isBold() )
- str1->htext->append("", 4);
- if( hfont1->isItalic() && !hfont2->isItalic() )
- str1->htext->append("", 4);
- if( !hfont1->isBold() && hfont2->isBold() )
- str1->htext->append("", 3);
+ bool finish_italic = hfont1->isItalic() && !hfont2->isItalic();
+ bool finish_bold = hfont1->isBold() && ( !hfont2->isBold() || finish_italic );
+ if( finish_bold && finish_italic && strrstr( str1->htext->getCString(), "" ) > strrstr( str1->htext->getCString(), "" ) ){
+ str1->htext->append("", 4);
+ finish_italic = false;
+ }
+ if( finish_bold )
+ str1->htext->append("", 4);
+ if( finish_italic )
+ str1->htext->append("", 4);
if( !hfont1->isItalic() && hfont2->isItalic() )
str1->htext->append("", 3);
+ if( ( !hfont1->isBold() || finish_bold ) && hfont2->isBold() )
+ str1->htext->append("", 3);
/* now handle switch of links */
HtmlLink *hlink1 = str1->getLink();
HtmlLink *hlink2 = str2->getLink();
- if( !hlink1 || !hlink2 || !hlink1->isEqualDest(*hlink2) ) {
+ if( !xml && ( !hlink1 || !hlink2 || !hlink1->isEqualDest(*hlink2) ) ) {
if(hlink1 != NULL )
str1->htext->append("");
if(hlink2 != NULL ) {
@@ -531,11 +556,17 @@ void HtmlPage::coalesce() {
delete str2;
} else { // keep strings separate
// printf("no\n");
- if( hfont1->isBold() )
- str1->htext->append("",4);
- if( hfont1->isItalic() )
- str1->htext->append("",4);
- if(str1->getLink() != NULL )
+ GBool finish_bold = hfont1->isBold();
+ GBool finish_italic = hfont1->isItalic();
+ if( finish_bold && finish_italic && strrstr( str1->htext->getCString(), "" ) > strrstr( str1->htext->getCString(), "" ) ){
+ str1->htext->append("", 4);
+ finish_italic = false;
+ }
+ if( finish_bold )
+ str1->htext->append("", 4);
+ if( finish_italic )
+ str1->htext->append("", 4);
+ if(str1->getLink() != NULL && !xml )
str1->htext->append("");
str1->xMin = curX; str1->yMin = curY;
@@ -546,7 +577,7 @@ void HtmlPage::coalesce() {
str1->htext->insert(0,"",3);
if( hfont1->isItalic() )
str1->htext->insert(0,"",3);
- if( str1->getLink() != NULL ) {
+ if( str1->getLink() != NULL && !xml ) {
GooString *ls = str1->getLink()->getLinkStart();
str1->htext->insert(0, ls);
delete ls;
@@ -554,11 +585,17 @@ void HtmlPage::coalesce() {
}
}
str1->xMin = curX; str1->yMin = curY;
- if( hfont1->isBold() )
- str1->htext->append("",4);
- if( hfont1->isItalic() )
- str1->htext->append("",4);
- if(str1->getLink() != NULL )
+ GBool finish_bold = hfont1->isBold();
+ GBool finish_italic = hfont1->isItalic();
+ if( finish_bold && finish_italic && strrstr( str1->htext->getCString(), "" ) > strrstr( str1->htext->getCString(), "" ) ){
+ str1->htext->append("", 4);
+ finish_italic = false;
+ }
+ if( finish_bold )
+ str1->htext->append("", 4);
+ if( finish_italic )
+ str1->htext->append("", 4);
+ if(str1->getLink() != NULL && !xml )
str1->htext->append("");
#if 0 //~ for debugging
diff -urp poppler-0.8.4.orig/utils/pdftohtml.cc poppler-0.8.4.simx/utils/pdftohtml.cc
--- poppler-0.8.4.orig/utils/pdftohtml.cc 2008-04-29 16:42:05.000000000 -0400
+++ poppler-0.8.4.simx/utils/pdftohtml.cc 2008-07-02 23:07:01.000000000 -0400
@@ -388,6 +388,13 @@ static GooString* getInfoString(Dict *in
return s1;
}
+// Clamp v to [min_v, max_v]; the lower bound is checked first.
+static inline int MinMax( int min_v, int v, int max_v )
+{
+  if( v < min_v ) return min_v;
+  return ( v > max_v ) ? max_v : v;
+}
+
static GooString* getInfoDate(Dict *infoDict, char *key) {
Object obj;
char *s;
@@ -403,12 +410,12 @@ static GooString* getInfoDate(Dict *info
}
if (sscanf(s, "%4d%2d%2d%2d%2d%2d",
&year, &mon, &day, &hour, &min, &sec) == 6) {
- tmStruct.tm_year = year - 1900;
- tmStruct.tm_mon = mon - 1;
- tmStruct.tm_mday = day;
- tmStruct.tm_hour = hour;
- tmStruct.tm_min = min;
- tmStruct.tm_sec = sec;
+ tmStruct.tm_year = MinMax( 0, year - 1900, 200 );
+ tmStruct.tm_mon = MinMax( 0, mon - 1, 11 );
+ tmStruct.tm_mday = MinMax( 1, day, 31 );
+ tmStruct.tm_hour = MinMax( 0, hour, 23 );
+ tmStruct.tm_min = MinMax( 0, min, 59 );
+ tmStruct.tm_sec = MinMax( 0, sec, 59 );
tmStruct.tm_wday = -1;
tmStruct.tm_yday = -1;
tmStruct.tm_isdst = -1;