From 32c866feb04eb81fcad259aaab61e4e0070bf8ef Mon Sep 17 00:00:00 2001
From: ulatekh
Date: Sun, 15 Jul 2018 09:10:51 -0700
Subject: [PATCH 3/3] Emit more font information when pdftohtml is run with -xml.

This extra information makes it easier to infer the text's meaning.

Each <fontspec> element gains optional fontFamily and embeddedFontName
attributes plus a rotOrSkewed flag.  Style suffixes (-Italic, -Bold,
-Regular, -Curves, -Corners) are stripped from the font name and the
embedded font name.  HtmlFont::isEqual() now also compares the font,
family and embedded font names, and HtmlFont::isEqualIgnoreBold() now
also compares the font name.

Either name may be null (no usable font name, or no embedded font), so
the suffix stripping is guarded against null pointers.
---
 utils/HtmlFonts.cc | 103 +++++++++++++++++++++++++++++++++++++++++++++++++----
 utils/HtmlFonts.h  |   6 ++++
 2 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc
index 4c65d5f5..94986c37 100644
--- a/utils/HtmlFonts.cc
+++ b/utils/HtmlFonts.cc
@@ -125,6 +125,43 @@ HtmlFont::HtmlFont(GfxFont *font, int _size, GfxRGB rgb){
     fontname = nullptr;
     FontName = nullptr;
   }
+
+  // Try to get more information about this font, for the XML dump.
+  GooString *familyName = font->getFamily();
+  this->FamilyName = (familyName != nullptr) ? new GooString(familyName) : nullptr;
+  this->FontID = *font->getID();
+  GooString *embeddedFontName = font->getEmbeddedFontName();
+  this->EmbeddedFontName = (embeddedFontName != nullptr) ? new GooString(embeddedFontName) : nullptr;
+  // NOTE: Removing the suffixes will make the emitted <fontspec>
+  // values the same, but there will still be duplicate-looking entries.
+  // If they're merged now, the emitted HTML won't have <b> and <i>
+  // elements in mixed-style text-lines.
+  // So those will have to be detected and merged at post-emit
+  // analysis time.
+  char const *apszSuffixes[] = { "-Italic", "-Bold", "-Regular",
+    "-Curves", "-Corners" };
+  // (Remove the style from the font name; FontName may be null here.)
+  for(char const *pszSuffix : apszSuffixes)
+  {
+    if (this->FontName != nullptr && this->FontName->endsWith(pszSuffix))
+    {
+      int iLength = this->FontName->getLength();
+      int iSuffixLength = strlen(pszSuffix);
+      this->FontName->del(iLength - iSuffixLength,
+        iSuffixLength);
+    }
+  }
+  // (Remove the style from the embedded font name, if there is one.)
+  for(char const *pszSuffix : apszSuffixes)
+  {
+    if (this->EmbeddedFontName != nullptr && this->EmbeddedFontName->endsWith(pszSuffix))
+    {
+      int iLength = this->EmbeddedFontName->getLength();
+      int iSuffixLength = strlen(pszSuffix);
+      this->EmbeddedFontName->del(iLength - iSuffixLength,
+        iSuffixLength);
+    }
+  }
 
   lineSize = -1;
 
@@ -169,6 +206,9 @@ HtmlFont::HtmlFont(const HtmlFont& x){
   pos=x.pos;
   color=x.color;
   FontName = (x.FontName) ? new GooString(x.FontName) : nullptr;
+  FamilyName = (x.FamilyName) ? new GooString(x.FamilyName) : nullptr;
+  this->FontID = x.FontID;
+  EmbeddedFontName = (x.EmbeddedFontName) ? new GooString(x.EmbeddedFontName) : nullptr;
   rotOrSkewed = x.rotOrSkewed;
   memcpy(rotSkewMat, x.rotSkewMat, sizeof(rotSkewMat));
 }
@@ -176,6 +216,8 @@ HtmlFont::HtmlFont(const HtmlFont& x){
 
 HtmlFont::~HtmlFont(){
   if (FontName) delete FontName;
+  if (FamilyName) delete FamilyName;
+  if (EmbeddedFontName) delete EmbeddedFontName;
 }
 
 HtmlFont& HtmlFont::operator=(const HtmlFont& x){
@@ -188,6 +230,11 @@ HtmlFont& HtmlFont::operator=(const HtmlFont& x){
   color=x.color;
   if (FontName) delete FontName;
   FontName = (x.FontName) ? new GooString(x.FontName) : nullptr;
+  if (FamilyName) delete FamilyName;
+  FamilyName = (x.FamilyName) ? new GooString(x.FamilyName) : nullptr;
+  this->FontID = x.FontID;
+  if (EmbeddedFontName) delete EmbeddedFontName;
+  EmbeddedFontName = (x.EmbeddedFontName) ? new GooString(x.EmbeddedFontName) : nullptr;
   return *this;
 }
 
@@ -204,10 +251,16 @@ void HtmlFont::clear(){
 */
 GBool HtmlFont::isEqual(const HtmlFont& x) const{
   return (size==x.size) &&
-	 (lineSize==x.lineSize) &&
-	 (pos==x.pos) && (bold==x.bold) && (italic==x.italic) &&
-	 (color.isEqual(x.getColor())) && isRotOrSkewed() == x.isRotOrSkewed() &&
-	 (!isRotOrSkewed() || rot_matrices_equal(getRotMat(), x.getRotMat()));
+    (lineSize==x.lineSize) &&
+    ((FontName == nullptr && x.FontName == nullptr)
+      || (FontName != nullptr && x.FontName != nullptr && FontName->cmp(x.FontName) == 0)) &&
+    ((FamilyName == nullptr && x.FamilyName == nullptr)
+      || (FamilyName != nullptr && x.FamilyName != nullptr && FamilyName->cmp(x.FamilyName) == 0)) &&
+    ((EmbeddedFontName == nullptr && x.EmbeddedFontName == nullptr)
+      || (EmbeddedFontName != nullptr && x.EmbeddedFontName != nullptr && EmbeddedFontName->cmp(x.EmbeddedFontName) == 0)) &&
+    (pos==x.pos) && (bold==x.bold) && (italic==x.italic) &&
+    (color.isEqual(x.getColor())) && isRotOrSkewed() == x.isRotOrSkewed() &&
+    (!isRotOrSkewed() || rot_matrices_equal(getRotMat(), x.getRotMat()));
 }
 
 /*
@@ -216,8 +269,10 @@ GBool HtmlFont::isEqual(const HtmlFont& x) const{
 */
 GBool HtmlFont::isEqualIgnoreBold(const HtmlFont& x) const{
   return ((size==x.size) &&
-	  (!strcmp(fonts[pos].name, fonts[x.pos].name)) &&
-	  (color.isEqual(x.getColor())));
+    ((FontName == nullptr && x.FontName == nullptr)
+      || (FontName != nullptr && x.FontName != nullptr && FontName->cmp(x.FontName) == 0)) &&
+    (!strcmp(fonts[pos].name, fonts[x.pos].name)) &&
+    (color.isEqual(x.getColor())));
 }
 
 GooString* HtmlFont::getFontName(){
@@ -225,6 +280,24 @@ GooString* HtmlFont::getFontName(){
    else return new GooString(DefaultFont);
 }
 
+// Returns the internally owned family name (may be null); do not delete.
+GooString* HtmlFont::getFamilyName() const
+{
+  return FamilyName;
+}
+
+// Returns the ID copied from the source GfxFont at construction.
+Ref HtmlFont::getFontID() const
+{
+  return FontID;
+}
+
+// Returns the internally owned embedded font name (may be null); do not delete.
+GooString* HtmlFont::getEmbeddedFontName() const
+{
+  return EmbeddedFontName;
+}
+
 GooString* HtmlFont::getFullName(){
   if (FontName)
     return new GooString(FontName);
@@ -330,6 +403,8 @@ GooString* HtmlFontAccu::CSStyle(int i, int j){
    GooString *Size=GooString::fromInt(font.getSize());
    GooString *colorStr=font.getColor().toString();
    GooString *fontName=(fontFullName ? font.getFullName() : font.getFontName());
+   GooString *familyName = font.getFamilyName();
+   GooString *embeddedFontName = font.getEmbeddedFontName();
    GooString *lSize;
    
    if(!xml){
@@ -375,15 +450,31 @@ GooString* HtmlFontAccu::CSStyle(int i, int j){
      tmp->append(";}");
    }
    if (xml) {
+     GooString *rotOrSkewedStr=GooString::fromInt(font.isRotOrSkewed() ? 1 : 0);
+
      tmp->append("<fontspec id=\"");
      tmp->append(iStr);
      tmp->append("\" size=\"");
      tmp->append(Size);
      tmp->append("\" family=\"");
      tmp->append(fontName); //font.getFontName());
+     if (familyName != nullptr)
+     {
+       tmp->append("\" fontFamily=\"");
+       tmp->append(familyName);
+     }
+     if (embeddedFontName != nullptr)
+     {
+       tmp->append("\" embeddedFontName=\"");
+       tmp->append(embeddedFontName);
+     }
+     tmp->append("\" rotOrSkewed=\"");
+     tmp->append(rotOrSkewedStr);
      tmp->append("\" color=\"");
      tmp->append(colorStr);
      tmp->append("\"/>");
+
+     delete rotOrSkewedStr;
    }
 
    delete fontName;
diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h
index ba4f42ae..d5b24a64 100644
--- a/utils/HtmlFonts.h
+++ b/utils/HtmlFonts.h
@@ -68,6 +68,9 @@ class HtmlFont{
    int pos; // position of the font name in the fonts array
    static GooString *DefaultFont;
    GooString *FontName;
+   GooString *FamilyName;
+   Ref FontID;
+   GooString *EmbeddedFontName;
    HtmlFontColor color;
    double rotSkewMat[4]; // only four values needed for rotation and skew
 public:
@@ -89,6 +92,9 @@ public:
    { rotOrSkewed = gTrue; memcpy(rotSkewMat, mat, sizeof(rotSkewMat)); }
    const double *getRotMat() const { return rotSkewMat; }
    GooString* getFontName();
+   GooString* getFamilyName() const;
+   Ref getFontID() const;
+   GooString* getEmbeddedFontName() const;
    static GooString* getDefaultFont();
    static void setDefaultFont(GooString* defaultFont);
    static GooString* HtmlFilter(const Unicode* u, int uLen); //char* s);
-- 
2.14.4