From 1c46a8f35fe4121f5dd35eec861cc8f62cfb2602 Mon Sep 17 00:00:00 2001
From: Jason Crain <jason@aquaticape.us>
Date: Wed, 22 Aug 2012 22:14:21 -0500
Subject: [PATCH] Allow multiple fonts in a TextWord

---
 glib/poppler-page.cc     |   47 ++++++----
 poppler/TextOutputDev.cc |  229 ++++++++++++++++++++++------------------------
 poppler/TextOutputDev.h  |   20 ++--
 3 files changed, 145 insertions(+), 151 deletions(-)

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 8113e9c..fce173d 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1545,9 +1545,9 @@ poppler_text_attributes_new (void)
 }
 
 static gchar *
-get_font_name_from_word (TextWord *word)
+get_font_name_from_word (TextWord *word, gint word_i)
 {
-  GooString *font_name = word->getFontName();
+  GooString *font_name = word->getFontName(word_i);
   const gchar *name;
   gboolean subset;
   gint i;
@@ -1573,12 +1573,12 @@ get_font_name_from_word (TextWord *word)
  * Allocates a new PopplerTextAttributes with word attributes
  */
 static PopplerTextAttributes *
-poppler_text_attributes_new_from_word (TextWord *word)
+poppler_text_attributes_new_from_word (TextWord *word, gint i)
 {
   PopplerTextAttributes *attrs = poppler_text_attributes_new ();
   gdouble r, g, b;
 
-  attrs->font_name = get_font_name_from_word (word);
+  attrs->font_name = get_font_name_from_word (word, i);
   attrs->font_size = word->getFontSize();
   attrs->is_underlined = word->isUnderlined();
   word->getColor (&r, &g, &b);
@@ -2071,11 +2071,11 @@ poppler_page_free_text_attributes (GList *list)
 }
 
 static gboolean
-word_text_attributes_equal (TextWord *a, TextWord *b)
+word_text_attributes_equal (TextWord *a, gint ai, TextWord *b, gint bi)
 {
   double ar, ag, ab, br, bg, bb;
 
-  if (!a->getFontInfo()->matches (b->getFontInfo()))
+  if (!a->getFontInfo(ai)->matches (b->getFontInfo(bi)))
     return FALSE;
 
   if (a->getFontSize() != b->getFontSize())
@@ -2125,23 +2125,32 @@ poppler_page_get_text_attributes (PopplerPage *page)
       return NULL;
     }
 
+  TextWord *word, *prev_word = NULL;
+  gint word_i, prev_word_i;
+
   // Calculating each word attributes
   for (i = 0; i < wordlist->getLength (); i++)
     {
-      TextWord *word = wordlist->get (i);
+      word = wordlist->get (i);
 
-      // each char of the word has the same attributes
-      if (i > 0 && word_text_attributes_equal (word, wordlist->get (i - 1))) {
-        attrs = previous;
-      } else {
-        attrs = poppler_text_attributes_new_from_word (word);
-        attrs->start_index = offset;
-        if (previous)
-          previous->end_index--;
-        previous = attrs;
-        attributes = g_list_prepend (attributes, attrs);
-      }
-      offset += word->getLength () + 1;
+      for (word_i = 0; word_i < word->getLength (); word_i++)
+	{
+	  if (prev_word && word_text_attributes_equal (word, word_i, prev_word, prev_word_i)) {
+	    attrs = previous;
+	  } else {
+	    attrs = poppler_text_attributes_new_from_word (word, word_i);
+	    attrs->start_index = offset;
+	    if (previous)
+	      previous->end_index--;
+	    previous = attrs;
+	    attributes = g_list_prepend (attributes, attrs);
+	  }
+	  offset++;
+	  attrs->end_index = offset;
+	  prev_word = word;
+	  prev_word_i = word_i;
+	}
+      offset++;
       attrs->end_index = offset;
     }
   if (attrs)
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 9af7532..83899f1 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -234,104 +234,14 @@ GBool TextFontInfo::matches(TextFontInfo *fontInfo) {
 // TextWord
 //------------------------------------------------------------------------
 
-TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
-		   TextFontInfo *fontA, double fontSizeA) {
-  GfxFont *gfxFont;
-  double x, y, ascent, descent;
-  int wMode;
-  
+TextWord::TextWord(GfxState *state, int rotA, double fontSizeA) {
   rot = rotA;
-  font = fontA;
   fontSize = fontSizeA;
-  state->transform(x0, y0, &x, &y);
-  if ((gfxFont = font->gfxFont)) {
-    ascent = gfxFont->getAscent() * fontSize;
-    descent = gfxFont->getDescent() * fontSize;
-    wMode = gfxFont->getWMode();
-  } else {
-    // this means that the PDF file draws text without a current font,
-    // which should never happen
-    ascent = 0.95 * fontSize;
-    descent = -0.35 * fontSize;
-    wMode = 0;
-  }
-  if (wMode) { // vertical writing mode
-    // NB: the rotation value has been incremented by 1 (in
-    // TextPage::beginWord()) for vertical writing mode
-    switch (rot) {
-    case 0:
-      yMin = y - fontSize;
-      yMax = y;
-      base = y;
-      break;
-    case 1:
-      xMin = x;
-      xMax = x + fontSize;
-      base = x;
-      break;
-    case 2:
-      yMin = y;
-      yMax = y + fontSize;
-      base = y;
-      break;
-    case 3:
-      xMin = x - fontSize;
-      xMax = x;
-      base = x;
-      break;
-    }
-  } else { // horizontal writing mode
-    switch (rot) {
-    case 0:
-      yMin = y - ascent;
-      yMax = y - descent;
-      if (yMin == yMax) {
-	// this is a sanity check for a case that shouldn't happen -- but
-	// if it does happen, we want to avoid dividing by zero later
-	yMin = y;
-	yMax = y + 1;
-      }
-      base = y;
-      break;
-    case 1:
-      xMin = x + descent;
-      xMax = x + ascent;
-      if (xMin == xMax) {
-	// this is a sanity check for a case that shouldn't happen -- but
-	// if it does happen, we want to avoid dividing by zero later
-	xMin = x;
-	xMax = x + 1;
-      }
-      base = x;
-      break;
-    case 2:
-      yMin = y + descent;
-      yMax = y + ascent;
-      if (yMin == yMax) {
-	// this is a sanity check for a case that shouldn't happen -- but
-	// if it does happen, we want to avoid dividing by zero later
-	yMin = y;
-	yMax = y + 1;
-      }
-      base = y;
-      break;
-    case 3:
-      xMin = x - ascent;
-      xMax = x - descent;
-      if (xMin == xMax) {
-	// this is a sanity check for a case that shouldn't happen -- but
-	// if it does happen, we want to avoid dividing by zero later
-	xMin = x;
-	xMax = x + 1;
-      }
-      base = x;
-      break;
-    }
-  }
   text = NULL;
   charcode = NULL;
   edge = NULL;
   charPos = NULL;
+  font = NULL;
   len = size = 0;
   spaceAfter = gFalse;
   next = NULL;
@@ -358,12 +268,14 @@ TextWord::~TextWord() {
   gfree(charcode);
   gfree(edge);
   gfree(charPos);
+  gfree(font);
 }
 
-void TextWord::addChar(GfxState *state, double x, double y,
+void TextWord::addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
 		       double dx, double dy, int charPosA, int charLen,
 		       CharCode c, Unicode u) {
-  int wMode;
+  GfxFont *gfxFont;
+  double ascent, descent;
 
   if (len == size) {
     size += 16;
@@ -371,12 +283,28 @@ void TextWord::addChar(GfxState *state, double x, double y,
     charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode));
     edge = (double *)greallocn(edge, (size + 1), sizeof(double));
     charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
+    font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
   }
   text[len] = u;
   charcode[len] = c;
   charPos[len] = charPosA;
   charPos[len + 1] = charPosA + charLen;
-  wMode = font->gfxFont ? font->gfxFont->getWMode() : 0;
+  font[len] = fontA;
+
+  if (len == 0) {
+    if ((gfxFont = fontA->gfxFont)) {
+      ascent = gfxFont->getAscent() * fontSize;
+      descent = gfxFont->getDescent() * fontSize;
+      wMode = gfxFont->getWMode();
+    } else {
+      // this means that the PDF file draws text without a current font,
+      // which should never happen
+      ascent = 0.95 * fontSize;
+      descent = -0.35 * fontSize;
+      wMode = 0;
+    }
+  }
+
   if (wMode) { // vertical writing mode
     // NB: the rotation value has been incremented by 1 (in
     // TextPage::beginWord()) for vertical writing mode
@@ -384,27 +312,39 @@ void TextWord::addChar(GfxState *state, double x, double y,
     case 0:
       if (len == 0) {
 	xMin = x - fontSize;
+	yMin = y - fontSize;
+	yMax = y;
+	base = y;
       }
       edge[len] = x - fontSize;
       xMax = edge[len+1] = x;
       break;
     case 1:
       if (len == 0) {
+	xMin = x;
 	yMin = y - fontSize;
+	xMax = x + fontSize;
+	base = x;
       }
       edge[len] = y - fontSize;
       yMax = edge[len+1] = y;
       break;
     case 2:
       if (len == 0) {
+	yMin = y;
 	xMax = x + fontSize;
+	yMax = y + fontSize;
+	base = y;
       }
       edge[len] = x + fontSize;
       xMin = edge[len+1] = x;
       break;
     case 3:
       if (len == 0) {
+	xMin = x - fontSize;
+	xMax = x;
 	yMax = y + fontSize;
+	base = x;
       }
       edge[len] = y + fontSize;
       yMin = edge[len+1] = y;
@@ -415,27 +355,63 @@ void TextWord::addChar(GfxState *state, double x, double y,
     case 0:
       if (len == 0) {
 	xMin = x;
+	yMin = y - ascent;
+	yMax = y - descent;
+	if (yMin == yMax) {
+	  // this is a sanity check for a case that shouldn't happen -- but
+	  // if it does happen, we want to avoid dividing by zero later
+	  yMin = y;
+	  yMax = y + 1;
+	}
+	base = y;
       }
       edge[len] = x;
       xMax = edge[len+1] = x + dx;
       break;
     case 1:
       if (len == 0) {
+	xMin = x + descent;
 	yMin = y;
+	xMax = x + ascent;
+	if (xMin == xMax) {
+	  // this is a sanity check for a case that shouldn't happen -- but
+	  // if it does happen, we want to avoid dividing by zero later
+	  xMin = x;
+	  xMax = x + 1;
+	}
+	base = x;
       }
       edge[len] = y;
       yMax = edge[len+1] = y + dy;
       break;
     case 2:
       if (len == 0) {
+	yMin = y + descent;
 	xMax = x;
+	yMax = y + ascent;
+	if (yMin == yMax) {
+	  // this is a sanity check for a case that shouldn't happen -- but
+	  // if it does happen, we want to avoid dividing by zero later
+	  yMin = y;
+	  yMax = y + 1;
+	}
+	base = y;
       }
       edge[len] = x;
       xMin = edge[len+1] = x + dx;
       break;
     case 3:
       if (len == 0) {
+	xMin = x - ascent;
+	xMax = x - descent;
 	yMax = y;
+	if (xMin == xMax) {
+	  // this is a sanity check for a case that shouldn't happen -- but
+	  // if it does happen, we want to avoid dividing by zero later
+	  xMin = x;
+	  xMax = x + 1;
+	}
+	base = x;
       }
       edge[len] = y;
       yMin = edge[len+1] = y + dy;
@@ -466,12 +442,14 @@ void TextWord::merge(TextWord *word) {
     charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
     edge = (double *)greallocn(edge, (size + 1), sizeof(double));
     charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
+    font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
   }
   for (i = 0; i < word->len; ++i) {
     text[len + i] = word->text[i];
     charcode[len + i] = word->charcode[i];
     edge[len + i] = word->edge[i];
     charPos[len + i] = word->charPos[i];
+    font[len + i] = word->font[i];
   }
   edge[len + word->len] = word->edge[word->len];
   charPos[len + word->len] = word->charPos[word->len];
@@ -863,7 +841,7 @@ void TextLine::coalesce(UnicodeMap *uMap) {
 	word0->spaceAfter = gTrue;
 	word0 = word1;
 	word1 = word1->next;
-      } else if (word0->font == word1->font &&
+      } else if (word0->font[word0->len - 1] == word1->font[0] &&
 		 word0->underlined == word1->underlined &&
 		 fabs(word0->fontSize - word1->fontSize) <
 		   maxWordFontSizeDelta * words->fontSize &&
@@ -2234,7 +2212,7 @@ void TextPage::updateFont(GfxState *state) {
   }
 }
 
-void TextPage::beginWord(GfxState *state, double x0, double y0) {
+void TextPage::beginWord(GfxState *state) {
   GfxFont *gfxFont;
   double *fontm;
   double m[4], m2[4];
@@ -2274,7 +2252,7 @@ void TextPage::beginWord(GfxState *state, double x0, double y0) {
     rot = (rot + 1) & 3;
   }
 
-  curWord = new TextWord(state, rot, x0, y0, curFont, curFontSize);
+  curWord = new TextWord(state, rot, curFontSize);
 }
 
 void TextPage::addChar(GfxState *state, double x, double y,
@@ -2283,6 +2261,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
   double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
   GBool overlap;
   int i;
+  int wMode;
 
   // subtract char and word spacing from the dx,dy values
   sp = state->getCharSpace();
@@ -2329,6 +2308,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
   // (3) the previous character was an overlap (we want each duplicated
   //     character to be in a word by itself at this stage),
   // (4) the font size has changed
+  // (5) the writing mode (wMode) has changed
   if (curWord && curWord->len > 0) {
     base = sp = delta = 0; // make gcc happy
     switch (curWord->rot) {
@@ -2355,11 +2335,13 @@ void TextPage::addChar(GfxState *state, double x, double y,
     }
     overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
               fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
+    wMode = curFont->gfxFont ? curFont->gfxFont->getWMode() : 0;
     if (overlap || lastCharOverlap ||
 	sp < -minDupBreakOverlap * curWord->fontSize ||
 	sp > minWordBreakSpace * curWord->fontSize ||
 	fabs(base - curWord->base) > 0.5 ||
-	curFontSize != curWord->fontSize) {
+	curFontSize != curWord->fontSize ||
+	wMode != curWord->wMode) {
       endWord();
     }
     lastCharOverlap = overlap;
@@ -2370,7 +2352,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
   if (uLen != 0) {
     // start a new word if needed
     if (!curWord) {
-      beginWord(state, x, y);
+      beginWord(state);
     }
 
     // page rotation and/or transform matrices can cause text to be
@@ -2381,7 +2363,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
         (curWord->rot == 2 && w1 > 0) ||
         (curWord->rot == 3 && h1 > 0)) {
       endWord();
-      beginWord(state, x + dx, y + dy);
+      beginWord(state);
       x1 += w1;
       y1 += h1;
       w1 = -w1;
@@ -2397,18 +2379,18 @@ void TextPage::addChar(GfxState *state, double x, double y,
 	  /* next code is a low surrogate */
 	  Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
 	  i++;
-	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
+	  curWord->addChar(state, curFont, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu);
 	} else {
 	    /* missing low surrogate
 	     replace it with REPLACEMENT CHARACTER (U+FFFD) */
-	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
+	  curWord->addChar(state, curFont, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
 	}
       } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
 	  /* invalid low surrogate
 	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
-	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
+	curWord->addChar(state, curFont, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd);
       } else {
-	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
+	curWord->addChar(state, curFont, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]);
       }
     }
   }
@@ -4356,29 +4338,32 @@ void TextSelectionPainter::visitLine (TextLine *line,
 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
 				      PDFRectangle *selection)
 {
-  GooString *string;
-  int i;
-
   state->setFillColor(glyph_color);
   out->updateFillColor(state);
-  word->font->gfxFont->incRefCnt();
-  state->setFont(word->font->gfxFont, word->fontSize);
-  out->updateFont(state);
 
-  /* The only purpose of this string is to let the output device query
-   * it's length.  Might want to change this interface later. */
+  while (begin < end) {
+    TextFontInfo *font = word->font[begin];
+    font->gfxFont->incRefCnt();
+    state->setFont(font->gfxFont, word->fontSize);
+    out->updateFont(state);
 
-  string = new GooString ((char *) word->charcode, end - begin);
+    int fEnd = begin + 1;
+    while (fEnd < end && font->matches(word->font[fEnd]))
+      fEnd++;
 
-  out->beginString(state, string);
+    /* The only purpose of this string is to let the output device query
+     * its length.  Might want to change this interface later. */
+    GooString *string = new GooString ((char *) word->charcode, fEnd - begin);
+    out->beginString(state, string);
 
-  for (i = begin; i < end; i++)
-    out->drawChar(state, word->edge[i], word->base, 0, 0, 0, 0,
-		  word->charcode[i], 1, NULL, 0);
-  
-  out->endString(state);
-
-  delete string;
+    for (int i = begin; i < fEnd; i++) {
+      out->drawChar(state, word->edge[i], word->base, 0, 0, 0, 0,
+		    word->charcode[i], 1, NULL, 0);
+    }
+    out->endString(state);
+    delete string;
+    begin = fEnd;
+  }
 }
 
 void TextWord::visitSelection(TextSelectionVisitor *visitor,
diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h
index e31876b..e7707af 100644
--- a/poppler/TextOutputDev.h
+++ b/poppler/TextOutputDev.h
@@ -113,14 +113,13 @@ class TextWord {
 public:
 
   // Constructor.
-  TextWord(GfxState *state, int rotA, double x0, double y0,
-	   TextFontInfo *fontA, double fontSize);
+  TextWord(GfxState *state, int rotA, double fontSize);
 
   // Destructor.
   ~TextWord();
 
   // Add a character to the word.
-  void addChar(GfxState *state, double x, double y,
+  void addChar(GfxState *state, TextFontInfo *fontA, double x, double y,
 	       double dx, double dy, int charPosA, int charLen,
 	       CharCode c, Unicode u);
 
@@ -141,8 +140,8 @@ public:
 		      PDFRectangle *selection,
 		      SelectionStyle style);
 
-  // Get the TextFontInfo object associated with this word.
-  TextFontInfo *getFontInfo() { return font; }
+  // Get the TextFontInfo object associated with a character.
+  TextFontInfo *getFontInfo(int idx) { return font[idx]; }
 
   // Get the next TextWord on the linked list.
   TextWord *getNext() { return next; }
@@ -151,7 +150,7 @@ public:
   int getLength() { return len; }
   const Unicode *getChar(int idx) { return &text[idx]; }
   GooString *getText();
-  GooString *getFontName() { return font->fontName; }
+  GooString *getFontName(int idx) { return font[idx]->fontName; }
   void getColor(double *r, double *g, double *b)
     { *r = colorR; *g = colorG; *b = colorB; }
   void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
@@ -184,13 +183,14 @@ private:
   int *charPos;			// character position (within content stream)
 				//   of each char (plus one extra entry for
 				//   the last char)
-  int len;			// length of text/edge/charPos arrays
-  int size;			// size of text/edge/charPos arrays
-  TextFontInfo *font;		// font information
+  int len;			// length of text/edge/charPos/font arrays
+  int size;			// size of text/edge/charPos/font arrays
+  TextFontInfo **font;		// font information for each char
   double fontSize;		// font size
   GBool spaceAfter;		// set if there is a space between this
 				//   word and the next word on the line
   TextWord *next;		// next word in line
+  int wMode;			// horizontal (0) or vertical (1) writing mode
 
 #if TEXTOUT_WORD_LIST
   double colorR,		// word color
@@ -498,7 +498,7 @@ public:
   void updateFont(GfxState *state);
 
   // Begin a new word.
-  void beginWord(GfxState *state, double x0, double y0);
+  void beginWord(GfxState *state);
 
   // Add a character to the current word.
   void addChar(GfxState *state, double x, double y,
-- 
1.7.9.5