From 3d5bf7b9305ece2bb0c1625563e44d0e128a3067 Mon Sep 17 00:00:00 2001
From: Khaled Hosny <khaledhosny@eglug.org>
Date: Wed, 18 Nov 2015 14:47:28 +0400
Subject: [PATCH] Handle right-to-left text in search

Currently right-to-left text reversal is only done during text dumping,
but not during search. This commit applies the same reversal logic
during PDF search as well.
---
 poppler/TextOutputDev.cc | 227 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 144 insertions(+), 83 deletions(-)

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index bbb371a..3aed89f 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -178,6 +178,133 @@
 #define combMaxMidDelta 0.3
 #define combMaxBaseDelta 0.4
 
+static Unicode* reorderText(Unicode *text, int len, GBool primaryLR) {
+  int i, j, k;  // i: scan position, j: end of the current run, k: copy cursor
+  // Returns a newly allocated, reordered copy of text[0..len); caller frees it.
+  Unicode *out = (Unicode*)gmallocn(len, sizeof(Unicode));
+  int outIdx = 0;  // next free slot in out
+  // primaryLR: scan forward from text[0]; otherwise scan backward from the end.
+  if (primaryLR) {
+    i = 0;
+    while (i < len) {
+      // output a left-to-right section (ends where unicodeTypeR() is true)
+      for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
+      for (k = i; k < j; ++k)
+          out[outIdx++] = text[k];
+      i = j;
+      // output a right-to-left section (ends at an L-type or numeric char)
+      for (j = i;
+         j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+         ++j) ;
+      if (j > i) {
+        for (k = j - 1; k >= i; --k)  // copy the run back to front
+            out[outIdx++] = text[k];
+        i = j;
+      }
+    }
+  } else {
+    // Note: This code treats numeric characters (European and
+    // Arabic/Indic) as left-to-right, which isn't strictly correct,
+    // but it splits runs the same way dumpReorderedText() does. No
+    // LRE/RLE/POPDF marks are written here, only reordered characters.
+    i = len - 1;
+    while (i >= 0) {
+      // output a right-to-left section (stops at an L-type or numeric char)
+      for (j = i;
+         j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+         --j) ;
+      for (k = i; k > j; --k)  // copy from index i down to j + 1
+          out[outIdx++] = text[k];
+      i = j;
+      // output a left-to-right section (stops where unicodeTypeR() is true)
+      for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
+      if (j < i) {
+        for (k = j + 1; k <= i; ++k)  // copy in ascending index order
+            out[outIdx++] = text[k];
+        i = j;
+      }
+    }
+  }
+
+  return out;
+}
+
+static int dumpReorderedText(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GooString *s) {
+  char lre[8], rle[8], popdf[8], buf[8];
+  int lreLen, rleLen, popdfLen, n;
+  int nCols, i, j, k;
+  // Appends text[0..len) to s through uMap, reversing right-to-left runs.
+  nCols = 0;  // counts text characters appended; directional marks excluded
+  // Encode the marks once: U+202A (LRE), U+202B (RLE), U+202C (pop formatting).
+  lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
+  rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
+  popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
+  // primaryLR: scan forward from text[0]; otherwise scan backward from the end.
+  if (primaryLR) {
+    i = 0;
+    while (i < len) {
+      // output a left-to-right section (ends where unicodeTypeR() is true)
+      for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
+      for (k = i; k < j; ++k) {
+        n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+        s->append(buf, n);
+        ++nCols;
+      }
+      i = j;
+      // output a right-to-left section (ends at an L-type or numeric char)
+      for (j = i;
+         j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+         ++j) ;
+      if (j > i) {
+        s->append(rle, rleLen);  // bracket the run with rle ... popdf
+        for (k = j - 1; k >= i; --k) {  // emit the run back to front
+          n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+          s->append(buf, n);
+          ++nCols;
+        }
+        s->append(popdf, popdfLen);
+        i = j;
+      }
+    }
+
+  } else {
+
+    // Note: This code treats numeric characters (European and
+    // Arabic/Indic) as left-to-right, which isn't strictly correct
+    // (incurs extra LRE/POPDF pairs), but does produce correct
+    // visual formatting.
+    s->append(rle, rleLen);  // the whole fragment is bracketed rle ... popdf
+    i = len - 1;
+    while (i >= 0) {
+      // output a right-to-left section (stops at an L-type or numeric char)
+      for (j = i;
+         j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+         --j) ;
+      for (k = i; k > j; --k) {  // emit from index i down to j + 1
+        n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+        s->append(buf, n);
+        ++nCols;
+      }
+      i = j;
+      // output a left-to-right section (stops where unicodeTypeR() is true)
+      for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
+      if (j < i) {
+        s->append(lre, lreLen);  // nested lre ... popdf around this run
+        for (k = j + 1; k <= i; ++k) {  // emit in ascending index order
+          n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+          s->append(buf, n);
+          ++nCols;
+        }
+        s->append(popdf, popdfLen);
+        i = j;
+      }
+    }
+    s->append(popdf, popdfLen);  // closes the rle appended before the loop
+  }
+
+  return nCols;
+}
+
 //------------------------------------------------------------------------
 // TextUnderline
 //------------------------------------------------------------------------
@@ -3720,7 +3847,7 @@ GBool TextPage::findText(Unicode *s, int len,
 			 double *xMax, double *yMax) {
   TextBlock *blk;
   TextLine *line;
-  Unicode *s2, *txt;
+  Unicode *s2, *txt, *reordered;
   Unicode *p;
   int txtSize, m, i, j, k;
   double xStart, yStart, xStop, yStop;
@@ -3728,20 +3855,22 @@ GBool TextPage::findText(Unicode *s, int len,
   double xMin1, yMin1, xMax1, yMax1;
   GBool found;
 
-  //~ needs to handle right-to-left text
 
   if (rawOrder) {
     return gFalse;
   }
 
+  // handle right-to-left text
+  reordered = reorderText(s, len, primaryLR);
+
   // convert the search string to uppercase
   if (!caseSensitive) {
-    s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
+    s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL);
     for (i = 0; i < len; ++i) {
       s2[i] = unicodeToUpper(s2[i]);
     }
   } else {
-    s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
+    s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL);
   }
 
   txt = NULL;
@@ -3915,6 +4044,7 @@ GBool TextPage::findText(Unicode *s, int len,
   }
 
   gfree(s2);
+  gfree(reordered);
   if (!caseSensitive) {
     gfree(txt);
   }
@@ -5330,91 +5460,22 @@ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
 
 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
 			   GooString *s) {
-  char lre[8], rle[8], popdf[8], buf[8];
-  int lreLen, rleLen, popdfLen, n;
-  int nCols, i, j, k;
-
-  nCols = 0;
-
   if (uMap->isUnicode()) {
+    return dumpReorderedText(text, len, uMap, primaryLR, s);  // reverses RTL runs, adds marks
+  } else {  // non-Unicode map: emit characters in stored order
+    int nCols = 0;
 
-    lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
-    rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
-    popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
-
-    if (primaryLR) {
-
-      i = 0;
-      while (i < len) {
-	// output a left-to-right section
-	for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
-	for (k = i; k < j; ++k) {
-	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-	  s->append(buf, n);
-	  ++nCols;
-	}
-	i = j;
-	// output a right-to-left section
-	for (j = i;
-	     j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
-	     ++j) ;
-	if (j > i) {
-	  s->append(rle, rleLen);
-	  for (k = j - 1; k >= i; --k) {
-	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-	    s->append(buf, n);
-	    ++nCols;
-	  }
-	  s->append(popdf, popdfLen);
-	  i = j;
-	}
-      }
-
-    } else {
-
-      // Note: This code treats numeric characters (European and
-      // Arabic/Indic) as left-to-right, which isn't strictly correct
-      // (incurs extra LRE/POPDF pairs), but does produce correct
-      // visual formatting.
-      s->append(rle, rleLen);
-      i = len - 1;
-      while (i >= 0) {
-	// output a right-to-left section
-	for (j = i;
-	     j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
-	     --j) ;
-	for (k = i; k > j; --k) {
-	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-	  s->append(buf, n);
-	  ++nCols;
-	}
-	i = j;
-	// output a left-to-right section
-	for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
-	if (j < i) {
-	  s->append(lre, lreLen);
-	  for (k = j + 1; k <= i; ++k) {
-	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-	    s->append(buf, n);
-	    ++nCols;
-	  }
-	  s->append(popdf, popdfLen);
-	  i = j;
-	}
-      }
-      s->append(popdf, popdfLen);
+    char buf[8];
+    int buflen = 0;  // length mapUnicode reports for the current character
 
+    for (int i = 0; i < len; ++i) {
+      buflen = uMap->mapUnicode(text[i], buf, sizeof(buf));
+      s->append(buf, buflen);
+      nCols += buflen;  // adds the encoded length, not one per character
     }
 
-  } else {
-    for (i = 0; i < len; ++i) {
-      n = uMap->mapUnicode(text[i], buf, sizeof(buf));
-      s->append(buf, n);
-      nCols += n;
-    }
+    return nCols;
   }
-
-  return nCols;
 }
 
 #if TEXTOUT_WORD_LIST
-- 
2.6.2