From 3d5bf7b9305ece2bb0c1625563e44d0e128a3067 Mon Sep 17 00:00:00 2001 From: Khaled Hosny Date: Wed, 18 Nov 2015 14:47:28 +0400 Subject: [PATCH] Handle right-to-left text in search Currently right-to-left text reversal is only done during text dumping, but not during search. This commit applies the same reversal logic during PDF search as well. --- poppler/TextOutputDev.cc | 227 ++++++++++++++++++++++++++++++----------------- 1 file changed, 144 insertions(+), 83 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index bbb371a..3aed89f 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -178,6 +178,133 @@ #define combMaxMidDelta 0.3 #define combMaxBaseDelta 0.4 +static Unicode* reorderText(Unicode *text, int len, GBool primaryLR) { + int i, j, k; + + Unicode *out = (Unicode*)gmallocn(len, sizeof(Unicode)); + int outIdx = 0; + + if (primaryLR) { + i = 0; + while (i < len) { + // copy a left-to-right run into out in its original order + for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; + for (k = i; k < j; ++k) + out[outIdx++] = text[k]; + i = j; + // copy a right-to-left run into out in reverse order + for (j = i; + j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); + ++j) ; + if (j > i) { + for (k = j - 1; k >= i; --k) + out[outIdx++] = text[k]; + i = j; + } + } + } else { + // Note: This code treats numeric characters (European and + // Arabic/Indic) as left-to-right, which isn't strictly correct, + // but does produce correct visual ordering. Unlike + // dumpReorderedText(), no LRE/RLE/POPDF marks are written here.
+ i = len - 1; + while (i >= 0) { + // scanning from the end: copy a right-to-left run in reverse order + for (j = i; + j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); + --j) ; + for (k = i; k > j; --k) + out[outIdx++] = text[k]; + i = j; + // copy the preceding left-to-right run in its original order + for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; + if (j < i) { + for (k = j + 1; k <= i; ++k) + out[outIdx++] = text[k]; + i = j; + } + } + } + + return out; +} + +static int dumpReorderedText(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GooString *s) { + char lre[8], rle[8], popdf[8], buf[8]; + int lreLen, rleLen, popdfLen, n; + int nCols, i, j, k; + + nCols = 0; + + lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); + rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); + popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); + + if (primaryLR) { + i = 0; + while (i < len) { + // output a left-to-right section + for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; + for (k = i; k < j; ++k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + i = j; + // output a right-to-left section + for (j = i; + j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); + ++j) ; + if (j > i) { + s->append(rle, rleLen); + for (k = j - 1; k >= i; --k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + s->append(popdf, popdfLen); + i = j; + } + } + + } else { + + // Note: This code treats numeric characters (European and + // Arabic/Indic) as left-to-right, which isn't strictly correct + // (incurs extra LRE/POPDF pairs), but does produce correct + // visual formatting.
+ s->append(rle, rleLen); + i = len - 1; + while (i >= 0) { + // output a right-to-left section + for (j = i; + j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); + --j) ; + for (k = i; k > j; --k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + i = j; + // output a left-to-right section + for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; + if (j < i) { + s->append(lre, lreLen); + for (k = j + 1; k <= i; ++k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + s->append(popdf, popdfLen); + i = j; + } + } + s->append(popdf, popdfLen); + } + + return nCols; +} + //------------------------------------------------------------------------ // TextUnderline //------------------------------------------------------------------------ @@ -3720,7 +3847,7 @@ GBool TextPage::findText(Unicode *s, int len, double *xMax, double *yMax) { TextBlock *blk; TextLine *line; - Unicode *s2, *txt; + Unicode *s2, *txt, *reordered; Unicode *p; int txtSize, m, i, j, k; double xStart, yStart, xStop, yStop; @@ -3728,20 +3855,22 @@ GBool TextPage::findText(Unicode *s, int len, double xMin1, yMin1, xMax1, yMax1; GBool found; - //~ needs to handle right-to-left text if (rawOrder) { return gFalse; } + // handle right-to-left text: reorderText() returns a reordered copy of s, freed below + reordered = reorderText(s, len, primaryLR); + // convert the search string to uppercase if (!caseSensitive) { - s2 = unicodeNormalizeNFKC(s, len, &len, NULL); + s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL); for (i = 0; i < len; ++i) { s2[i] = unicodeToUpper(s2[i]); } } else { - s2 = unicodeNormalizeNFKC(s, len, &len, NULL); + s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL); } txt = NULL; @@ -3915,6 +4044,7 @@ GBool TextPage::findText(Unicode *s, int len, } gfree(s2); + gfree(reordered); if (!caseSensitive) { gfree(txt); } @@ -5330,91 +5460,22 @@ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) { int TextPage::dumpFragment(Unicode
*text, int len, UnicodeMap *uMap, GooString *s) { - char lre[8], rle[8], popdf[8], buf[8]; - int lreLen, rleLen, popdfLen, n; - int nCols, i, j, k; - - nCols = 0; - if (uMap->isUnicode()) { + return dumpReorderedText(text, len, uMap, primaryLR, s); + } else { + int nCols = 0; - lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); - rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); - popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); - - if (primaryLR) { - - i = 0; - while (i < len) { - // output a left-to-right section - for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; - for (k = i; k < j; ++k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - i = j; - // output a right-to-left section - for (j = i; - j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); - ++j) ; - if (j > i) { - s->append(rle, rleLen); - for (k = j - 1; k >= i; --k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - s->append(popdf, popdfLen); - i = j; - } - } - - } else { - - // Note: This code treats numeric characters (European and - // Arabic/Indic) as left-to-right, which isn't strictly correct - // (incurs extra LRE/POPDF pairs), but does produce correct - // visual formatting.
- s->append(rle, rleLen); - i = len - 1; - while (i >= 0) { - // output a right-to-left section - for (j = i; - j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); - --j) ; - for (k = i; k > j; --k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - i = j; - // output a left-to-right section - for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; - if (j < i) { - s->append(lre, lreLen); - for (k = j + 1; k <= i; ++k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - s->append(popdf, popdfLen); - i = j; - } - } - s->append(popdf, popdfLen); + char buf[8]; + int buflen = 0; + for (int i = 0; i < len; ++i) { + buflen = uMap->mapUnicode(text[i], buf, sizeof(buf)); + s->append(buf, buflen); + nCols += buflen; } - } else { - for (i = 0; i < len; ++i) { - n = uMap->mapUnicode(text[i], buf, sizeof(buf)); - s->append(buf, n); - nCols += n; - } + return nCols; } - - return nCols; } #if TEXTOUT_WORD_LIST -- 2.6.2