From 773a46c257626f59074abe7827c7ac2391fdabd0 Mon Sep 17 00:00:00 2001 From: Khaled Hosny Date: Mon, 23 Nov 2015 13:52:10 +0400 Subject: [PATCH 2/2] Fix finding Arabic Presentation Forms ligatures PDF text containing Arabic Presentation Forms ligatures is still not found after the previous commit. This is because the ligatures are decomposed in logical order after normalization, while the whole string is in visual order. For example, the RTL text ABCD in visual order will be DCBA, and assuming B is a ligature, it will be decomposed to B1B2, so the string after normalization will be DCB1B2A while we are expecting it to be DCB2B1A. This patch reverses the order of the decomposition of RTL characters to work around this issue. --- poppler/TextOutputDev.cc | 3 ++- poppler/UnicodeTypeTable.cc | 14 ++++++++++---- poppler/UnicodeTypeTable.h | 5 +++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 31d303d..cf74d23 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3900,7 +3900,8 @@ GBool TextPage::findText(Unicode *s, int len, if (!line->normalized) line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, - &line->normalized_idx); + &line->normalized_idx, + true); // convert the line to uppercase m = line->normalized_len; if (!caseSensitive) { diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc index 721af9d..8f574e5 100644 --- a/poppler/UnicodeTypeTable.cc +++ b/poppler/UnicodeTypeTable.cc @@ -1015,7 +1015,9 @@ Unicode unicodeToUpper(Unicode c) { // of characters written. @buf may be NULL, in which case the length of the // decomposition is returned but nothing is written. If @u is its own // decomposition, write @u into @buf and return 1. -static int decomp_compat(Unicode u, Unicode *buf) { +// If reverseRTL is true, then decompositions of RTL characters will be output +// in reverse order. 
+static int decomp_compat(Unicode u, Unicode *buf, GBool reverseRTL = false) { // decomposition tables stored as lists {character, decomp_length, offset} // so we do a binary search int start = 0, end = DECOMP_TABLE_LENGTH; @@ -1031,7 +1033,10 @@ static int decomp_compat(Unicode u, Unicode *buf) { int length = decomp_table[midpoint].length, i; if (buf) for (i = 0; i < length; ++i) - buf[i] = decomp_expansion[offset + i]; + if (unicodeTypeR(u) && reverseRTL) + buf[i] = decomp_expansion[offset + length - i - 1]; + else + buf[i] = decomp_expansion[offset + i]; return length; } } else if (midpoint == start) @@ -1126,7 +1131,8 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) { // corresponding unnormalized character. @indices is not guaranteed monotone or // onto. Unicode *unicodeNormalizeNFKC(Unicode *in, int len, - int *out_len, int **indices) { + int *out_len, int **indices, + GBool reverseRtlCompat) { Unicode *out; int i, o, *classes, *idx = NULL; @@ -1174,7 +1180,7 @@ Unicode *unicodeNormalizeNFKC(Unicode *in, int len, u = in[j]; if (j != i && COMBINING_CLASS(u) == 0) break; - dlen = decomp_compat(u, out + p); + dlen = decomp_compat(u, out + p, reverseRtlCompat); for (q = p; q < p + dlen; ++q) { classes[q] = COMBINING_CLASS(out[q]); if (indices) diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h index 869aad9..6d10a67 100644 --- a/poppler/UnicodeTypeTable.h +++ b/poppler/UnicodeTypeTable.h @@ -38,7 +38,8 @@ extern GBool unicodeIsAlphabeticPresentationForm(Unicode c); extern Unicode unicodeToUpper(Unicode c); -extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, - int *out_len, int **offsets); +extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, + int *out_len, int **offsets, + GBool reverseRtlCompat = false); #endif -- 2.6.2