From 1a2c87d06e068589751db96e55db931f73fa7497 Mon Sep 17 00:00:00 2001 From: Johannes Buchner Date: Wed, 16 Jun 2010 03:48:47 +1200 Subject: [PATCH 2/2] search: implemented KMP (Knuth-Morris-Pratt) --- poppler/TextOutputDev.cc | 187 +++++++++++++++++++++++++++------------------- 1 files changed, 111 insertions(+), 76 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index dfa2c43..74fb914 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3379,6 +3379,33 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { } } +static void preKmp(Unicode * needle, int m, int kmpNext[]) { + int i, j; + + i = 2; + j = 0; + kmpNext[0] = -1; + for (int i = 0; i < m; i++) { + kmpNext[i + 1] = kmpNext[i] + 1; + while (kmpNext[i + 1] > 0 && needle[i] != needle[kmpNext[i + 1] - 1]) + kmpNext[i + 1] = kmpNext[kmpNext[i + 1] - 1] + 1; + } +} + +static void preKmpBackward(Unicode * needle, int m, int kmpNext[]) { + int i, j; + + i = 2; + j = 0; + kmpNext[0] = -1; + for (int i = 0; i < m; i++) { + kmpNext[i + 1] = kmpNext[i] + 1; + while (kmpNext[i + 1] > 0 && + needle[m - 1 - i] != needle[m - 1 - kmpNext[i + 1] - 1]) + kmpNext[i + 1] = kmpNext[kmpNext[i + 1] - 1] + 1; + } +} + GBool TextPage::findText(Unicode * s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, @@ -3393,6 +3420,7 @@ GBool TextPage::findText(Unicode * s, int len, double xStart, yStart, xStop, yStop; double xMin0, yMin0, xMax0, yMax0; double xMin1, yMin1, xMax1, yMax1; + int *kmpNext; GBool found; //~ needs to handle right-to-left text @@ -3409,6 +3437,12 @@ GBool TextPage::findText(Unicode * s, int len, } else { s2 = unicodeNormalizeNFKC(s, len, &len, NULL); } + // calculate next array for KMP + kmpNext = (int *) gmallocn(len + 1, sizeof(int)); + if (backward) + preKmpBackward(s2, len, kmpNext); + else + preKmp(s2, len, kmpNext); txt = NULL; txtSize = 0; @@ -3478,88 +3512,88 @@ GBool TextPage::findText(Unicode * s, int len, } else { txt = line->normalized; } - // search each position in this line - j = backward ? m - len : 0; - p = txt + j; - while (backward ? j >= 0 : j <= m - len) { - - // compare the strings - for (k = 0; k < len; ++k) { - if (p[k] != s2[k]) { - break; - } - } - - // found it - if (k == len) { - // where s2 matches a subsequence of a compatibility equivalence - // decomposition, highlight the entire glyph, since we don't know - // the internal layout of subglyph components - int normStart = line->normalized_idx[j]; - int normAfterEnd = line->normalized_idx[j + len - 1] + 1; - switch (line->rot) { - case 0: - xMin1 = line->edge[normStart]; - xMax1 = line->edge[normAfterEnd]; - yMin1 = line->yMin; - yMax1 = line->yMax; - break; - case 1: - xMin1 = line->xMin; - xMax1 = line->xMax; - yMin1 = line->edge[normStart]; - yMax1 = line->edge[normAfterEnd]; - break; - case 2: - xMin1 = line->edge[normAfterEnd]; - xMax1 = line->edge[normStart]; - yMin1 = line->yMin; - yMax1 = line->yMax; - break; - case 3: - xMin1 = line->xMin; - xMax1 = line->xMax; - yMin1 = line->edge[normAfterEnd]; - yMax1 = line->edge[normStart]; - break; - } - if (backward) { - if ((startAtTop || - yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && - (stopAtBottom || - yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) { - if (!found || - yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) { - xMin0 = xMin1; - xMax0 = xMax1; - yMin0 = yMin1; - yMax0 = yMax1; - found = gTrue; - } + j = 0; // index in haystack=p of len m + p = txt; // haystack + k = 0; // index in needle=s2 of len len + while (j + len < m) { + if (backward ? s2[len - 1 - k] == p[m - 1 - (k + j)] : s2[k] == + p[k + j]) { + k++; + + // found it + if (k == len) { + // where s2 matches a subsequence of a compatibility equivalence + // decomposition, highlight the entire glyph, since we don't know + // the internal layout of subglyph components + int normStart = + line->normalized_idx[(backward ? m - 1 - j : j)]; + int normAfterEnd = + line-> + normalized_idx[(backward ? m - 1 - (j + len - 1) : j + + len - 1)] + 1; + switch (line->rot) { + case 0: + xMin1 = line->edge[normStart]; + xMax1 = line->edge[normAfterEnd]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 1: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[normStart]; + yMax1 = line->edge[normAfterEnd]; + break; + case 2: + xMin1 = line->edge[normAfterEnd]; + xMax1 = line->edge[normStart]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 3: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[normAfterEnd]; + yMax1 = line->edge[normStart]; + break; } - } else { - if ((startAtTop || - yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && - (stopAtBottom || - yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) { - if (!found || - yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) { - xMin0 = xMin1; - xMax0 = xMax1; - yMin0 = yMin1; - yMax0 = yMax1; - found = gTrue; + if (backward) { + if ((startAtTop || + yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) + && (stopAtBottom || yMin1 > yStop + || (yMin1 == yStop && xMin1 > xStop))) { + if (!found || yMin1 > yMin0 + || (yMin1 == yMin0 && xMin1 > xMin0)) { + xMin0 = xMin1; + xMax0 = xMax1; + yMin0 = yMin1; + yMax0 = yMax1; + found = gTrue; + } + } + } else { + if ((startAtTop || + yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) + && (stopAtBottom || yMin1 < yStop + || (yMin1 == yStop && xMin1 < xStop))) { + if (!found || yMin1 < yMin0 + || (yMin1 == yMin0 && xMin1 < xMin0)) { + xMin0 = xMin1; + xMax0 = xMax1; + yMin0 = yMin1; + yMax0 = yMax1; + found = gTrue; + } } } } - } - if (backward) { - --j; - --p; } else { - ++j; - ++p; + j += k - kmpNext[k]; + if (kmpNext[k] > -1) + k = kmpNext[k]; + else + k = 0; } } } @@ -3569,6 +3603,7 @@ GBool TextPage::findText(Unicode * s, int len, if (!caseSensitive) { gfree(txt); } + gfree(kmpNext); if (found) { *xMin = xMin0; -- 1.6.4.4