From 4d4ec33c06a4ce59c2000ff84d1b1d100f321b7e Mon Sep 17 00:00:00 2001 From: Johannes Buchner Date: Mon, 14 Jun 2010 18:30:21 +1200 Subject: [PATCH] search: implemented KMP --- poppler/TextOutputDev.cc | 228 ++++++++++++++++++++++++++-------------------- 1 files changed, 128 insertions(+), 100 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 87f6f08..b0eed41 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3379,6 +3379,33 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { } } +static void preKmp(Unicode *needle, int m, int kmpNext[]) { + int i, j; + + i = 2; + j = 0; + kmpNext[0] = -1; + for (int i = 0; i < m; i++) { + kmpNext[i + 1] = kmpNext[i] + 1; + while (kmpNext[i + 1] > 0 && + needle[i] != needle[kmpNext[i + 1] - 1]) + kmpNext[i + 1] = kmpNext[kmpNext[i + 1] - 1] + 1; + } +} +static void preKmpBackward(Unicode *needle, int m, int kmpNext[]) { + int i, j; + + i = 2; + j = 0; + kmpNext[0] = -1; + for (int i = 0; i < m; i++) { + kmpNext[i + 1] = kmpNext[i] + 1; + while (kmpNext[i + 1] > 0 && + needle[m - 1 - i] != needle[m - 1 - kmpNext[i + 1] - 1]) + kmpNext[i + 1] = kmpNext[kmpNext[i + 1] - 1] + 1; + } +} + GBool TextPage::findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, @@ -3393,6 +3420,7 @@ GBool TextPage::findText(Unicode *s, int len, double xStart, yStart, xStop, yStop; double xMin0, yMin0, xMax0, yMax0; double xMin1, yMin1, xMax1, yMax1; + int* kmpNext; GBool found; //~ needs to handle right-to-left text @@ -3410,6 +3438,12 @@ GBool TextPage::findText(Unicode *s, int len, } else { s2 = unicodeNormalizeNFKC(s, len, &len, NULL); } + // calculate next array for KMP + kmpNext = (int*)gmallocn(len + 1, sizeof(int)); + if (backward) + preKmpBackward(s2, len, kmpNext); + else + preKmp(s2, len, kmpNext); txt = NULL; txtSize = 0; @@ -3453,124 +3487,118 @@ GBool TextPage::findText(Unicode *s, int len, // check: is the line above the top limit? if (!startAtTop && - (backward ? line->yMin > yStart : line->yMin < yStart)) { - continue; + (backward ? line->yMin > yStart : line->yMin < yStart)) { + continue; } // check: is the line below the bottom limit? if (!stopAtBottom && - (backward ? line->yMin < yStop : line->yMin > yStop)) { - continue; + (backward ? line->yMin < yStop : line->yMin > yStop)) { + continue; } if (!line->normalized) - line->normalized = unicodeNormalizeNFKC(line->text, line->len, - &line->normalized_len, - &line->normalized_idx); + line->normalized = unicodeNormalizeNFKC(line->text, line->len, + &line->normalized_len, + &line->normalized_idx); // convert the line to uppercase m = line->normalized_len; if (!caseSensitive) { - if (m > txtSize) { - txt = (Unicode *)greallocn(txt, m, sizeof(Unicode)); - txtSize = m; - } - for (k = 0; k < m; ++k) { - txt[k] = unicodeToUpper(line->normalized[k]); - } - } else { - txt = line->normalized; - } - + if (m > txtSize) { + txt = (Unicode *)greallocn(txt, m, sizeof(Unicode)); + txtSize = m; + } + for (k = 0; k < m; ++k) { + txt[k] = unicodeToUpper(line->normalized[k]); + } + } else { + txt = line->normalized; + } // search each position in this line - j = backward ? m - len : 0; - p = txt + j; - while (backward ? j >= 0 : j <= m - len) { - - // compare the strings - for (k = 0; k < len; ++k) { - if (p[k] != s2[k]) { - break; - } - } - - // found it - if (k == len) { - // where s2 matches a subsequence of a compatibility equivalence - // decomposition, highlight the entire glyph, since we don't know - // the internal layout of subglyph components - int normStart = line->normalized_idx[j]; - int normAfterEnd = line->normalized_idx[j + len - 1] + 1; - switch (line->rot) { - case 0: - xMin1 = line->edge[normStart]; - xMax1 = line->edge[normAfterEnd]; - yMin1 = line->yMin; - yMax1 = line->yMax; - break; - case 1: - xMin1 = line->xMin; - xMax1 = line->xMax; - yMin1 = line->edge[normStart]; - yMax1 = line->edge[normAfterEnd]; - break; - case 2: - xMin1 = line->edge[normAfterEnd]; - xMax1 = line->edge[normStart]; - yMin1 = line->yMin; - yMax1 = line->yMax; - break; - case 3: - xMin1 = line->xMin; - xMax1 = line->xMax; - yMin1 = line->edge[normAfterEnd]; - yMax1 = line->edge[normStart]; - break; - } - if (backward) { - if ((startAtTop || - yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && - (stopAtBottom || - yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) { - if (!found || - yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) { - xMin0 = xMin1; - xMax0 = xMax1; - yMin0 = yMin1; - yMax0 = yMax1; - found = gTrue; - } - } - } else { - if ((startAtTop || - yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && - (stopAtBottom || - yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) { - if (!found || - yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) { - xMin0 = xMin1; - xMax0 = xMax1; - yMin0 = yMin1; - yMax0 = yMax1; - found = gTrue; - } - } - } - } - if (backward) { - --j; - --p; - } else { - ++j; - ++p; - } + j = 0; // index in haystack=p of len m + p = txt; // haystack + k = 0; // index in needle=s2 of len len + while (j + len < m) { + if (backward ? s2[len - 1 - k] == p[m - 1 - (k + j)] : s2[k] == p[k + j]) { + k++; + if (k == len) { + // where s2 matches a subsequence of a compatibility equivalence + // decomposition, highlight the entire glyph, since we don't know + // the internal layout of subglyph components + int normStart = line->normalized_idx[(backward? m - 1 - j : j)]; + int normAfterEnd = line->normalized_idx[(backward? m - 1 - (j + len - 1): j + len - 1)] + 1; + switch (line->rot) { + case 0: + xMin1 = line->edge[normStart]; + xMax1 = line->edge[normAfterEnd]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 1: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[normStart]; + yMax1 = line->edge[normAfterEnd]; + break; + case 2: + xMin1 = line->edge[normAfterEnd]; + xMax1 = line->edge[normStart]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 3: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[normAfterEnd]; + yMax1 = line->edge[normStart]; + break; + } + if (backward) { + if ((startAtTop || + yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && + (stopAtBottom || + yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) { + if (!found || + yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) { + xMin0 = xMin1; + xMax0 = xMax1; + yMin0 = yMin1; + yMax0 = yMax1; + found = gTrue; + } + } + } else { + if ((startAtTop || + yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && + (stopAtBottom || + yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) { + if (!found || + yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) { + xMin0 = xMin1; + xMax0 = xMax1; + yMin0 = yMin1; + yMax0 = yMax1; + found = gTrue; + } + } + } + } + } else { + j += k - kmpNext[k]; + if (kmpNext[k] > -1) + k = kmpNext[k]; + else + k = 0; + } } } - } + } gfree(s2); if (!caseSensitive) { gfree(txt); } + gfree(kmpNext); if (found) { *xMin = xMin0; -- 1.6.4.4