From 6dc2e36442357d62ab412610f9be7c254a1819c7 Mon Sep 17 00:00:00 2001 From: Jason Crain Date: Wed, 28 Sep 2016 14:56:02 +0000 Subject: [PATCH] TextOutputDev: Break words on all whitespace characters Some PDF creators like Chrome use no-break spaces or other whitespace characters between words, causing pdftotext -bbox to not break words as expected. Fix this by breaking words on any character with the Unicode whitespace property. Bug #97399 --- poppler/TextOutputDev.cc | 2 +- poppler/UTF.cc | 12 ++++++++++++ poppler/UTF.h | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 56ea3cc..e0dda08 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -2607,7 +2607,7 @@ void TextPage::addChar(GfxState *state, double x, double y, } // break words at space character - if (uLen == 1 && u[0] == (Unicode)0x20) { + if (uLen == 1 && UnicodeIsWhitespace(u[0])) { charPos += nBytes; endWord(); return; diff --git a/poppler/UTF.cc b/poppler/UTF.cc index 3b3ae35..c140bd4 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -26,6 +26,7 @@ #include "goo/gmem.h" #include "PDFDocEncoding.h" #include "UTF.h" +#include <algorithm> bool UnicodeIsValid(Unicode ucs4) { @@ -117,3 +118,14 @@ int TextStringToUCS4(GooString *textStr, Unicode **ucs4) *ucs4 = u; return len; } + +bool UnicodeIsWhitespace(Unicode ucs4) +{ + static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, + 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, + 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, + 0x3000 }; + Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); + Unicode const *i = std::lower_bound(spaces, end, ucs4); + return (i != end && *i == ucs4); +} diff --git a/poppler/UTF.h b/poppler/UTF.h index 248c168..5a47902 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -35,5 +35,7 @@ int TextStringToUCS4(GooString *textStr, Unicode **ucs4); // check if UCS-4 
character is valid bool UnicodeIsValid(Unicode ucs4); +// is a unicode whitespace character +bool UnicodeIsWhitespace(Unicode ucs4); #endif -- 2.9.3