From d4dd796e74a50f2ca9b2321237eee1423019e173 Mon Sep 17 00:00:00 2001 From: Christophe Fergeau Date: Tue, 26 Sep 2017 11:02:58 +0200 Subject: [PATCH 1/2] goo: Add GooString::has{Big,Little}EndianBOM --- goo/GooString.cc | 10 ++++++++++ goo/GooString.h | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/goo/GooString.cc b/goo/GooString.cc index 10976e87..0838d058 100644 --- a/goo/GooString.cc +++ b/goo/GooString.cc @@ -917,6 +917,16 @@ GBool GooString::hasUnicodeMarker(void) const return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff; } +GBool GooString::hasBigEndianBOM(void) const +{ + return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff; +} + +GBool GooString::hasLittleEndianBOM(void) const +{ + return length > 1 && (s[0] & 0xff) == 0xff && (s[1] & 0xff) == 0xfe; +} + void GooString::prependUnicodeMarker() { insert(0, (char)0xff); diff --git a/goo/GooString.h b/goo/GooString.h index a5418c3d..623655fc 100644 --- a/goo/GooString.h +++ b/goo/GooString.h @@ -161,8 +161,11 @@ public: GBool endsWith(const char *suffix) const; GBool hasUnicodeMarker(void) const; + GBool hasBigEndianBOM(void) const; + GBool hasLittleEndianBOM(void) const; void prependUnicodeMarker(); - GBool hasJustUnicodeMarker(void) const { return length == 2 && hasUnicodeMarker(); } + /* FIXME: Move this (hasBigEndianBOM() || hasLittleEndianBOM()) check to hasUnicodeMarker? 
*/ + GBool hasJustUnicodeMarker(void) const { return length == 2 && (hasBigEndianBOM() || hasLittleEndianBOM()); } // Sanitizes the string so that it does // not contain any ( ) < > [ ] { } / % -- 2.13.5 From 993ab2aa1f4cda5b6ab74f7434146879866e76cd Mon Sep 17 00:00:00 2001 From: Christophe Fergeau Date: Tue, 26 Sep 2017 11:03:49 +0200 Subject: [PATCH 2/2] document: Handle UTF16-LE annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I can produce such annotations when adding annotations to a PDF attachment from the standard mail app on my iPhone (iOS 11). They currently all show as "ÿþÚ" rather than the actual string content. UTF16-BE vs UTF16-LE is detected by inferring the endianness from the first two bytes of the string (0xFF 0xFE and 0xFE 0xFF aka Byte Order Mark). --- glib/poppler-document.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/glib/poppler-document.cc b/glib/poppler-document.cc index 41b6a04b..e8efd3b2 100644 --- a/glib/poppler-document.cc +++ b/glib/poppler-document.cc @@ -716,10 +716,14 @@ char *_poppler_goo_string_to_utf8(GooString *s) char *result; - if (s->hasUnicodeMarker()) { + if (s->hasBigEndianBOM()) { result = g_convert (s->getCString () + 2, s->getLength () - 2, "UTF-8", "UTF-16BE", NULL, NULL, NULL); + } else if (s->hasLittleEndianBOM()) { + result = g_convert (s->getCString () + 2, + s->getLength () - 2, + "UTF-8", "UTF-16LE", NULL, NULL, NULL); } else { int len; gunichar *ucs4_temp; -- 2.13.5