From e5a73d5f9302a09a019cfc7ec090e801111aaa5b Mon Sep 17 00:00:00 2001 From: Hans-Peter Deifel Date: Sat, 15 Aug 2015 17:09:42 +0200 Subject: [PATCH] cpp: Fix utf8/utf16 conversion The old code assumed that ustring::size() would return the number of bytes in ustring, but it really returns the number of characters. Since ustring is a basic_string<unsigned short>, these two values differ (by a factor of two). This needs to be considered when using iconv, since it operates on byte counts, not character counts. --- cpp/poppler-global.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/poppler-global.cpp b/cpp/poppler-global.cpp index 525dc99..61d2b0e 100644 --- a/cpp/poppler-global.cpp +++ b/cpp/poppler-global.cpp @@ -225,9 +225,9 @@ byte_array ustring::to_utf8() const return byte_array(); } const value_type *me_data = data(); - byte_array str(size()); + byte_array str(size()*sizeof(value_type)); char *str_data = &str[0]; - size_t me_len_char = size(); + size_t me_len_char = size()*sizeof(value_type); size_t str_len_left = str.size(); size_t ir = iconv(ic, (ICONV_CONST char **)&me_data, &me_len_char, &str_data, &str_len_left); if ((ir == (size_t)-1) && (errno == E2BIG)) { @@ -273,23 +273,24 @@ ustring ustring::from_utf8(const char *str, int len) return ustring(); } - ustring ret(len * 2, 0); + // +1, because iconv inserts byte order marks + ustring ret(len+1, 0); char *ret_data = reinterpret_cast(&ret[0]); char *str_data = const_cast(str); size_t str_len_char = len; - size_t ret_len_left = ret.size(); + size_t ret_len_left = ret.size() * sizeof(ustring::value_type); size_t ir = iconv(ic, (ICONV_CONST char **)&str_data, &str_len_char, &ret_data, &ret_len_left); if ((ir == (size_t)-1) && (errno == E2BIG)) { const size_t delta = ret_data - reinterpret_cast(&ret[0]); - ret_len_left += ret.size(); + ret_len_left += ret.size()*sizeof(ustring::value_type); ret.resize(ret.size() * 2); - ret_data = reinterpret_cast(&ret[delta]); + ret_data = 
reinterpret_cast(&ret[0]) + delta; ir = iconv(ic, (ICONV_CONST char **)&str_data, &str_len_char, &ret_data, &ret_len_left); if (ir == (size_t)-1) { return ustring(); } } - ret.resize(ret.size() - ret_len_left); + ret.resize(ret.size() - ret_len_left/sizeof(ustring::value_type)); return ret; } -- 2.5.0