From 3649437736d41e474efc8c6bde23e93e8fdf5236 Mon Sep 17 00:00:00 2001 From: Adrian Johnson Date: Sun, 12 Nov 2017 10:33:07 +1030 Subject: [PATCH 4/4] Support unicode on windows console The Win32Console should be used in programs that require unicode support for command line arguments and stdio ouput on windows. On windows it gets the command line arguments from GetCommandLineW and converts to UTF-8, and redefines the stdio output functions to convert UTF-8 to calls to WriteConsoleW. On other platforms this class is a no-op. --- poppler/PDFDoc.cc | 9 +- poppler/UTF.cc | 287 ++++++++++++++++++++++++++++++++++++- poppler/UTF.h | 39 +++++ qt5/tests/CMakeLists.txt | 1 + qt5/tests/check_utf_conversion.cpp | 87 +++++++++++ utils/CMakeLists.txt | 1 + utils/Win32Console.cc | 167 +++++++++++++++++++++ utils/Win32Console.h | 63 ++++++++ utils/pdfdetach.cc | 2 + utils/pdffonts.cc | 2 + utils/pdfimages.cc | 2 + utils/pdfinfo.cc | 2 + utils/pdfsig.cc | 2 + utils/pdftocairo.cc | 2 + utils/pdftohtml.cc | 2 + utils/pdftoppm.cc | 2 + utils/pdftops.cc | 2 + utils/pdftotext.cc | 2 + 18 files changed, 672 insertions(+), 2 deletions(-) create mode 100644 qt5/tests/check_utf_conversion.cpp create mode 100644 utils/Win32Console.cc create mode 100644 utils/Win32Console.h diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc index 09ee0a21..147d1f45 100644 --- a/poppler/PDFDoc.cc +++ b/poppler/PDFDoc.cc @@ -81,6 +81,7 @@ #endif #include "PDFDoc.h" #include "Hints.h" +#include "UTF.h" #ifdef MULTITHREADED # define pdfdocLocker() MutexLocker locker(&mutex) @@ -152,7 +153,13 @@ PDFDoc::PDFDoc(GooString *fileNameA, GooString *ownerPassword, #endif // try to open file - file = GooFile::open(fileName); +#ifdef _WIN32 + wchar_t *wFileName = (wchar_t*)utf8ToUtf16(fileName->getCString()); + file = GooFile::open(wFileName); + gfree(wFileName); +#else + file = GooFile::open(fileName); +#endif if (file == NULL) { // fopen() has failed. // Keep a copy of the errno returned by fopen so that it can be diff --git a/poppler/UTF.cc b/poppler/UTF.cc index f7b02d14..3eb1f422 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -1,6 +1,6 @@ //======================================================================== // -// UTF.h +// UTF.cc // // Copyright 2001-2003 Glyph & Cog, LLC // @@ -27,6 +27,7 @@ #include "goo/gmem.h" #include "PDFDocEncoding.h" #include "UTF.h" +#include "UnicodeMapFuncs.h" #include bool UnicodeIsValid(Unicode ucs4) @@ -130,3 +131,287 @@ bool UnicodeIsWhitespace(Unicode ucs4) Unicode const *i = std::lower_bound(spaces, end, ucs4); return (i != end && *i == ucs4); } + +// +// decodeUtf8() and decodeUtf8Table are: +// +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +// +static const uint32_t UTF8_ACCEPT = 0; +static const uint32_t UTF8_REJECT = 12; +static const uint32_t UCS4_MAX = 0x10FFFF; +static const Unicode REPLACEMENT_CHAR = 0xFFFD; + +static const uint8_t decodeUtf8Table[] = { + // The first part of the table maps bytes to character classes + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +// Decode utf8 state machine for fast UTF-8 decoding. Initialise state +// to 0 and call decodeUtf8() for each byte of UTF-8. Return value +// (and state) is UTF8_ACCEPT when it has found a valid codepoint +// (codepoint returned in codep), UTF8_REJECT when the byte is not +// allowed to occur at its position, and some other positive value if +// more bytes have to be read. Reset state to 0 to recover from +// errors. +inline uint32_t decodeUtf8(uint32_t* state, uint32_t* codep, char byte) +{ + uint32_t b = (unsigned char)byte; + uint32_t type = decodeUtf8Table[b]; + + *codep = (*state != UTF8_ACCEPT) ? + (b & 0x3fu) | (*codep << 6) : + (0xff >> type) & (b); + + *state = decodeUtf8Table[256 + *state + type]; + return *state; +} + +// Count number of UTF-16 chars required to convert a UTF8 +// string. Each invalid byte is counted as a code point since the UTF8 +// conversion functions will replace it with REPLACEMENT_CHAR. +int utf8CountUtf16Chars(const char *utf8) +{ + uint32_t codepoint; + uint32_t state = 0; + int count = 0; + + while (*utf8) { + decodeUtf8(&state, &codepoint, *utf8); + if (state == UTF8_ACCEPT) { + if (codepoint < 0x10000) + count++; + else if (codepoint <= UCS4_MAX) + count += 2; + else + count++; // replace with REPLACEMENT_CHAR + } else if (state == UTF8_REJECT) { + count++; // replace with REPLACEMENT_CHAR + state = 0; + } + utf8++; + } + if (state != UTF8_ACCEPT && state != UTF8_REJECT) + count++; // replace with REPLACEMENT_CHAR + + return count; +} + + +// Convert UTF-8 to UTF-16 +// utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num +// bytes to convert +// utf16 - output buffer to write UTF-16 to. Output will always be null terminated. +// maxUtf16 - maximum size of output buffer including space for null. +// maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when +// either this count is reached or a null is encountered. +// Returns number of UTF-16 code units written (excluding NULL). +int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8) +{ + uint16_t *p = utf16; + uint32_t codepoint; + uint32_t state = 0; + int nIn = 0; + int nOut = 0; + while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) { + decodeUtf8(&state, &codepoint, *utf8); + if (state == UTF8_ACCEPT) { + if (codepoint < 0x10000) { + *p++ = (uint16_t)codepoint; + nOut++; + } else if (codepoint <= UCS4_MAX) { + *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10)); + *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF)); + nOut += 2; + } else { + *p++ = REPLACEMENT_CHAR; + nOut++; + state = 0; + } + } else if (state == UTF8_REJECT) { + *p++ = REPLACEMENT_CHAR; // invalid byte for this position + nOut++; + } + utf8++; + nIn++; + } + // replace any trailing bytes too short for a valid UTF-8 with a replacement char + if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) { + *p++ = REPLACEMENT_CHAR; + nOut++; + } + if (nOut > maxUtf16 - 1) + nOut = maxUtf16 - 1; + utf16[nOut] = 0; + return nOut; +} + +// Allocate utf16 string and convert utf8 into it. +uint16_t *utf8ToUtf16(const char *utf8, int *len) +{ + int n = utf8CountUtf16Chars(utf8); + if (len) + *len = n; + uint16_t *utf16 = (uint16_t*)gmallocn(n + 1, sizeof(uint16_t)); + utf8ToUtf16(utf8, utf16); + return utf16; +} + +static const uint32_t UTF16_ACCEPT = 0; +static const uint32_t UTF16_REJECT = -1; + +// Initialise state to 0. Returns UTF16_ACCEPT when a valid code point +// has been found, UTF16_REJECT when invalid code unit for this state, +// some other valid if another code unit needs to be read. +inline uint32_t decodeUtf16(uint32_t* state, uint32_t* codePoint, uint16_t codeUnit) +{ + if (*state == 0) { + if (codeUnit >= 0xd800 && codeUnit < 0xdc00) { /* surrogate pair */ + *state = codeUnit; + return *state; + } else if (codeUnit >= 0xdc00 && codeUnit < 0xe000) { + /* invalid low surrogate */ + return UTF16_REJECT; + } else { + *codePoint = codeUnit; + return UTF16_ACCEPT; + } + } else { + if (codeUnit >= 0xdc00 && codeUnit < 0xe000) { + *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000; + *state = 0; + return UTF16_ACCEPT; + } else { + /* invalid high surrogate */ + return UTF16_REJECT; + } + } +} + +// Count number of UTF-16 chars required to convert a UTF8 +// string. Each invalid byte is counted as a code point since the UTF8 +// conversion functions will replace it with REPLACEMENT_CHAR. +int utf16CountUtf8Chars(const uint16_t *utf16) +{ + uint32_t codepoint; + uint32_t state = 0; + int count = 0; + + while (*utf16) { + decodeUtf16(&state, &codepoint, *utf16); + if (state == UTF16_ACCEPT) { + if (codepoint < 0x80) + count++; + else if (codepoint < 0x800) + count += 2; + else if (codepoint < 0x10000) + count += 3; + else if (codepoint <= UCS4_MAX) + count += 4; + else + count += 3; // replace with REPLACEMENT_CHAR + } else if (state == UTF16_REJECT) { + count += 3; // replace with REPLACEMENT_CHAR + state = 0; + } + utf16++; + } + if (state != UTF8_ACCEPT && state != UTF8_REJECT) + count++; // replace with REPLACEMENT_CHAR + + return count; +} + +// Convert UTF-16 to UTF-8 +// utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num +// code units to convert +// utf8 - output buffer to write UTF-8 to. Output will always be null terminated. +// maxUtf8 - maximum size of output buffer including space for null. +// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when +// either this count is reached or a null is encountered. +// Returns number of UTF-8 bytes written (excluding NULL). +int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16) +{ + uint32_t codepoint = 0; + uint32_t state = 0; + int nIn = 0; + int nOut = 0; + char *p = utf8; + while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) { + decodeUtf16(&state, &codepoint, *utf16); + if (state == UTF16_ACCEPT || state == UTF16_REJECT) { + if (state == UTF16_REJECT || codepoint > UCS4_MAX) { + codepoint = REPLACEMENT_CHAR; + state = 0; + } + + int bufSize = maxUtf8 - nOut; + int count = mapUTF8(codepoint, p, bufSize); + p += count; + nOut += count; + } + utf16++; + nIn++; + } + // replace any trailing bytes too short for a valid UTF-8 with a replacement char + if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) { + int bufSize = maxUtf8 - nOut; + int count = mapUTF8(REPLACEMENT_CHAR, p, bufSize); + p += count; + nOut += count; + nOut++; + } + if (nOut > maxUtf8 - 1) + nOut = maxUtf8 - 1; + utf8[nOut] = 0; + return nOut; +} + +// Allocate utf8 string and convert utf16 into it. +char *utf16ToUtf8(const uint16_t *utf16, int *len) +{ + int n = utf16CountUtf8Chars(utf16); + if (len) + *len = n; + char *utf8 = (char*)gmalloc(n + 1); + utf16ToUtf8(utf16, utf8); + return utf8; +} diff --git a/poppler/UTF.h b/poppler/UTF.h index c82e165a..b22b8aa6 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -16,6 +16,9 @@ #pragma implementation #endif +#include +#include + #include "goo/GooString.h" #include "CharTypes.h" @@ -39,4 +42,40 @@ bool UnicodeIsValid(Unicode ucs4); // is a unicode whitespace character bool UnicodeIsWhitespace(Unicode ucs4); +// Count number of UTF-16 chars required to convert a UTF8 +// string. Each invalid byte is counted as a code point since the UTF8 +// conversion functions will replace it with REPLACEMENT_CHAR. +int utf8CountUtf16Chars(const char *utf8); + +// Convert UTF-8 to UTF-16 +// utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num +// bytes to convert +// utf16 - output buffer to write UTF-16 to. Output will always be null terminated. +// maxUtf16 - maximum size of output buffer including space for null. +// maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when +// either this count is reached or a null is encountered. +// Returns number of UTF-16 code units written (excluding NULL). +int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16 = INT_MAX, int maxUtf8 = INT_MAX); + +// Allocate utf16 string and convert utf8 into it. +uint16_t *utf8ToUtf16(const char *utf8, int *len = nullptr); + +// Count number of UTF-16 chars required to convert a UTF8 +// string. Each invalid byte is counted as a code point since the UTF8 +// conversion functions will replace it with REPLACEMENT_CHAR. +int utf16CountUtf8Chars(const uint16_t *utf16); + +// Convert UTF-16 to UTF-8 +// utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num +// code units to convert +// utf8 - output buffer to write UTF-8 to. Output will always be null terminated. +// maxUtf8 - maximum size of output buffer including space for null. +// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when +// either this count is reached or a null is encountered. +// Returns number of UTF-8 bytes written (excluding NULL). +int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX); + +// Allocate utf8 string and convert utf16 into it. +char *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr); + #endif diff --git a/qt5/tests/CMakeLists.txt b/qt5/tests/CMakeLists.txt index 01a1a970..1e67e6ec 100644 --- a/qt5/tests/CMakeLists.txt +++ b/qt5/tests/CMakeLists.txt @@ -69,6 +69,7 @@ qt5_add_qtest(check_qt5_actualtext check_actualtext.cpp) qt5_add_qtest(check_qt5_lexer check_lexer.cpp) qt5_add_qtest(check_qt5_pagelabelinfo check_pagelabelinfo.cpp) qt5_add_qtest(check_qt5_goostring check_goostring.cpp) +qt5_add_qtest(check_qt5_utf_conversion check_utf_conversion.cpp) if (NOT WIN32) qt5_add_qtest(check_qt5_strings check_strings.cpp) endif () diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp new file mode 100644 index 00000000..890b91ea --- /dev/null +++ b/qt5/tests/check_utf_conversion.cpp @@ -0,0 +1,87 @@ +#include +#include + +#include +#include "UTF.h" + +class TestUTFConversion : public QObject +{ + Q_OBJECT +private slots: + void testUTF_data(); + void testUTF(); +}; + +static bool compare(const char *a, const char *b) +{ + return strcmp(a,b) == 0; +} + +static bool compare(const uint16_t *a, const uint16_t *b) +{ + while (*a && *b) { + if (*a++ != *b++) + return false; + } + return *a == *b; +} + +void TestUTFConversion::testUTF_data() +{ + QTest::addColumn("s"); + + QTest::newRow("") << QString::fromUtf8(""); + QTest::newRow("a") << QString::fromUtf8("a"); + QTest::newRow("abc") << QString::fromUtf8("abc"); + QTest::newRow("Latin") << QString::fromUtf8("Vitrum edere possum; mihi non nocet"); + QTest::newRow("Greek") << QString::fromUtf8("Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα"); + QTest::newRow("Icelandic") << QString::fromUtf8("Ég get etið gler án þess að meiða mig"); + QTest::newRow("Russian") << QString::fromUtf8("Я могу есть стекло, оно мне не вредит."); + QTest::newRow("Sanskrit") << QString::fromUtf8("काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥"); + QTest::newRow("Arabic") << QString::fromUtf8("أنا قادر على أكل الزجاج و هذا لا يؤلمني"); + QTest::newRow("Chinese") << QString::fromUtf8("我能吞下玻璃而不伤身体。"); + QTest::newRow("Thai") << QString::fromUtf8("ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ"); + QTest::newRow("non BMP") << QString::fromUtf8("𝓹𝓸𝓹𝓹𝓵𝓮𝓻"); + } + +void TestUTFConversion::testUTF() +{ + char utf8Buf[100]; + char *utf8String; + uint16_t utf16Buf[100]; + uint16_t *utf16String; + int len; + + QFETCH(QString, s); + char *str = strdup(s.toUtf8().constData()); + + // UTF-8 to UTF-16 + + // QString size() returns number of code units, not code points + QCOMPARE( utf8CountUtf16Chars(str), s.size() ); + + len = utf8ToUtf16(str, utf16Buf); + QVERIFY( compare(utf16Buf, s.utf16()) ); + QCOMPARE( len, s.size() ); + + utf16String = utf8ToUtf16(str); + QVERIFY( compare(utf16String, s.utf16()) ); + free (utf16String); + + // UTF-16 to UTF-8 + + QCOMPARE( utf16CountUtf8Chars(s.utf16()), (int)strlen(str) ); + + len = utf16ToUtf8(s.utf16(), utf8Buf); + QVERIFY( compare(utf8Buf, str) ); + QCOMPARE( len, (int)strlen(str) ); + + utf8String = utf16ToUtf8(s.utf16() ); + QVERIFY( compare(utf8String, str) ); + free (utf8String); + + free(str); +} + +QTEST_GUILESS_MAIN(TestUTFConversion) +#include "check_utf_conversion.moc" diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 07d73651..e86dc7c1 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -1,6 +1,7 @@ set(common_srcs parseargs.cc + Win32Console.cc ) set(common_libs poppler diff --git a/utils/Win32Console.cc b/utils/Win32Console.cc new file mode 100644 index 00000000..4db92de6 --- /dev/null +++ b/utils/Win32Console.cc @@ -0,0 +1,167 @@ +//======================================================================== +// +// Win32Console.cc +// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2017 Adrian Johnson +// +// To see a description of the changes please see the Changelog file that +// came with your tarball or type make ChangeLog if you are building from git +// +//======================================================================== + +#ifdef _WIN32 + +#include "goo/gmem.h" +#include "UTF.h" + +#define WIN32_CONSOLE_IMPL +#include "Win32Console.h" + +#include +#include + +static const int BUF_SIZE = 4096; +static int bufLen = 0; +static char buf[BUF_SIZE]; +static wchar_t wbuf[BUF_SIZE]; +static bool stdoutIsConsole = true; +static bool stderrIsConsole = true; +static HANDLE consoleHandle = 0; + +// If all = true, flush all characters to console. +// If all = false, flush up to and including last newline. +// Also flush all if buffer > half full to ensure space for future +// writes. +static void flush(bool all = false) +{ + int nchars = 0; + + if (all || bufLen > BUF_SIZE/2) { + nchars = bufLen; + } else if (bufLen > 0) { + // find num chars up to and including last '\n' + for (nchars = bufLen; nchars > 0; --nchars) { + if (buf[nchars-1] == '\n') + break; + } + } + + if (nchars > 0) { + DWORD wlen = utf8ToUtf16(buf, (uint16_t*)wbuf, BUF_SIZE, nchars); + WriteConsoleW(consoleHandle, wbuf, wlen, &wlen, nullptr); + if (nchars < bufLen) { + memmove(buf, buf + nchars, bufLen - nchars); + bufLen -= nchars; + } else { + bufLen = 0; + } + } +} + +static inline bool streamIsConsole(FILE *stream) +{ + return ((stream == stdout && stdoutIsConsole) || (stream == stderr && stderrIsConsole)); +} + +int win32_fprintf(FILE *stream, ...) +{ + va_list args; + int ret = 0; + + va_start(args, stream); + const char *format = va_arg(args, const char *); + if (streamIsConsole(stream)) { + ret = vsnprintf(buf + bufLen, BUF_SIZE - bufLen, format, args); + bufLen += ret; + if (ret >= BUF_SIZE - bufLen) { + // output was truncated + buf[BUF_SIZE - 1] = 0; + bufLen = BUF_SIZE - 1; + } + flush(); + } else { + vfprintf(stream, format, args); + } + va_end(args); + + return ret; +} + +size_t win32_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = 0; + + if (streamIsConsole(stream)) { + int n = size * nmemb; + if (n > BUF_SIZE - bufLen - 1) + n = BUF_SIZE - bufLen - 1; + memcpy(buf + bufLen, ptr, n); + bufLen += n; + buf[bufLen] = 0; + flush(); + } else { + ret = fwrite(ptr, size, nmemb, stream); + } + + return ret; +} + + +Win32Console::Win32Console(int *argc, char **argv[]) +{ + LPWSTR *wargv; + fpos_t pos; + + argList = nullptr; + privateArgList = nullptr; + wargv = CommandLineToArgvW(GetCommandLineW(), &numArgs); + if (wargv) { + argList = new char*[numArgs]; + privateArgList = new char*[numArgs]; + for (int i = 0; i < numArgs; i++) { + argList[i] = utf16ToUtf8((uint16_t*)(wargv[i])); + // parseArgs will rearrange the argv list so we keep our own copy + // to use for freeing all the strings + privateArgList[i] = argList[i]; + } + LocalFree(wargv); + *argc = numArgs; + *argv = argList; + } + + bufLen = 0; + buf[0] = 0; + wbuf[0] = 0; + + // check if stdout or stderr redirected + // GetFileType() returns CHAR for console and special devices COMx, PRN, CON, NUL etc + // fgetpos() succeeds on all CHAR devices except console and CON. + + stdoutIsConsole = (GetFileType(GetStdHandle(STD_OUTPUT_HANDLE)) == FILE_TYPE_CHAR) + && (fgetpos(stdout, &pos) != 0); + + stderrIsConsole = (GetFileType(GetStdHandle(STD_ERROR_HANDLE)) == FILE_TYPE_CHAR) + && (fgetpos(stderr, &pos) != 0); + + // Need a handle to the console. Doesn't matter if we use stdout or stderr as + // long as the handle output is to the console. + if (stdoutIsConsole) + consoleHandle = GetStdHandle(STD_OUTPUT_HANDLE); + else if (stderrIsConsole) + consoleHandle = GetStdHandle(STD_ERROR_HANDLE); +} + +Win32Console::~Win32Console() +{ + flush(true); + if (argList) { + for (int i = 0; i < numArgs; i++) + gfree(privateArgList[i]); + delete[] argList; + delete[] privateArgList; + } +} + +#endif // _WIN32 diff --git a/utils/Win32Console.h b/utils/Win32Console.h new file mode 100644 index 00000000..46381000 --- /dev/null +++ b/utils/Win32Console.h @@ -0,0 +1,63 @@ +//======================================================================== +// +// Win32Console.h +// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2017 Adrian Johnson +// +// To see a description of the changes please see the Changelog file that +// came with your tarball or type make ChangeLog if you are building from git +// +//======================================================================== + +// UTF-8 Support for win32 console +// +// Converts argc/argv to UTF-8. Supports UTF-8 stdout/stderr to win32 console. +// On other platforms this class is a no-op. + +#ifdef _WIN32 + +// Ensure stdio.h is included before redefining stdio functions. We need to provide +// our own declarations for the redefined functions because win32 stdio.h functions +// have DLL export decorations. +#include + +#ifndef WIN32_CONSOLE_IMPL // don't redefine in Win32Console.cc so we can call original functions +#define printf(...) win32_fprintf(stdout, __VA_ARGS__) +#define fprintf(stream, ...) win32_fprintf(stream, __VA_ARGS__) +#define puts(s) win32_fprintf(stdout, "%s\n", s) +#define fputs(s, stream) win32_fprintf(stream, "%s", s) +#define putc(c) win32_fprintf(stdout, "%c", c) +#define putchar(c) win32_fprintf(stdout, "%c", c) +#define fputc(c, stream) win32_fprintf(stream, "%c", c) +#define fwrite(ptr, size, nmemb, stream) win32_fwrite(ptr, size, nmemb, stream) +#endif + +extern "C" { + int win32_fprintf(FILE *stream, ...); + size_t win32_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); +} + +class Win32Console +{ +public: + Win32Console(int *argc, char **argv[]); + ~Win32Console(); +private: + int numArgs; + char **argList; + char **privateArgList; +}; + +#else + +// On other platforms this class is a no-op. + +class Win32Console +{ +public: + Win32Console(int *argc, char ***argv) {} +}; + +#endif // _WIN32 diff --git a/utils/pdfdetach.cc b/utils/pdfdetach.cc index 5bbdc1e1..a39f817e 100644 --- a/utils/pdfdetach.cc +++ b/utils/pdfdetach.cc @@ -40,6 +40,7 @@ #include "UnicodeMap.h" #include "PDFDocEncoding.h" #include "Error.h" +#include "Win32Console.h" static GBool doList = gFalse; static int saveNum = 0; @@ -99,6 +100,7 @@ int main(int argc, char *argv[]) { Unicode u; GBool isUnicode; + Win32Console win32Console(&argc, &argv); exitCode = 99; // parse args diff --git a/utils/pdffonts.cc b/utils/pdffonts.cc index 535bf8fb..2867d51f 100644 --- a/utils/pdffonts.cc +++ b/utils/pdffonts.cc @@ -39,6 +39,7 @@ #include "PDFDoc.h" #include "PDFDocFactory.h" #include "FontInfo.h" +#include "Win32Console.h" static const char *fontTypeNames[] = { "unknown", @@ -94,6 +95,7 @@ int main(int argc, char *argv[]) { GBool ok; int exitCode; + Win32Console win32Console(&argc, &argv); exitCode = 99; // parse args diff --git a/utils/pdfimages.cc b/utils/pdfimages.cc index d11b0147..525a80ba 100644 --- a/utils/pdfimages.cc +++ b/utils/pdfimages.cc @@ -47,6 +47,7 @@ #include "PDFDocFactory.h" #include "ImageOutputDev.h" #include "Error.h" +#include "Win32Console.h" static int firstPage = 1; static int lastPage = 0; @@ -120,6 +121,7 @@ int main(int argc, char *argv[]) { GBool ok; int exitCode; + Win32Console win32Console(&argc, &argv); exitCode = 99; // parse args diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index b29e97a0..a3099cf6 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -61,6 +61,7 @@ #include "JSInfo.h" #include "StructTreeRoot.h" #include "StructElement.h" +#include "Win32Console.h" static int firstPage = 1; @@ -616,6 +617,7 @@ int main(int argc, char *argv[]) { exitCode = 99; // parse args + Win32Console win32console(&argc, &argv); ok = parseArgs(argDesc, &argc, argv); if (!ok || (argc != 2 && !printEnc) || printVersion || printHelp) { fprintf(stderr, "pdfinfo version %s\n", PACKAGE_VERSION); diff --git a/utils/pdfsig.cc b/utils/pdfsig.cc index eb8acd79..e31048f4 100644 --- a/utils/pdfsig.cc +++ b/utils/pdfsig.cc @@ -29,6 +29,7 @@ #include "Error.h" #include "GlobalParams.h" #include "SignatureInfo.h" +#include "Win32Console.h" static const char * getReadableSigState(SignatureValidationStatus sig_vs) { @@ -116,6 +117,7 @@ int main(int argc, char *argv[]) std::vector sig_widgets; globalParams = new GlobalParams(); + Win32Console win32Console(&argc, &argv); int exitCode = 99; GBool ok; diff --git a/utils/pdftocairo.cc b/utils/pdftocairo.cc index 46f3e1e2..7a5ef981 100644 --- a/utils/pdftocairo.cc +++ b/utils/pdftocairo.cc @@ -55,6 +55,7 @@ #include "PDFDoc.h" #include "PDFDocFactory.h" #include "CairoOutputDev.h" +#include "Win32Console.h" #ifdef USE_CMS #ifdef USE_LCMS1 #include @@ -943,6 +944,7 @@ int main(int argc, char *argv[]) { int num_outputs; // parse args + Win32Console win32Console(&argc, &argv); if (!parseArgs(argDesc, &argc, argv)) { printUsage("pdftocairo", 0, argDesc); exit(99); diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 8e59b8b8..05a5b7e5 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -63,6 +63,7 @@ #include "Error.h" #include "DateInfo.h" #include "goo/gfile.h" +#include "Win32Console.h" static int firstPage = 1; static int lastPage = 0; @@ -190,6 +191,7 @@ int main(int argc, char *argv[]) { Object info; int exit_status = EXIT_FAILURE; + Win32Console win32Console(&argc, &argv); // parse args ok = parseArgs(argDesc, &argc, argv); if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) { diff --git a/utils/pdftoppm.cc b/utils/pdftoppm.cc index 5cd9f53c..5677a1dd 100644 --- a/utils/pdftoppm.cc +++ b/utils/pdftoppm.cc @@ -51,6 +51,7 @@ #include "splash/SplashBitmap.h" #include "splash/Splash.h" #include "SplashOutputDev.h" +#include "Win32Console.h" // Uncomment to build pdftoppm with pthreads // You may also have to change the buildsystem to @@ -394,6 +395,7 @@ int main(int argc, char *argv[]) { int pg, pg_num_len; double pg_w, pg_h, tmp; + Win32Console win32Console(&argc, &argv); exitCode = 99; // parse args diff --git a/utils/pdftops.cc b/utils/pdftops.cc index e415fcae..8dd85ba1 100644 --- a/utils/pdftops.cc +++ b/utils/pdftops.cc @@ -51,6 +51,7 @@ #include "PDFDocFactory.h" #include "PSOutputDev.h" #include "Error.h" +#include "Win32Console.h" static GBool setPSPaperSize(char *size, int &psPaperWidth, int &psPaperHeight) { if (!strcmp(size, "match")) { @@ -218,6 +219,7 @@ int main(int argc, char *argv[]) { GBool rasterAntialias = gFalse; std::vector pages; + Win32Console win32Console(&argc, &argv); exitCode = 99; // parse args diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index ebf9a2b2..5c3eaaa5 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -59,6 +59,7 @@ #include #include #include +#include "Win32Console.h" static void printInfoString(FILE *f, Dict *infoDict, const char *key, const char *text1, const char *text2, UnicodeMap *uMap); @@ -180,6 +181,7 @@ int main(int argc, char *argv[]) { char *p; int exitCode; + Win32Console win32Console(&argc, &argv); exitCode = 99; // parse args -- 2.11.0