From 58aec06968f134a2a2ef36c13bae0a2029b9a7c9 Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Sat, 24 Sep 2011 14:19:54 +0100 Subject: [PATCH] unicode: Add support for compatibility decomposition Add a new decompose_compatibility vfunc to the Unicode function table which decomposes a single codepoint in NFKD mode. This includes GLib, ICU and nil implementations of the vfunc, plus a selection of tests. Helps: fdo#41095 --- src/hb-glib.cc | 33 +++++++++++++++++++++++++++++++++ src/hb-icu.cc | 41 +++++++++++++++++++++++++++++++++++++++++ src/hb-unicode-private.hh | 1 + src/hb-unicode.cc | 19 +++++++++++++++++++ src/hb-unicode.h | 34 ++++++++++++++++++++++++++++++++++ test/test-unicode.c | 39 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 167 insertions(+), 0 deletions(-) diff --git a/src/hb-glib.cc b/src/hb-glib.cc index f990988..2b91c96 100644 --- a/src/hb-glib.cc +++ b/src/hb-glib.cc @@ -327,6 +327,39 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, return ret; } +static unsigned int +hb_glib_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ +#if GLIB_CHECK_VERSION(2,29,12) + return g_unichar_fully_decompose (ab, TRUE, decomposed, HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN); +#endif + + /* If the user doesn't have GLib >= 2.29.12 we have to perform + * a round trip to UTF-8 and the associated memory management dance. */ + gchar utf8[6]; + gchar *utf8_decomposed, *c; + gsize utf8_len, utf8_decomposed_len, i; + + /* Convert @ab to UTF-8 and normalise it in NFKD mode. This performs the compatibility decomposition. */ + utf8_len = g_unichar_to_utf8 (ab, utf8); + utf8_decomposed = g_utf8_normalize (utf8, utf8_len, G_NORMALIZE_NFKD); + utf8_decomposed_len = g_utf8_strlen (utf8_decomposed, -1); + + /* Don't overrun the output buffer. 
*/ + assert (utf8_decomposed_len <= HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN); + + /* Copy the decomposed UTF-8 string into our output buffer, @decomposed. */ + for (i = 0, c = utf8_decomposed; i < utf8_decomposed_len; i++, c = g_utf8_next_char (c)) { + *decomposed++ = g_utf8_get_char (c); + } + + g_free (utf8_decomposed); + + return utf8_decomposed_len; +} extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_glib; hb_unicode_funcs_t _hb_glib_unicode_funcs = { diff --git a/src/hb-icu.cc b/src/hb-icu.cc index 0f5ed1c..a749d73 100644 --- a/src/hb-icu.cc +++ b/src/hb-icu.cc @@ -37,6 +37,7 @@ #include #include #include +#include @@ -271,6 +272,46 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, return ret; } +static unsigned int +hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ + UChar utf16[2], normalized[2 * HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN + 1]; /* worst case: a surrogate pair per codepoint, plus NUL */ + int32_t len; /* ICU string lengths are int32_t; no GLib types in the ICU backend */ + int32_t utf32_len; + hb_bool_t err; + UErrorCode icu_err; + + /* Copy @ab into a UTF-16 array to be passed to ICU. */ + len = 0; + err = FALSE; + U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); + if (err) + goto error; + + /* Normalise the codepoint using NFKD mode. ICU warning codes are non-zero but harmless, so test with U_FAILURE(). */ + icu_err = U_ZERO_ERROR; + len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); + if (U_FAILURE (icu_err)) + goto error; + + /* Convert the decomposed form from UTF-16 to UTF-32. A result that exactly fills @decomposed only raises a not-terminated warning. */ + icu_err = U_ZERO_ERROR; + u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); + if (U_FAILURE (icu_err)) + goto error; + + return utf32_len; + +error: + /* Pretend the codepoint has no compat decomposition. 
*/ + decomposed[0] = ab; + + return 1; +} + extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_icu; hb_unicode_funcs_t _hb_icu_unicode_funcs = { HB_OBJECT_HEADER_STATIC, diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh index 2ad8a49..a53720b 100644 --- a/src/hb-unicode-private.hh +++ b/src/hb-unicode-private.hh @@ -50,6 +50,7 @@ HB_UNICODE_FUNC_IMPLEMENT (script) \ HB_UNICODE_FUNC_IMPLEMENT (compose) \ HB_UNICODE_FUNC_IMPLEMENT (decompose) \ + HB_UNICODE_FUNC_IMPLEMENT (decompose_compatibility) \ /* ^--- Add new callbacks here */ /* Simple callbacks are those taking a hb_codepoint_t and returning a hb_codepoint_t */ diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc index 4b285c5..58a16ea 100644 --- a/src/hb-unicode.cc +++ b/src/hb-unicode.cc @@ -100,6 +100,18 @@ hb_unicode_decompose_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED, return FALSE; } +static unsigned int +hb_unicode_decompose_compatibility_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ + /* We require that decomposed has enough space for this. 
*/ + decomposed[0] = ab; + + return 1; +} + hb_unicode_funcs_t _hb_unicode_funcs_nil = { HB_OBJECT_HEADER_STATIC, @@ -271,3 +283,10 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, return ufuncs->func.decompose (ufuncs, ab, a, b, ufuncs->user_data.decompose); } +unsigned int +hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed) +{ + return ufuncs->func.decompose_compatibility (ufuncs, ab, decomposed, ufuncs->user_data.decompose_compatibility); +} diff --git a/src/hb-unicode.h b/src/hb-unicode.h index 13886df..83ba94e 100644 --- a/src/hb-unicode.h +++ b/src/hb-unicode.h @@ -118,6 +118,31 @@ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *b, void *user_data); +/** + * hb_unicode_decompose_compatibility_func_t: + * @ufuncs: Unicode function structure + * @ab: codepoint to decompose + * @decomposed: address of codepoint array (of length at least %HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN) to write decomposition into + * @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_compatibility_func() + * + * Fully decompose @ab to its compatibility decomposition. The codepoints of the decomposition will be written to @decomposed. + * The complete length of the decomposition will be returned. + * + * If @ab has no compatibility decomposition, the codepoint is copied into @decomposed and 1 is returned. + * + * The Unicode standard guarantees that a buffer of length %HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN codepoints will always be sufficient for any + * compatibility decomposition. Consequently, @decomposed must be allocated by the caller to be at least this length. 
+ * + * Return value: number of codepoints in the full compatibility decomposition of @ab + */ +typedef unsigned int (*hb_unicode_decompose_compatibility_func_t) (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data); + +/* See http://unicode.org/reports/tr15/ for details. */ +#define HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN 18 /* codepoints */ + /* setters */ void @@ -155,6 +180,10 @@ hb_unicode_funcs_set_decompose_func (hb_unicode_funcs_t *ufuncs, hb_unicode_decompose_func_t decompose_func, void *user_data, hb_destroy_func_t destroy); +void +hb_unicode_funcs_set_decompose_compatibility_func (hb_unicode_funcs_t *ufuncs, + hb_unicode_decompose_compatibility_func_t decompose_compatibility_func, + void *user_data, hb_destroy_func_t destroy); /* accessors */ @@ -189,6 +218,11 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *a, hb_codepoint_t *b); +unsigned int +hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed); + HB_END_DECLS #endif /* HB_UNICODE_H */ diff --git a/test/test-unicode.c b/test/test-unicode.c index 3482a05..7b39160 100644 --- a/test/test-unicode.c +++ b/test/test-unicode.c @@ -787,6 +787,7 @@ test_unicode_normalization (gconstpointer user_data) { hb_unicode_funcs_t *uf = (hb_unicode_funcs_t *) user_data; gunichar a, b, ab; + hb_codepoint_t decomposed[HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN]; /* Test compose() */ @@ -849,6 +850,44 @@ test_unicode_normalization (gconstpointer user_data) g_assert (hb_unicode_decompose (uf, 0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8); g_assert (hb_unicode_decompose (uf, 0xCE20, &a, &b) && a == 0x110E && b == 0x1173); + + /* Test decompose_compatibility() */ + + /* Not decomposable */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x0041, decomposed) == 1 && decomposed[0] == 0x0041); + g_assert (hb_unicode_decompose_compatibility (uf, 0x1F632, decomposed) == 1 && decomposed[0] == 0x1F632); + + 
/* Singletons */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x00B5, decomposed) == 1 && decomposed[0] == 0x03BC); + g_assert (hb_unicode_decompose_compatibility (uf, 0x03D6, decomposed) == 1 && decomposed[0] == 0x03C0); + + /* Spaces */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x2002, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2003, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2004, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2005, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2006, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2008, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2009, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x200A, decomposed) == 1 && decomposed[0] == 0x0020); + + /* Pairs */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x0587, decomposed) == 2 && + decomposed[0] == 0x0565 && decomposed[1] == 0x0582); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2017, decomposed) == 2 && + decomposed[0] == 0x0020 && decomposed[1] == 0x0333); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2025, decomposed) == 2 && + decomposed[0] == 0x002E && decomposed[1] == 0x002E); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2033, decomposed) == 2 && + decomposed[0] == 0x2032 && decomposed[1] == 0x2032); + + /* Triples */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x2026, decomposed) == 3 && + decomposed[0] == 0x002E && decomposed[1] == 0x002E && decomposed[2] == 0x002E); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2034, decomposed) == 3 && + decomposed[0] == 0x2032 && decomposed[1] == 0x2032 && decomposed[2] == 0x2032); + 
g_assert (hb_unicode_decompose_compatibility (uf, 0x213B, decomposed) == 3 && + decomposed[0] == 0x0046 && decomposed[1] == 0x0041 && decomposed[2] == 0x0058); } -- 1.7.6.2