From 917d2d3421f849d1c7f6a181761e12197dd4d8c8 Mon Sep 17 00:00:00 2001
From: Philip Withnall
Date: Sat, 24 Sep 2011 14:19:54 +0100
Subject: [PATCH] unicode: Add support for compatibility decomposition

Add a new decompose_fully vfunc to the Unicode function table which decomposes
a single codepoint in NFKD mode. This includes GLib, ICU and nil implementations
of the vfunc, plus a selection of tests.

Helps: fdo#41095
---
 src/hb-glib.cc            |   16 ++++++++++++++++
 src/hb-icu.cc             |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/hb-unicode-private.hh |    1 +
 src/hb-unicode.cc         |   22 ++++++++++++++++++++++
 src/hb-unicode.h          |   39 +++++++++++++++++++++++++++++++++++++++
 test/test-unicode.c       |   38 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 166 insertions(+), 0 deletions(-)

diff --git a/src/hb-glib.cc b/src/hb-glib.cc
index f990988..c48e09b 100644
--- a/src/hb-glib.cc
+++ b/src/hb-glib.cc
@@ -327,6 +327,22 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   return ret;
 }
 
+/* GLib implementation of the decompose_fully vfunc (NFKD of one codepoint). */
+static unsigned int
+hb_glib_unicode_decompose_fully (hb_unicode_funcs_t *ufuncs HB_UNUSED,
+                                 hb_codepoint_t      ab,
+                                 hb_codepoint_t     *decomposed,
+                                 unsigned int        decomposed_len,
+                                 void               *user_data HB_UNUSED)
+{
+#if GLIB_CHECK_VERSION(2,29,12)
+  return g_unichar_fully_decompose (ab, TRUE, decomposed, decomposed_len);
+#else
+  /* If the user doesn't have GLib >= 2.29.12, they don't get to use GLib's
+   * compat decompositions. Defer to whatever the parent funcs provide. */
+  return hb_unicode_decompose_fully (hb_unicode_funcs_get_parent (ufuncs), ab, decomposed, decomposed_len);
+#endif
+}
 
 extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_glib;
 hb_unicode_funcs_t _hb_glib_unicode_funcs = {
diff --git a/src/hb-icu.cc b/src/hb-icu.cc
index 0f5ed1c..1c52c64 100644
--- a/src/hb-icu.cc
+++ b/src/hb-icu.cc
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include <unicode/ustring.h>
 
 
 
@@ -271,6 +272,55 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   return ret;
 }
 
+/* ICU implementation of the decompose_fully vfunc (NFKD of one codepoint). */
+static unsigned int
+hb_icu_unicode_decompose_fully (hb_unicode_funcs_t *ufuncs HB_UNUSED,
+                                hb_codepoint_t      ab,
+                                hb_codepoint_t     *decomposed,
+                                unsigned int        decomposed_len,
+                                void               *user_data HB_UNUSED)
+{
+  UChar utf16[2], normalized[20];
+  UChar32 utf32[20]; /* one slot per UTF-16 unit of normalized[] is always enough */
+  int32_t len, utf32_len;
+  unsigned int i;
+  hb_bool_t err;
+  UErrorCode icu_err;
+
+  /* Copy @ab into a UTF-16 array to be passed to ICU. */
+  len = 0;
+  err = FALSE;
+  U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
+  if (err)
+    goto error;
+
+  /* Normalise the codepoint using NFKD mode. Only real failures abort here:
+   * ICU warning codes are non-zero too, so test with U_FAILURE(). */
+  icu_err = U_ZERO_ERROR;
+  len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
+  if (U_FAILURE (icu_err))
+    goto error;
+
+  /* Convert the decomposed form from UTF-16 to UTF-32 in a local buffer, so
+   * that a short @decomposed is not reported as a conversion failure. */
+  icu_err = U_ZERO_ERROR;
+  u_strToUTF32 (utf32, ARRAY_LENGTH (utf32), &utf32_len, normalized, len, &icu_err);
+  if (U_FAILURE (icu_err))
+    goto error;
+
+  /* Write at most @decomposed_len codepoints, but report the full length. */
+  for (i = 0; i < (unsigned int) utf32_len && i < decomposed_len; i++)
+    decomposed[i] = (hb_codepoint_t) utf32[i];
+
+  return (unsigned int) utf32_len;
+
+error:
+  /* Pretend the codepoint has no compat decomposition. */
+  if (decomposed_len >= 1)
+    decomposed[0] = ab;
+  return 1;
+}
+
 extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_icu;
 hb_unicode_funcs_t _hb_icu_unicode_funcs = {
   HB_OBJECT_HEADER_STATIC,
diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh
index 2ad8a49..e401519 100644
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@@ -50,6 +50,7 @@
   HB_UNICODE_FUNC_IMPLEMENT (script) \
   HB_UNICODE_FUNC_IMPLEMENT (compose) \
   HB_UNICODE_FUNC_IMPLEMENT (decompose) \
+  HB_UNICODE_FUNC_IMPLEMENT (decompose_fully) \
   /* ^--- Add new callbacks here */
 
 /* Simple callbacks are those taking a hb_codepoint_t and returning a hb_codepoint_t */
diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc
index 4b285c5..d17d724 100644
--- a/src/hb-unicode.cc
+++ b/src/hb-unicode.cc
@@ -100,6 +100,20 @@ hb_unicode_decompose_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   return FALSE;
 }
 
+/* Nil implementation: every codepoint is reported as its own decomposition. */
+static unsigned int
+hb_unicode_decompose_fully_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED,
+                                hb_codepoint_t      ab,
+                                hb_codepoint_t     *decomposed,
+                                unsigned int        decomposed_len,
+                                void               *user_data HB_UNUSED)
+{
+  if (decomposed_len >= 1)
+    decomposed[0] = ab;
+
+  return 1;
+}
+
 
 hb_unicode_funcs_t _hb_unicode_funcs_nil = {
   HB_OBJECT_HEADER_STATIC,
@@ -271,3 +285,11 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs,
   return ufuncs->func.decompose (ufuncs, ab, a, b, ufuncs->user_data.decompose);
 }
 
+unsigned int
+hb_unicode_decompose_fully (hb_unicode_funcs_t *ufuncs,
+                            hb_codepoint_t      ab,
+                            hb_codepoint_t     *decomposed,
+                            unsigned int        decomposed_len)
+{
+  return ufuncs->func.decompose_fully (ufuncs, ab, decomposed, decomposed_len, ufuncs->user_data.decompose_fully);
+}
diff --git a/src/hb-unicode.h b/src/hb-unicode.h
index 13886df..ec748f3 100644
--- a/src/hb-unicode.h
+++ b/src/hb-unicode.h
@@ -118,6 +118,35 @@ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs,
                                                   hb_codepoint_t *b,
                                                   void *user_data);
 
+/**
+ * hb_unicode_decompose_fully_func_t:
+ * @ufuncs: Unicode function structure
+ * @ab: codepoint to decompose
+ * @decomposed: address of codepoint array to write decomposition into
+ * @decomposed_len: allocated length of @decomposed
+ * @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_fully_func()
+ *
+ * Fully decompose @ab to its compatibility decomposition. At most @decomposed_len codepoints of the decomposition will be written to @decomposed.
+ * The complete length of the decomposition will be returned, regardless of how many codepoints have been written to @decomposed. i.e. if the return
+ * value is less than or equal to @decomposed_len, @decomposed holds the full decomposition sequence. Otherwise @decomposed holds a prefix of the full
+ * decomposed sequence.
+ *
+ * If @ab has no compatibility decomposition, the codepoint is copied into @decomposed and 1 is returned.
+ *
+ * The Unicode standard guarantees that a @decomposed_len of %HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN will always be sufficient for any compatibility
+ * decomposition.
+ *
+ * Return value: number of codepoints in the full compatibility decomposition of @ab
+ */
+typedef unsigned int (*hb_unicode_decompose_fully_func_t) (hb_unicode_funcs_t *ufuncs,
+                                                           hb_codepoint_t      ab,
+                                                           hb_codepoint_t     *decomposed,
+                                                           unsigned int        decomposed_len,
+                                                           void               *user_data);
+
+/* See http://unicode.org/reports/tr15/ for details. */
+#define HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN 18 /* codepoints */
+
 /* setters */
 
 void
@@ -155,6 +184,10 @@ hb_unicode_funcs_set_decompose_func (hb_unicode_funcs_t *ufuncs,
                                      hb_unicode_decompose_func_t decompose_func,
                                      void *user_data, hb_destroy_func_t destroy);
 
+void
+hb_unicode_funcs_set_decompose_fully_func (hb_unicode_funcs_t *ufuncs,
+                                           hb_unicode_decompose_fully_func_t decompose_fully_func,
+                                           void *user_data, hb_destroy_func_t destroy);
 
 /* accessors */
 
@@ -189,6 +222,12 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs,
                       hb_codepoint_t *a,
                       hb_codepoint_t *b);
 
+unsigned int
+hb_unicode_decompose_fully (hb_unicode_funcs_t *ufuncs,
+                            hb_codepoint_t      ab,
+                            hb_codepoint_t     *decomposed,
+                            unsigned int        decomposed_len);
+
 HB_END_DECLS
 
 #endif /* HB_UNICODE_H */
diff --git a/test/test-unicode.c b/test/test-unicode.c
index 3482a05..1379e71 100644
--- a/test/test-unicode.c
+++ b/test/test-unicode.c
@@ -787,6 +787,7 @@ test_unicode_normalization (gconstpointer user_data)
 {
   hb_unicode_funcs_t *uf = (hb_unicode_funcs_t *) user_data;
   gunichar a, b, ab;
+  hb_codepoint_t decomposed[HB_UNICODE_MAX_COMPAT_DECOMPOSITION_LEN];
 
 
   /* Test compose() */
@@ -849,6 +850,43 @@ test_unicode_normalization (gconstpointer user_data)
 
   g_assert (hb_unicode_decompose (uf, 0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8);
   g_assert (hb_unicode_decompose (uf, 0xCE20, &a, &b) && a == 0x110E && b == 0x1173);
+
+  /* Test decompose_fully() */
+
+  /* Not decomposable */
+  g_assert (hb_unicode_decompose_fully (uf, 0x0041, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0041);
+  g_assert (hb_unicode_decompose_fully (uf, 0x1F632, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x1F632);
+
+  /* Singletons */
+  g_assert (hb_unicode_decompose_fully (uf, 0x00B5, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x03BC);
+  g_assert (hb_unicode_decompose_fully (uf, 0x03D6, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x03C0);
+
+  /* Spaces */
+  g_assert (hb_unicode_decompose_fully (uf, 0x2002, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2003, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2004, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2005, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2006, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2008, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2009, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+  g_assert (hb_unicode_decompose_fully (uf, 0x200A, decomposed, G_N_ELEMENTS (decomposed)) == 1 && decomposed[0] == 0x0020);
+
+  /* Pairs */
+  g_assert (hb_unicode_decompose_fully (uf, 0x0587, decomposed, G_N_ELEMENTS (decomposed)) == 2 && decomposed[0] == 0x0565 && decomposed[1] == 0x0582);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2017, decomposed, G_N_ELEMENTS (decomposed)) == 2 && decomposed[0] == 0x0020 && decomposed[1] == 0x0333);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2025, decomposed, G_N_ELEMENTS (decomposed)) == 2 && decomposed[0] == 0x002E && decomposed[1] == 0x002E);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2033, decomposed, G_N_ELEMENTS (decomposed)) == 2 && decomposed[0] == 0x2032 && decomposed[1] == 0x2032);
+
+  /* Triples */
+  g_assert (hb_unicode_decompose_fully (uf, 0x2026, decomposed, G_N_ELEMENTS (decomposed)) == 3 &&
+            decomposed[0] == 0x002E && decomposed[1] == 0x002E && decomposed[2] == 0x002E);
+  g_assert (hb_unicode_decompose_fully (uf, 0x2034, decomposed, G_N_ELEMENTS (decomposed)) == 3 &&
+            decomposed[0] == 0x2032 && decomposed[1] == 0x2032 && decomposed[2] == 0x2032);
+  g_assert (hb_unicode_decompose_fully (uf, 0x213B, decomposed, G_N_ELEMENTS (decomposed)) == 3 &&
+            decomposed[0] == 0x0046 && decomposed[1] == 0x0041 && decomposed[2] == 0x0058);
+
+  /* Truncated output: the full length is still returned and a prefix is written */
+  g_assert (hb_unicode_decompose_fully (uf, 0x2026, decomposed, 1) == 3 && decomposed[0] == 0x002E);
 }
 
 
-- 
1.7.6.2