From b8fc5f77b6ff16118927cc52817947acdee992ac Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Sat, 24 Sep 2011 14:19:54 +0100 Subject: [PATCH] unicode: Add support for compatibility decomposition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new decompose_compatibility vfunc to the Unicode function table which decomposes a single codepoint in NFKD mode. This includes GLib, ICU and nil implementations of the vfunc, plus a selection of tests. Helps: fdo#41095 Bug 41095 — Fall back to Unicode compat decompositions for missing glyphs Extend decompose() in hb-ot-shape-normalize.cc to fall back to the compatibility decomposition of a codepoint if the original glyph or the glyphs for the canonical decomposition don't exist in the font. Notably, this now means that we can basically guarantee to render ellipses (‘…’) correctly, since their compatibility decomposition is three dots (‘...’). Closes: fdo#41095 --- src/hb-glib.cc | 33 +++++++++++++++++++++++++++++++++ src/hb-icu.cc | 41 +++++++++++++++++++++++++++++++++++++++++ src/hb-ot-shape-normalize.cc | 32 +++++++++++++++++++++++++++++--- src/hb-unicode-private.hh | 1 + src/hb-unicode.cc | 19 +++++++++++++++++++ src/hb-unicode.h | 34 ++++++++++++++++++++++++++++++++++ test/test-unicode.c | 39 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 196 insertions(+), 3 deletions(-) diff --git a/src/hb-glib.cc b/src/hb-glib.cc index f990988..6283461 100644 --- a/src/hb-glib.cc +++ b/src/hb-glib.cc @@ -327,6 +327,39 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, return ret; } +static unsigned int +hb_glib_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ +#if GLIB_CHECK_VERSION(2,29,12) + return g_unichar_fully_decompose (ab, TRUE, decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN); +#endif + + /* If the user doesn't have GLib >= 2.29.12 we have to perform + * a round 
trip to UTF-8 and the associated memory management dance. */ + gchar utf8[6]; + gchar *utf8_decomposed, *c; + gsize utf8_len, utf8_decomposed_len, i; + + /* Convert @ab to UTF-8 and normalise it in NFKD mode. This performs the compatibility decomposition. */ + utf8_len = g_unichar_to_utf8 (ab, utf8); + utf8_decomposed = g_utf8_normalize (utf8, utf8_len, G_NORMALIZE_NFKD); + utf8_decomposed_len = g_utf8_strlen (utf8_decomposed, -1); + + /* Don't overrun the output buffer. */ + assert (utf8_decomposed_len <= HB_UNICODE_MAX_DECOMPOSITION_LEN); + + /* Copy the decomposed UTF-8 string into our output buffer, @decomposed. */ + for (i = 0, c = utf8_decomposed; i < utf8_decomposed_len; i++, c = g_utf8_next_char (c)) { + *decomposed++ = g_utf8_get_char (c); + } + + g_free (utf8_decomposed); + + return utf8_decomposed_len; +} extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_glib; hb_unicode_funcs_t _hb_glib_unicode_funcs = { diff --git a/src/hb-icu.cc b/src/hb-icu.cc index 0f5ed1c..88fecbe 100644 --- a/src/hb-icu.cc +++ b/src/hb-icu.cc @@ -37,6 +37,7 @@ #include #include #include +#include <unicode/ustring.h> @@ -271,6 +272,46 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, return ret; } +static unsigned int +hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ + UChar utf16[2], normalized[20]; + gint len; + int32_t utf32_len; + hb_bool_t err; + UErrorCode icu_err; + + /* Copy @ab into a UTF-16 array to be passed to ICU. */ + len = 0; + err = FALSE; + U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); + if (err) + goto error; + + /* Normalise the codepoint using NFKD mode. */ + icu_err = U_ZERO_ERROR; + len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); + if (icu_err) + goto error; + + /* Convert the decomposed form from UTF-16 to UTF-32. 
*/ + icu_err = U_ZERO_ERROR; + u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); + if (icu_err) + goto error; + + return utf32_len; + +error: + /* Pretend the codepoint has no compat decomposition. */ + decomposed[0] = ab; + + return 1; +} + extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_icu; hb_unicode_funcs_t _hb_icu_unicode_funcs = { HB_OBJECT_HEADER_STATIC, diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index eb9f32a..c31cf5a 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -60,7 +60,8 @@ * We need ot provide assistance to the itemizer. * * - When a font does not support a character but supports its decomposition, - * well, use the decomposition. + * well, use the decomposition (preferring the canonical decomposition, but + * falling back to the compatibility decomposition if necessary). * * - The Indic shaper requests decomposed output. This will handle splitting * matra for the Indic shaper. @@ -83,9 +84,34 @@ decompose (hb_ot_shape_context_t *c, { hb_codepoint_t a, b, glyph; + /* Try the (first step of a) canonical decomposition of ab. If the codepoint can't + * be decomposed, or if decomposed codepoint b isn't in the font, skip to... */ if (!hb_unicode_decompose (c->buffer->unicode, ab, &a, &b) || - (b && !hb_font_get_glyph (c->font, b, 0, &glyph))) - return FALSE; + (b && !hb_font_get_glyph (c->font, b, 0, &glyph))) { + /* ...trying the compatibility decomposition of ab. */ + unsigned int len, i; + hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN]; + + len = hb_unicode_decompose_compatibility (c->buffer->unicode, ab, decomposed); + if (len == 1) { + return FALSE; + } + + /* Check we have all the glyphs before outputting them. 
*/ + for (i = 0; i < len; i++) { + if (!hb_font_get_glyph (c->font, decomposed[i], 0, &glyph)) { + return FALSE; + } + } + + /* Output the compatibility decomposition, since we don't have the glyphs + * for the original codepoint or its canonical decomposition. */ + for (i = 0; i < len; i++) { + output_glyph (c, decomposed[i]); + } + + return TRUE; + } bool has_a = hb_font_get_glyph (c->font, a, 0, &glyph); if (shortest && has_a) { diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh index 2ad8a49..a53720b 100644 --- a/src/hb-unicode-private.hh +++ b/src/hb-unicode-private.hh @@ -50,6 +50,7 @@ HB_UNICODE_FUNC_IMPLEMENT (script) \ HB_UNICODE_FUNC_IMPLEMENT (compose) \ HB_UNICODE_FUNC_IMPLEMENT (decompose) \ + HB_UNICODE_FUNC_IMPLEMENT (decompose_compatibility) \ /* ^--- Add new callbacks here */ /* Simple callbacks are those taking a hb_codepoint_t and returning a hb_codepoint_t */ diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc index 4b285c5..58a16ea 100644 --- a/src/hb-unicode.cc +++ b/src/hb-unicode.cc @@ -100,6 +100,18 @@ hb_unicode_decompose_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED, return FALSE; } +static unsigned int +hb_unicode_decompose_compatibility_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ + /* We require that decomposed has enough space for this. 
*/ + decomposed[0] = ab; + + return 1; +} + hb_unicode_funcs_t _hb_unicode_funcs_nil = { HB_OBJECT_HEADER_STATIC, @@ -271,3 +283,10 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, return ufuncs->func.decompose (ufuncs, ab, a, b, ufuncs->user_data.decompose); } +unsigned int +hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed) +{ + return ufuncs->func.decompose_compatibility (ufuncs, ab, decomposed, ufuncs->user_data.decompose_compatibility); +} diff --git a/src/hb-unicode.h b/src/hb-unicode.h index 13886df..078545e 100644 --- a/src/hb-unicode.h +++ b/src/hb-unicode.h @@ -118,6 +118,31 @@ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *b, void *user_data); +/** + * hb_unicode_decompose_compatibility_func_t: + * @ufuncs: Unicode function structure + * @ab: codepoint to decompose + * @decomposed: address of codepoint array (of length at least %HB_UNICODE_MAX_DECOMPOSITION_LEN) to write decomposition into + * @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_compatibility_func() + * + * Fully decompose @ab to its compatibility decomposition. The codepoints of the decomposition will be written to @decomposed. + * The complete length of the decomposition will be returned. + * + * If @ab has no compatibility decomposition, the codepoint is copied into @decomposed and 1 is returned. + * + * The Unicode standard guarantees that a buffer of length %HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints will always be sufficient for any + * compatibility decomposition. Consequently, @decomposed must be allocated by the caller to be at least this length. 
+ * + * Return value: number of codepoints in the full compatibility decomposition of @ab + */ +typedef unsigned int (*hb_unicode_decompose_compatibility_func_t) (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed, + void *user_data); + +/* See Unicode 6.1 for details on the maximum decomposition length. */ +#define HB_UNICODE_MAX_DECOMPOSITION_LEN 18 /* codepoints */ + /* setters */ void @@ -155,6 +180,10 @@ hb_unicode_funcs_set_decompose_func (hb_unicode_funcs_t *ufuncs, hb_unicode_decompose_func_t decompose_func, void *user_data, hb_destroy_func_t destroy); +void +hb_unicode_funcs_set_decompose_compatibility_func (hb_unicode_funcs_t *ufuncs, + hb_unicode_decompose_compatibility_func_t decompose_compatibility_func, + void *user_data, hb_destroy_func_t destroy); /* accessors */ @@ -189,6 +218,11 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *a, hb_codepoint_t *b); +unsigned int +hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t ab, + hb_codepoint_t *decomposed); + HB_END_DECLS #endif /* HB_UNICODE_H */ diff --git a/test/test-unicode.c b/test/test-unicode.c index 3482a05..2391696 100644 --- a/test/test-unicode.c +++ b/test/test-unicode.c @@ -787,6 +787,7 @@ test_unicode_normalization (gconstpointer user_data) { hb_unicode_funcs_t *uf = (hb_unicode_funcs_t *) user_data; gunichar a, b, ab; + hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN]; /* Test compose() */ @@ -849,6 +850,44 @@ test_unicode_normalization (gconstpointer user_data) g_assert (hb_unicode_decompose (uf, 0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8); g_assert (hb_unicode_decompose (uf, 0xCE20, &a, &b) && a == 0x110E && b == 0x1173); + + /* Test decompose_compatibility() */ + + /* Not decomposable */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x0041, decomposed) == 1 && decomposed[0] == 0x0041); + g_assert (hb_unicode_decompose_compatibility (uf, 0x1F632, decomposed) == 1 && decomposed[0] == 0x1F632); + 
+ /* Singletons */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x00B5, decomposed) == 1 && decomposed[0] == 0x03BC); + g_assert (hb_unicode_decompose_compatibility (uf, 0x03D6, decomposed) == 1 && decomposed[0] == 0x03C0); + + /* Spaces */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x2002, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2003, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2004, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2005, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2006, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2008, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2009, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x200A, decomposed) == 1 && decomposed[0] == 0x0020); + + /* Pairs */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x0587, decomposed) == 2 && + decomposed[0] == 0x0565 && decomposed[1] == 0x0582); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2017, decomposed) == 2 && + decomposed[0] == 0x0020 && decomposed[1] == 0x0333); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2025, decomposed) == 2 && + decomposed[0] == 0x002E && decomposed[1] == 0x002E); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2033, decomposed) == 2 && + decomposed[0] == 0x2032 && decomposed[1] == 0x2032); + + /* Triples */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x2026, decomposed) == 3 && + decomposed[0] == 0x002E && decomposed[1] == 0x002E && decomposed[2] == 0x002E); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2034, decomposed) == 3 && + decomposed[0] == 0x2032 && decomposed[1] == 0x2032 && decomposed[2] == 0x2032); + 
g_assert (hb_unicode_decompose_compatibility (uf, 0x213B, decomposed) == 3 && + decomposed[0] == 0x0046 && decomposed[1] == 0x0041 && decomposed[2] == 0x0058); } -- 1.7.7.4