[tin-dev] [tin 2.4.3] snapshots - please test

Urs Janßen urs at tin.org
Tue Jul 10 09:25:42 CEST 2018


In <mailman.173.1530633342.592.tin-dev at tin.org>, Corinna Vinschen wrote:
>> | o  add check for ICU unorm2.h/unorm2_normalize(); prefer it over
>> |    ICU unorm.h/unorm_normalize()
>> 
>> volunteers? ,-)
> 
> Hmm, the call has changed significantly.  Not that easy...

untested (I omitted the configure diff, so you either need to
run autoconf (special version needed
[<ftp://ftp.invisible-island.net/autoconf/autoconf-2.13-20110430.tgz>])
once or hard-define HAVE_UNICODE_UNORM2_H for the time being):

=== modified file 'configure.in'
--- configure.in	2018-07-02 12:19:05 +0000
+++ configure.in	2018-07-09 14:27:02 +0000
@@ -962,6 +962,10 @@
 	])
 ])
 
+AC_CHECK_HEADER(unicode/unorm2.h, [
+	AC_DEFINE(HAVE_UNICODE_UNORM2_H)
+])
+
 # without icuuc try GNU libunistring for normalization
 if test $cf_try_icuuc = no ; then
 	AC_CHECK_HEADER(unitypes.h,[

=== modified file 'include/autoconf.hin'
--- include/autoconf.hin	2018-07-02 12:19:05 +0000
+++ include/autoconf.hin	2018-07-09 13:59:12 +0000
@@ -683,6 +683,7 @@
  *       used for unicode normalization
  */
 #	undef HAVE_UNICODE_UNORM_H
+#	undef HAVE_UNICODE_UNORM2_H
 #	undef HAVE_UNICODE_USTRING_H
 #	undef HAVE_UNICODE_UBIDI_H
 #	undef HAVE_UNICODE_UIDNA_H

=== modified file 'src/string.c'
--- src/string.c	2018-07-03 15:52:14 +0000
+++ src/string.c	2018-07-10 07:12:04 +0000
@@ -1202,35 +1202,72 @@
 		int32_t needed, norm_len;
 		UChar *ustr, *norm;
 		UErrorCode status = U_ZERO_ERROR;
+
+#ifdef HAVE_UNICODE_UNORM2_H
+		static const char *uname[] = {"nfc", "nfkc", "nfkc"};
+		const char *unamep;
+		UNormalization2Mode mode;
+#else
 		UNormalizationMode mode;
+#endif /* !HAVE_UNICODE_UNORM2_H */
+
+		/* convert to UTF-16 which is used internally by ICU */
+		if ((ustr = char2UChar(tmp)) == NULL) /* something went wrong, return the original string (as valid UTF8) */
+			return tmp;
 
 		switch (tinrc.normalization_form) {
 			case NORMALIZE_NFD:
+#ifdef HAVE_UNICODE_UNORM2_H
+				unamep = uname[0];
+				mode = UNORM2_DECOMPOSE;
+#else
 				mode = UNORM_NFD;
+#endif /* HAVE_UNICODE_UNORM2_H */
 				break;
 
 			case NORMALIZE_NFC:
+#ifdef HAVE_UNICODE_UNORM2_H
+				unamep = uname[0];
+				mode = UNORM2_COMPOSE;
+#else
 				mode = UNORM_NFC;
+#endif /* HAVE_UNICODE_UNORM2_H */
 				break;
 
 			case NORMALIZE_NFKD:
+#ifdef HAVE_UNICODE_UNORM2_H
+				unamep = uname[1];
+				mode = UNORM2_DECOMPOSE;
+#else
 				mode = UNORM_NFKD;
+#endif /* HAVE_UNICODE_UNORM2_H */
 				break;
 
 			case NORMALIZE_NFKC:
 			default:
+#ifdef HAVE_UNICODE_UNORM2_H
+				unamep = uname[1];
+				mode = UNORM2_COMPOSE;
+#else
 				mode = UNORM_NFKC;
+#endif /* HAVE_UNICODE_UNORM2_H */
 		}
 
-		/* convert to UTF-16 which is used internally by ICU */
-		if ((ustr = char2UChar(tmp)) == NULL) /* something went wrong, return the original string (as valid UTF8) */
-			return tmp;
-
+#ifdef HAVE_UNICODE_UNORM2_H
+		needed = unorm2_normalize(unorm2_getInstance(NULL, unamep, mode, &status), ustr, -1, NULL, 0, &status);
+#else
 		needed = unorm_normalize(ustr, -1, mode, 0, NULL, 0, &status);
+#endif /* HAVE_UNICODE_UNORM2_H */
+
 		status = U_ZERO_ERROR;		/* reset status */
 		norm_len = needed + 1;
 		norm = my_malloc(sizeof(UChar) * norm_len);
+#ifdef HAVE_UNICODE_UNORM2_H
+		(void) unorm2_normalize(unorm2_getInstance(NULL, unamep, mode, &status), ustr, -1, norm, norm_len, &status);
+#else
 		(void) unorm_normalize(ustr, -1, mode, 0, norm, norm_len, &status);
+#endif /* HAVE_UNICODE_UNORM2_H */
+
 		if (U_FAILURE(status)) {
 			/* something went wrong, return the original string (as valid UTF8) */
 			free(ustr);




More information about the tin-dev mailing list