[tin-dev] [PATCH] remove SOFT HYPHENs (U+00AD) from UTF-8 articles

Dennis Preiser dennis at d--p.de
Tue Jun 28 16:52:08 CEST 2022


Hi,

in de.test there was a discussion about the display of SOFT HYPHENs
(U+00AD). The thread started here:

| Subject: Soft hyphen (was: FDP: Keine eAuto-Förderung mehr. Grüne: Kohleverstromung fördern. Putin: Pumpe defekt, so sorry.)
| Message-ID: <AABitvftEqIAAA7s.A3.flnews at WStation7.micha.freeshell.org>

The trigger was an article in ger.ct:

| From: Jochen Kremer <jochen.news at kremerweb.de>
| Newsgroups: ger.ct
| Subject: Re: FDP: Keine eAuto-Förderung mehr. Grüne: Kohleverstromung fördern. Putin: Pumpe defekt, so sorry.
| Date: Sat, 25 Jun 2022 00:27:27 +0200
| Message-ID: <jhms2gF1vn8U1 at mid.individual.net>

Under macOS, for example, the display of SOFT HYPHENs depends on the
terminal program used. Apple's Terminal.app displays the SOFT HYPHENs
like HYPHENs:

<http://d--p.de/tmp/2022-06-28_Terminal.app.png>

while e.g. iTerm.app does not display them:

<http://d--p.de/tmp/2022-06-28_iTerm.app.png>

Unicode Standard Annex #14 §5.4 "Use of Soft Hyphens"
<https://www.unicode.org/reports/tr14/#SoftHyphen> states:

"[...] the character U+00AD SOFT HYPHEN (SHY) is an invisible format
character that merely indicates a preferred intraword line break
position."

According to this definition ("invisible format character") we could
remove the SOFT HYPHENs before displaying the article. Then it would not
matter how the terminal program would display such characters.

I'm not sure whether this makes sense, or whether handling this should
rather be left to the terminal programs. Anyway, the attached patch
removes the SOFT HYPHENs if the article was in UTF-8 and the local
charset is also UTF-8.

Dennis
-------------- next part --------------
diff -urp tin-2.6.2/include/proto.h tin-2.6.2_r1/include/proto.h
--- tin-2.6.2/include/proto.h	2022-02-19 06:32:19.000000000 +0100
+++ tin-2.6.2_r1/include/proto.h	2022-06-28 16:39:26.000000000 +0200
@@ -390,6 +390,9 @@ extern void make_group_path(const char *
 extern void process_charsets(char **line, size_t *max_line_len, const char *network_charset, const char *local_charset, t_bool conv_tex2iso);
 extern void read_input_history_file(void);
 extern void rename_file(const char *old_filename, const char *new_filename);
+#if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
+	extern void remove_soft_hyphens(char *line);
+#endif /* MULTIBYTE_ABLE && !NO_LOCALE */
 extern void show_inverse_video_status(void);
 extern void strip_name(const char *from, char *address);
 extern _Noreturn void tin_done(int ret, const char *fmt, ...);
diff -urp tin-2.6.2/src/cook.c tin-2.6.2_r1/src/cook.c
--- tin-2.6.2/src/cook.c	2022-05-03 18:44:45.000000000 +0200
+++ tin-2.6.2_r1/src/cook.c	2022-06-28 16:34:25.000000000 +0200
@@ -477,8 +477,12 @@ process_text_body_part(
 		process_charsets(&line, &max_line_len, ncharset ? ncharset : "US-ASCII", tinrc.mm_local_charset, curr_group->attribute->tex2iso_conv && art->tex2iso);
 
 #if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
-		if (IS_LOCAL_CHARSET("UTF-8"))
+		if (IS_LOCAL_CHARSET("UTF-8")) {
 			utf8_valid(line);
+
+			if (ncharset && !strcasecmp(ncharset, "UTF-8"))
+				remove_soft_hyphens(line);
+		}
 #endif /* MULTIBYTE_ABLE && !NO_LOCALE */
 
 		len = (int) strlen(line);
diff -urp tin-2.6.2/src/misc.c tin-2.6.2_r1/src/misc.c
--- tin-2.6.2/src/misc.c	2022-04-30 13:37:49.000000000 +0200
+++ tin-2.6.2_r1/src/misc.c	2022-06-28 16:39:19.000000000 +0200
@@ -3806,6 +3806,48 @@ utf8_valid(
 #endif /* CHARSET_CONVERSION || (MULTIBYTE_ABLE && !NO_LOCALE) */
 
 
+#if defined(MULTIBYTE_ABLE) && !defined(NO_LOCALE)
+/*
+ * Unicode Standard Annex #14 §5.4 "Use of Soft Hyphens"
+ * <https://www.unicode.org/reports/tr14/#SoftHyphen> states:
+ *
+ * "[...] the character U+00AD SOFT HYPHEN (SHY) is an invisible format
+ * character that merely indicates a preferred intraword line break
+ * position."
+ *
+ * -> remove SOFT HYPHENs from the given UTF-8 string to prevent
+ *    terminal programs from displaying them incorrectly
+ */
+void
+remove_soft_hyphens(
+	char *line)
+{
+	char *buffer;
+	wchar_t *wbuffer, *rptr, *wptr;
+	size_t len;
+
+	if ((wbuffer = char2wchar_t(line)) != NULL) {
+		/* compact in place: rptr reads, wptr writes, every SHY (also in runs) is skipped */
+		for (rptr = wptr = wbuffer; *rptr; ++rptr) {
+			if (*rptr != 0xad)
+				*wptr++ = *rptr;
+		}
+		*wptr = '\0';
+
+		/* copy back at most strlen(line) bytes plus the terminator, */
+		/* so nothing is written beyond the original string */
+		if ((buffer = wchar_t2char(wbuffer)) != NULL) {
+			len = strlen(line) + 1;
+			strncpy(line, buffer, len);
+			line[len - 1] = '\0';
+			free(buffer);
+		}
+		free(wbuffer);
+	}
+}
+#endif /* MULTIBYTE_ABLE && !NO_LOCALE */
+
+
 char *
 idna_decode(
 	char *in)


More information about the tin-dev mailing list