charsets: new method to do our best to convert from current charset to UTF-8

This method tries to convert the input string to UTF-8. The input string is expected to be encoded in the given charset or, failing that, to be the hex representation of the string encoded in the given charset.
author: Aleksander Morgado <aleksander@lanedo.com> 2012-02-07 17:50:49 +0100
committer: Dan Williams <dcbw@redhat.com> 2012-02-07 14:02:07 -0600
commit: f01f92b4572f864f79d2acbc1486ed602ded5035 (patch)
tree: 5927afdd381bb33a7a93bbcc22633899e4450735
parent: 8b423bcc548b73c438a044bf6498d45234487b95 (diff)
2 files changed, 95 insertions, 1 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index 549b3efe..e7e97f95 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#include <ctype.h>
 
 #include "mm-charsets.h"
 #include "mm-utils.h"
@@ -672,3 +673,94 @@ gsm_pack (const guint8 *src,
     return packed;
 }
 
+/* Do our best to convert the given string to UTF-8. The string is expected to
+ * be encoded in the specified charset; for UCS-2 it may really be the hex
+ * representation of the charset-encoded string, so that case is probed first.
+ *
+ * Ownership of @str is taken for the charsets handled below: it is either
+ * returned as-is or freed. Returns a string to release with g_free(), or NULL
+ * if @str is NULL, the conversion failed or the result is not valid UTF-8. */
+gchar *
+mm_charset_take_and_convert_to_utf8 (gchar *str,
+                                     MMModemCharset charset)
+{
+    gchar *utf8 = NULL;
+
+    if (!str)
+        return NULL;
+
+    switch (charset) {
+    case MM_MODEM_CHARSET_UNKNOWN:
+        g_warn_if_reached ();
+        utf8 = str;
+        break;
+
+    case MM_MODEM_CHARSET_HEX:
+        /* We'll assume that the HEX string is really valid ASCII at the end */
+        utf8 = str;
+        break;
+
+    case MM_MODEM_CHARSET_GSM:
+    case MM_MODEM_CHARSET_8859_1:
+    case MM_MODEM_CHARSET_PCCP437:
+    case MM_MODEM_CHARSET_PCDN: {
+        const gchar *iconv_from = charset_iconv_from (charset);
+
+        /* NULL on failure; error details are not needed, so no GError */
+        utf8 = g_convert (str, strlen (str),
+                          "UTF-8//TRANSLIT", iconv_from,
+                          NULL, NULL, NULL);
+
+        g_free (str);
+        break;
+    }
+
+    case MM_MODEM_CHARSET_UCS2: {
+        gsize len;
+        gboolean possibly_hex = TRUE;
+
+        /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */
+        len = strlen (str);
+        if ((len < 4) || ((len % 4) != 0))
+            possibly_hex = FALSE;
+        else {
+            const gchar *p = str;
+
+            /* All chars in the string must be hex. Cast to unsigned char:
+             * passing a negative char to isxdigit() is undefined behaviour. */
+            while (*p && possibly_hex)
+                possibly_hex = isxdigit ((unsigned char) *p++) != 0;
+        }
+
+        /* If we get UCS-2, we expect the HEX representation of the string */
+        if (possibly_hex)
+            utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
+
+        /* If it was not hex, or could not be converted as HEX-UCS-2, try to
+         * use the string as it is; it gets validated as UTF-8 below. */
+        if (utf8)
+            g_free (str);
+        else
+            utf8 = str;
+
+        break;
+    }
+
+    /* If the given charset is ASCII or UTF8, we really expect the final string
+     * already here */
+    case MM_MODEM_CHARSET_IRA:
+    case MM_MODEM_CHARSET_UTF8:
+        utf8 = str;
+        break;
+    }
+
+    /* Validate UTF-8 always before returning, as the result will very likely be
+     * exposed in DBus. utf8 is NULL here if the conversion failed. */
+    if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
+        /* Better return NULL than an invalid UTF-8 string */
+        g_free (utf8);
+        utf8 = NULL;
+    }
+
+    return utf8;
+}
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index 7e43c3a8..ff701e53 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -67,5 +67,7 @@ guint8 *gsm_pack (const guint8 *src,
                   guint8 start_offset,  /* in bits */
                   guint32 *out_packed_len);
 
-#endif /* MM_CHARSETS_H */
+gchar *mm_charset_take_and_convert_to_utf8 (gchar *str,
+                                            MMModemCharset charset);
 
+#endif /* MM_CHARSETS_H */
author	Aleksander Morgado <aleksander@lanedo.com>	2012-02-07 17:50:49 +0100
committer	Dan Williams <dcbw@redhat.com>	2012-02-07 14:02:07 -0600
commit	f01f92b4572f864f79d2acbc1486ed602ded5035 (patch)
tree	5927afdd381bb33a7a93bbcc22633899e4450735
parent	8b423bcc548b73c438a044bf6498d45234487b95 (diff)