diff options
author | Aleksander Morgado <aleksander@lanedo.com> | 2012-02-07 17:50:49 +0100 |
---|---|---|
committer | Dan Williams <dcbw@redhat.com> | 2012-02-07 14:02:07 -0600 |
commit | f01f92b4572f864f79d2acbc1486ed602ded5035 (patch) | |
tree | 5927afdd381bb33a7a93bbcc22633899e4450735 | |
parent | 8b423bcc548b73c438a044bf6498d45234487b95 (diff) |
charsets: new method to do our best to convert from current charset to UTF-8
This method tries to convert the input string to UTF-8. The input string is
expected to be encoded in the given charset, or otherwise to be the hex
representation of the string in that charset.
-rw-r--r-- | src/mm-charsets.c | 92 | ||||
-rw-r--r-- | src/mm-charsets.h | 4 |
2 files changed, 95 insertions, 1 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index 549b3efe..e7e97f95 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#include <ctype.h>
 
 #include "mm-charsets.h"
 #include "mm-utils.h"
@@ -672,3 +673,94 @@ gsm_pack (const guint8 *src,
     return packed;
 }
 
+/* Take ownership of @str and do our best to return it as valid UTF-8.
+ *
+ * @str is expected to be encoded in @charset, or to be the hex representation
+ * of the @charset-encoded string, so both cases are handled.
+ *
+ * Returns a string to be released with g_free(), or NULL when no valid UTF-8
+ * could be obtained. Unless @str is NULL (which is rejected), the caller must
+ * not use @str after this call: it is either returned or freed here. */
+gchar *
+mm_charset_take_and_convert_to_utf8 (gchar *str,
+                                     MMModemCharset charset)
+{
+    gchar *utf8 = NULL;
+
+    g_return_val_if_fail (str != NULL, NULL);
+
+    switch (charset) {
+    case MM_MODEM_CHARSET_UNKNOWN:
+    default:
+        g_warn_if_reached ();
+        utf8 = str;
+        break;
+
+    case MM_MODEM_CHARSET_HEX:
+        /* We'll assume that the HEX string is really valid ASCII at the end */
+        utf8 = str;
+        break;
+
+    case MM_MODEM_CHARSET_GSM:
+    case MM_MODEM_CHARSET_8859_1:
+    case MM_MODEM_CHARSET_PCCP437:
+    case MM_MODEM_CHARSET_PCDN: {
+        GError *error = NULL;
+
+        utf8 = g_convert (str, strlen (str),
+                          "UTF-8//TRANSLIT", charset_iconv_from (charset),
+                          NULL, NULL, &error);
+        if (!utf8 || error) {
+            g_clear_error (&error);
+            utf8 = NULL;
+        }
+
+        g_free (str);
+        break;
+    }
+
+    case MM_MODEM_CHARSET_UCS2: {
+        const gsize len = strlen (str);
+        const gchar *p;
+        gboolean possibly_hex;
+
+        /* Hex-encoded UCS-2 has a length that is a non-zero multiple of 4 and
+         * holds only hex digits; <ctype.h> requires the unsigned char cast. */
+        possibly_hex = (len >= 4) && ((len % 4) == 0);
+        for (p = str; possibly_hex && *p; p++)
+            possibly_hex = isxdigit ((unsigned char) *p) != 0;
+
+        /* If we get UCS-2, we expect the HEX representation of the string */
+        if (possibly_hex) {
+            utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
+            if (!utf8) {
+                /* If we couldn't convert the string as HEX-UCS-2, try to see if
+                 * the string is valid UTF-8 itself. */
+                utf8 = str;
+            } else
+                g_free (str);
+        } else
+            /* If we already know it's not hex, try to use the string as it is */
+            utf8 = str;
+
+        break;
+    }
+
+    /* If the given charset is ASCII or UTF8, we really expect the final string
+     * already here */
+    case MM_MODEM_CHARSET_IRA:
+    case MM_MODEM_CHARSET_UTF8:
+        utf8 = str;
+        break;
+    }
+
+    /* Validate UTF-8 always before returning, as the result will very likely be
+     * exposed in DBus. utf8 is NULL here if the iconv conversion failed. */
+    if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
+        /* Better return NULL than an invalid UTF-8 string */
+        g_free (utf8);
+        utf8 = NULL;
+    }
+
+    return utf8;
+}
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index 7e43c3a8..ff701e53 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -67,5 +67,7 @@ guint8 *gsm_pack (const guint8 *src,
                   guint8 start_offset,  /* in bits */
                   guint32 *out_packed_len);
 
-#endif /* MM_CHARSETS_H */
+gchar *mm_charset_take_and_convert_to_utf8 (gchar *str,
+                                            MMModemCharset charset);
 
+#endif /* MM_CHARSETS_H */