aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAleksander Morgado <aleksander@lanedo.com>2012-02-07 17:50:49 +0100
committerDan Williams <dcbw@redhat.com>2012-02-07 14:02:07 -0600
commitf01f92b4572f864f79d2acbc1486ed602ded5035 (patch)
tree5927afdd381bb33a7a93bbcc22633899e4450735
parent8b423bcc548b73c438a044bf6498d45234487b95 (diff)
charsets: new method to do our best to convert from current charset to UTF-8
This method will try to convert the input string to UTF-8. The input string is expected to be either encoded in the given charset, or the hex representation of the string encoded in that charset.
-rw-r--r--src/mm-charsets.c92
-rw-r--r--src/mm-charsets.h4
2 files changed, 95 insertions, 1 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index 549b3efe..e7e97f95 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#include <ctype.h>
#include "mm-charsets.h"
#include "mm-utils.h"
@@ -672,3 +673,94 @@ gsm_pack (const guint8 *src,
return packed;
}
+/*
+ * mm_charset_take_and_convert_to_utf8:
+ * @str: string to convert; this function takes ownership of it.
+ * @charset: charset @str is supposedly encoded in.
+ *
+ * We do all our best to get the given string, which is possibly given in the
+ * specified charset, to UTF-8. It may happen that the given string is really
+ * the hex representation of the charset-encoded string, so we need to cope
+ * with that case.
+ *
+ * @str is always consumed: it is either handed back as the result or freed.
+ *
+ * Returns: a valid UTF-8 string to be released with g_free(), or NULL if
+ * @str is NULL, the conversion failed, or the result is not valid UTF-8.
+ */
+gchar *
+mm_charset_take_and_convert_to_utf8 (gchar *str,
+                                     MMModemCharset charset)
+{
+    gchar *utf8 = NULL;
+
+    /* Nothing to convert, and nothing to free either */
+    if (!str)
+        return NULL;
+
+    switch (charset) {
+    case MM_MODEM_CHARSET_UNKNOWN:
+        g_warn_if_reached ();
+        utf8 = str;
+        break;
+
+    case MM_MODEM_CHARSET_HEX:
+        /* We'll assume that the HEX string is really valid ASCII at the end */
+        utf8 = str;
+        break;
+
+    case MM_MODEM_CHARSET_GSM:
+    case MM_MODEM_CHARSET_8859_1:
+    case MM_MODEM_CHARSET_PCCP437:
+    case MM_MODEM_CHARSET_PCDN: {
+        const gchar *iconv_from;
+
+        /* NOTE(review): charset_iconv_from() is defined outside this hunk; in
+         * case it has no iconv name for this charset, skip the conversion
+         * instead of handing a NULL codeset to g_convert(). g_convert()
+         * returns NULL on failure, so no GError is needed to detect it. */
+        iconv_from = charset_iconv_from (charset);
+        if (iconv_from)
+            utf8 = g_convert (str, strlen (str),
+                              "UTF-8//TRANSLIT", iconv_from,
+                              NULL, NULL, NULL);
+
+        /* The converted copy (if any) replaces the input */
+        g_free (str);
+        break;
+    }
+
+    case MM_MODEM_CHARSET_UCS2: {
+        gsize len;
+        gboolean possibly_hex;
+        const gchar *p;
+
+        /* If the string comes in hex-UCS-2, len needs to be a multiple of 4
+         * and all chars in the string must be hex. The cast is required
+         * because isxdigit() is undefined for negative char values. */
+        len = strlen (str);
+        possibly_hex = (len >= 4) && ((len % 4) == 0);
+        for (p = str; *p && possibly_hex; p++)
+            possibly_hex = isxdigit ((unsigned char) *p) != 0;
+
+        /* If we get UCS-2, we expect the HEX representation of the string */
+        if (possibly_hex)
+            utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
+
+        if (utf8) {
+            g_free (str);
+        } else {
+            /* Either we already know it's not hex, or we couldn't convert it
+             * as HEX-UCS-2: try to use the string as it is, it may be valid
+             * UTF-8 itself. */
+            utf8 = str;
+        }
+        break;
+    }
+
+    /* If the given charset is ASCII or UTF8, we really expect the final string
+     * already here */
+    case MM_MODEM_CHARSET_IRA:
+    case MM_MODEM_CHARSET_UTF8:
+        utf8 = str;
+        break;
+    }
+
+    /* Validate UTF-8 always before returning. This result will be exposed in DBus
+     * very likely... utf8 is NULL here when the conversion above failed, so
+     * only validate when there is actually a string to look at. */
+    if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
+        /* Better return NULL than an invalid UTF-8 string */
+        g_free (utf8);
+        utf8 = NULL;
+    }
+
+    return utf8;
+}
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index 7e43c3a8..ff701e53 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -67,5 +67,7 @@ guint8 *gsm_pack (const guint8 *src,
guint8 start_offset, /* in bits */
guint32 *out_packed_len);
-#endif /* MM_CHARSETS_H */
+gchar *mm_charset_take_and_convert_to_utf8 (gchar *str, /* consumed: freed, or handed back as the result */
+ MMModemCharset charset); /* result: validated UTF-8 string, or NULL */
+#endif /* MM_CHARSETS_H */