00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <glib.h>
00021 #include <string.h>
00022 #include <libaudcore/audstrings.h>
00023
00024 #include "config.h"
00025 #include "debug.h"
00026 #include "i18n.h"
00027 #include "main.h"
00028 #include "misc.h"
00029
00030 #ifdef USE_CHARDET
00031 # include <libguess.h>
00032 #endif
00033
/* Forward declaration: cd_str_to_utf8 () calls this function, whose
 * definition appears further down in this file.  Parameter names match
 * the definition. */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_write);
00036
00037 static char * str_to_utf8_fallback (const char * str)
00038 {
00039 char * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL);
00040
00041 for (char * c = out; * c; c ++)
00042 {
00043 if (* c & 0x80)
00044 * c = '?';
00045 }
00046
00047 return out;
00048 }
00049
/*
 * Converts a NUL-terminated string of unknown encoding to UTF-8.
 *
 * Steps, in order:
 *   1. NULL input yields NULL.
 *   2. Input that already validates as UTF-8 is duplicated with g_strdup ().
 *   3. Otherwise cd_chardet_to_utf8 () is asked to convert it.
 *   4. If that returns NULL, str_to_utf8_fallback () produces a 7-bit
 *      placeholder string.
 *
 * For non-NULL input the result is always a newly allocated string.
 */
static char * cd_str_to_utf8 (const char * str)
{
    if (! str)
        return NULL;

    /* Measure once; the same length feeds validation and conversion. */
    const size_t length = strlen (str);

#ifdef USE_CHARDET
    if (libguess_validate_utf8 (str, length))
#else
    if (g_utf8_validate (str, length, NULL))
#endif
        return g_strdup (str);

    char * converted = cd_chardet_to_utf8 (str, length, NULL, NULL);

    return converted ? converted : str_to_utf8_fallback (str);
}
00095
00096 static char * cd_chardet_to_utf8 (const char * str, int len,
00097 int * arg_bytes_read, int * arg_bytes_write)
00098 {
00099 char *ret = NULL;
00100 int * bytes_read, * bytes_write;
00101 int my_bytes_read, my_bytes_write;
00102
00103 bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
00104 bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
00105
00106 g_return_val_if_fail(str != NULL, NULL);
00107
00108 #ifdef USE_CHARDET
00109 if (libguess_validate_utf8(str, len))
00110 #else
00111 if (g_utf8_validate(str, len, NULL))
00112 #endif
00113 {
00114 if (len < 0)
00115 len = strlen (str);
00116
00117 ret = g_malloc (len + 1);
00118 memcpy (ret, str, len);
00119 ret[len] = 0;
00120
00121 if (arg_bytes_read != NULL)
00122 * arg_bytes_read = len;
00123 if (arg_bytes_write != NULL)
00124 * arg_bytes_write = len;
00125
00126 return ret;
00127 }
00128
00129 #ifdef USE_CHARDET
00130 char * det = get_string (NULL, "chardet_detector");
00131
00132 if (det[0])
00133 {
00134 AUDDBG("guess encoding (%s) %s\n", det, str);
00135 const char * encoding = libguess_determine_encoding (str, len, det);
00136 AUDDBG("encoding = %s\n", encoding);
00137 if (encoding)
00138 {
00139 gsize read_gsize = 0, written_gsize = 0;
00140 ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL);
00141 * bytes_read = read_gsize;
00142 * bytes_write = written_gsize;
00143 }
00144 }
00145
00146 g_free (det);
00147 #endif
00148
00149
00150 if (! ret)
00151 {
00152 char * fallbacks = get_string (NULL, "chardet_fallback");
00153 char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1);
00154
00155 for (char * * enc = split; * enc; enc ++)
00156 {
00157 gsize read_gsize = 0, written_gsize = 0;
00158 ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL);
00159 * bytes_read = read_gsize;
00160 * bytes_write = written_gsize;
00161
00162 if (len == *bytes_read)
00163 break;
00164 else {
00165 g_free(ret);
00166 ret = NULL;
00167 }
00168 }
00169
00170 g_strfreev (split);
00171 g_free (fallbacks);
00172 }
00173
00174
00175 if (ret == NULL)
00176 {
00177 gsize read_gsize = 0, written_gsize = 0;
00178 ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL);
00179 * bytes_read = read_gsize;
00180 * bytes_write = written_gsize;
00181 }
00182
00183
00184 if (ret == NULL)
00185 {
00186 gsize read_gsize = 0, written_gsize = 0;
00187 ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL);
00188 * bytes_read = read_gsize;
00189 * bytes_write = written_gsize;
00190 }
00191
00192 if (ret != NULL)
00193 {
00194 if (g_utf8_validate(ret, -1, NULL))
00195 return ret;
00196 else
00197 {
00198 g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
00199 g_free(ret);
00200 return NULL;
00201 }
00202 }
00203
00204 return NULL;
00205 }
00206
00207 void chardet_init (void)
00208 {
00209 #ifdef USE_CHARDET
00210 libguess_determine_encoding(NULL, -1, "");
00211 #endif
00212 str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8);
00213 }