utf8.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. /* Charset handling for GNU tar.
  2. Copyright (C) 2004 Free Software Foundation, Inc.
  3. This program is free software; you can redistribute it and/or modify it
  4. under the terms of the GNU General Public License as published by the
  5. Free Software Foundation; either version 2, or (at your option) any later
  6. version.
  7. This program is distributed in the hope that it will be useful, but
  8. WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
  10. Public License for more details.
  11. You should have received a copy of the GNU General Public License along
  12. with this program; if not, write to the Free Software Foundation, Inc.,
  13. 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
  14. #include "system.h"
  15. #include <quotearg.h>
  16. #include "common.h"
  17. #ifdef HAVE_ICONV_H
  18. # include <iconv.h>
  19. #endif
  20. #ifndef ICONV_CONST
  21. # define ICONV_CONST
  22. #endif
  23. #ifdef HAVE_LIBICONV
  24. struct langtab
  25. {
  26. char const *lang; /* Language code */
  27. char const *terr; /* Territory code */
  28. char const *charset; /* Corresponding charset */
  29. };
  30. /* The list of language codes defined in ISO 639 with the corresponding
  31. default character sets.
  32. NOTES:
  33. 1) The list must be ordered by:
  34. a) lang field in ascending order
  35. b) terr field in descending order.
  36. NULL fields are considered less than non-null ones.
  37. 2) Many entries have NULL charset fields. Please help fill them!
  38. 3) The "default" character set for a given language is a matter
  39. of preference. Possibly the table should contain a *list* of
  40. possible character sets.
  41. 4) LC_ALL "modifier" field is not taken into account */
  42. static struct langtab langtab[] = {
  43. { "C", NULL, "ASCII"},
  44. { "POSIX", NULL, "ASCII" },
  45. { "aa", NULL, NULL}, /* Afar */
  46. { "ab", NULL, NULL}, /* Abkhazian */
  47. { "ae", NULL, NULL}, /* Avestan */
  48. { "af", NULL, "iso-8859-1"}, /* Afrikaans */
  49. { "am", NULL, "UTF-8"}, /* Amharic */
  50. { "ar", NULL, "iso-8859-6"}, /* Arabic */
  51. { "as", NULL, NULL}, /* Assamese */
  52. { "ay", NULL, "iso-8859-1"}, /* Aymara */
  53. { "az", NULL, NULL}, /* Azerbaijani */
  54. { "ba", NULL, NULL}, /* Bashkir */
  55. { "be", NULL, "UTF-8"}, /* Byelorussian; Belarusian */
  56. { "bg", NULL, "iso-8859-5"}, /* Bulgarian */
  57. { "bh", NULL, NULL}, /* Bihari */
  58. { "bi", NULL, NULL}, /* Bislama */
  59. { "bn", NULL, NULL}, /* Bengali; Bangla */
  60. { "bo", NULL, NULL}, /* Tibetan */
  61. { "br", NULL, "iso-8859-1"}, /* Breton: 1,5,8,9 */
  62. { "bs", NULL, NULL}, /* Bosnian */
  63. { "ca", NULL, "iso-8859-1"}, /* Catalan: 1,5,8,9 */
  64. { "ce", NULL, NULL}, /* Chechen */
  65. { "ch", NULL, NULL}, /* Chamorro */
  66. { "co", NULL, "iso-8859-1"}, /* Corsican */
  67. { "cs", NULL, "iso-8859-2"}, /* Czech */
  68. { "cu", NULL, NULL }, /* Church Slavic */
  69. { "cv", NULL, NULL}, /* Chuvash */
  70. { "cy", NULL, "iso-8859-1"}, /* Welsh */
  71. { "da", NULL, "iso-8859-1"}, /* Danish: 4-9 */
  72. { "de", NULL, "iso-8859-1"}, /* German */
  73. { "dz", NULL, NULL }, /* Dzongkha; Bhutani */
  74. { "el", NULL, "iso-8859-7"}, /* Greek */
  75. { "en", NULL, "iso-8859-1"}, /* English */
  76. { "eo", NULL, "iso-8859-3"}, /* Esperanto */
  77. { "es", NULL, "iso-8859-1"}, /* Spanish */
  78. { "et", NULL, "iso-8859-15"}, /* Estonian: 6,7,9 */
  79. { "eu", NULL, "iso-8859-1"}, /* Basque: 5,8,9 */
  80. { "fa", NULL, "UTF-8"}, /* Persian */
  81. { "fi", NULL, "iso-8859-15"}, /* Finnish */
  82. { "fj", NULL, NULL }, /* Fijian; Fiji */
  83. { "fo", NULL, "iso-8859-1"}, /* Faroese: 6,9 */
  84. { "fr", NULL, "iso-8859-1"}, /* French */
  85. { "fy", NULL, "iso-8859-1"}, /* Frisian */
  86. { "ga", NULL, "iso-8859-14"}, /* Irish */
  87. { "gd", NULL, "iso-8859-14" }, /* Scots; Gaelic */
  88. { "gl", NULL, NULL }, /* Gallegan; Galician */
  89. { "gn", NULL, NULL}, /* Guarani */
  90. { "gu", NULL, NULL}, /* Gujarati */
  91. { "gv", NULL, "iso-8859-14"}, /* Manx */
  92. { "ha", NULL, NULL }, /* Hausa (?) */
  93. { "he", NULL, "iso-8859-8" }, /* Hebrew */
  94. { "hi", NULL, NULL}, /* Hindi */
  95. { "ho", NULL, NULL}, /* Hiri Motu */
  96. { "hr", NULL, "iso-8859-2"}, /* Croatian: 10 */
  97. { "hu", NULL, "iso-8859-2"}, /* Hungarian */
  98. { "hy", NULL, NULL}, /* Armenian */
  99. { "hz", NULL, NULL}, /* Herero */
  100. { "id", NULL, "iso-8859-1"}, /* Indonesian (formerly in) */
  101. { "ia", NULL, NULL}, /* Interlingua */
  102. { "ie", NULL, NULL}, /* Interlingue */
  103. { "ik", NULL, NULL}, /* Inupiak */
  104. { "io", NULL, NULL}, /* Ido */
  105. { "is", NULL, "iso-8859-1"}, /* Icelandic */
  106. { "it", NULL, "iso-8859-1"}, /* Italian */
  107. { "iu", NULL, NULL}, /* Inuktitut */
  108. { "ja", NULL, "EUC-JP"}, /* Japanese */
  109. { "jv", NULL, NULL}, /* Javanese */
  110. { "ka", NULL, NULL}, /* Georgian */
  111. { "ki", NULL, NULL}, /* Kikuyu */
  112. { "kj", NULL, NULL}, /* Kuanyama */
  113. { "kk", NULL, NULL}, /* Kazakh */
  114. { "kl", NULL, "iso-8859-1"}, /* Kalaallisut; Greenlandic */
  115. { "km", NULL, NULL}, /* Khmer; Cambodian */
  116. { "kn", NULL, NULL}, /* Kannada */
  117. { "ko", NULL, "EUC-KR"}, /* Korean */
  118. { "ks", NULL, NULL}, /* Kashmiri */
  119. { "ku", NULL, NULL}, /* Kurdish */
  120. { "kv", NULL, NULL}, /* Komi */
  121. { "kw", NULL, "iso-8859-14"}, /* Cornish: 1,5,8 */
  122. { "ky", NULL, NULL}, /* Kirghiz */
  123. { "la", NULL, "iso-8859-1"}, /* Latin */
  124. { "lb", NULL, "iso-8859-1"}, /* Letzeburgesch */
  125. { "ln", NULL, NULL}, /* Lingala */
  126. { "lo", NULL, NULL}, /* Lao; Laotian */
  127. { "lt", NULL, "iso-8859-4"}, /* Lithuanian */
  128. { "lv", NULL, "iso-8859-4"}, /* Latvian; Lettish */
  129. { "mg", NULL, NULL}, /* Malagasy */
  130. { "mh", NULL, NULL}, /* Marshall */
  131. { "mi", NULL, NULL}, /* Maori */
  132. { "mk", NULL, NULL}, /* Macedonian */
  133. { "ml", NULL, NULL}, /* Malayalam */
  134. { "mn", NULL, NULL}, /* Mongolian */
  135. { "mo", NULL, "iso-8859-2"}, /* Moldavian */
  136. { "mr", NULL, NULL}, /* Marathi */
  137. { "ms", NULL, NULL}, /* Malay */
  138. { "mt", NULL, "iso-8859-3"}, /* Maltese */
  139. { "my", NULL, NULL}, /* Burmese */
  140. { "na", NULL, NULL}, /* Nauru */
  141. { "nb", NULL, "iso-8859-1"}, /* Norwegian Bokmål; Bokm@aa{}l */
  142. { "nd", NULL, NULL}, /* Ndebele, North */
  143. { "ne", NULL, NULL}, /* Nepali */
  144. { "ng", NULL, NULL}, /* Ndonga */
  145. { "nl", NULL, "iso-8859-1"}, /* Dutch: 5,9 */
  146. { "nn", NULL, "iso-8859-1"}, /* Norwegian Nynorsk */
  147. { "no", NULL, "iso-8859-1"}, /* Norwegian */
  148. { "nr", NULL, NULL}, /* Ndebele, South */
  149. { "nv", NULL, NULL}, /* Navajo */
  150. { "ny", NULL, NULL}, /* Chichewa; Nyanja */
  151. { "oc", NULL, NULL}, /* Occitan; Provençal; Proven@,{c}al */
  152. { "om", NULL, NULL}, /* (Afan) Oromo */
  153. { "or", NULL, NULL}, /* Oriya */
  154. { "os", NULL, NULL}, /* Ossetian; Ossetic */
  155. { "pa", NULL, NULL}, /* Panjabi; Punjabi */
  156. { "pi", NULL, NULL}, /* Pali */
  157. { "pl", NULL, "iso-8859-2"}, /* Polish */
  158. { "ps", NULL, NULL}, /* Pashto, Pushto */
  159. { "pt", NULL, "iso-8859-1"}, /* Portuguese */
  160. { "qu", NULL, "iso-8859-1"}, /* Quechua */
  161. { "rm", NULL, "iso-8859-1"}, /* Rhaeto-Romance */
  162. { "rn", NULL, NULL }, /* Rundi; Kirundi */
  163. { "ro", NULL, "iso-8859-2"}, /* Romanian */
  164. { "ru", NULL, "koi8-r"}, /* Russian */
  165. { "rw", NULL, NULL}, /* Kinyarwanda */
  166. { "sa", NULL, NULL}, /* Sanskrit */
  167. { "sc", NULL, "iso-8859-1"}, /* Sardinian */
  168. { "sd", NULL, NULL}, /* Sindhi */
  169. { "se", NULL, "iso-8859-10"}, /* Northern Sami */
  170. { "sg", NULL, NULL}, /* Sango; Sangro */
  171. { "si", NULL, NULL}, /* Sinhalese */
  172. { "sk", NULL, "iso-8859-2"}, /* Slovak */
  173. { "sl", NULL, "iso-8859-1"}, /* Slovenian */
  174. { "sm", NULL, NULL}, /* Samoan */
  175. { "sn", NULL, NULL}, /* Shona */
  176. { "so", NULL, NULL}, /* Somali */
  177. { "sq", NULL, "iso-8859-1"}, /* Albanian: 2,5,8,9,10 */
  178. { "sr", NULL, "iso-8859-2"}, /* Serbian */
  179. { "ss", NULL, NULL}, /* Swati; Siswati */
  180. { "st", NULL, NULL}, /* Sesotho; Sotho, Southern */
  181. { "su", NULL, NULL}, /* Sundanese */
  182. { "sv", NULL, "iso-8859-1"}, /* Swedish */
  183. { "sw", NULL, NULL}, /* Swahili */
  184. { "ta", NULL, NULL}, /* Tamil */
  185. { "te", NULL, NULL}, /* Telugu */
  186. { "tg", NULL, NULL}, /* Tajik */
  187. { "th", NULL, "iso-8859-11"}, /* Thai */
  188. { "ti", NULL, NULL}, /* Tigrinya */
  189. { "tk", NULL, NULL}, /* Turkmen */
  190. { "tl", NULL, "iso-8859-1"}, /* Tagalog */
  191. { "tn", NULL, NULL}, /* Tswana; Setswana */
  192. { "to", NULL, NULL}, /* Tonga (?) */
  193. { "tr", NULL, "iso-8859-9"}, /* Turkish */
  194. { "ts", NULL, NULL}, /* Tsonga */
  195. { "tt", NULL, NULL}, /* Tatar */
  196. { "tw", NULL, NULL}, /* Twi */
  197. { "ty", NULL, NULL}, /* Tahitian */
  198. { "ug", NULL, NULL}, /* Uighur */
  199. { "uk", NULL, "koi8-u"}, /* Ukrainian */
  200. { "ur", NULL, NULL}, /* Urdu */
  201. { "uz", NULL, NULL}, /* Uzbek */
  202. { "vi", NULL, NULL}, /* Vietnamese */
  203. { "vo", NULL, NULL}, /* Volapük; Volap@"{u}k; Volapuk */
  204. { "wa", NULL, "iso-8859-1"}, /* Walloon */
  205. { "wo", NULL, NULL}, /* Wolof */
  206. { "xh", NULL, NULL}, /* Xhosa */
  207. { "yi", NULL, "iso-8859-8"}, /* Yiddish (formerly ji) */
  208. { "yo", NULL, NULL}, /* Yoruba */
  209. { "za", NULL, NULL}, /* Zhuang */
  210. { "zh", "TW", "big5"}, /* Chinese */
  211. { "zh", NULL, "gb2312"}, /* Chinese */
  212. { "zu", NULL, NULL}, /* Zulu */
  213. { NULL, NULL, NULL}
  214. };
  215. /* Given the language and (optionally) territory code, return the
  216. default character set for that language. See notes above. */
  217. static char const *
  218. charset_lookup (char const *lang, char const *terr)
  219. {
  220. struct langtab const *p;
  221. if (!lang)
  222. return NULL;
  223. for (p = langtab; p->lang; p++)
  224. if (strcasecmp (p->lang, lang) == 0
  225. && (terr == NULL
  226. || p->terr == NULL
  227. || !strcasecmp (p->terr, terr) == 0))
  228. return p->charset;
  229. return NULL;
  230. }
  /* Return the name of the character set implied by the user's locale.

     The locale name is taken from LC_ALL or, when that is unset or empty,
     from LANG, and is parsed as language[_territory][.codeset][@modifier].
     An explicit codeset is returned as is; otherwise the language and
     territory are passed to charset_lookup.  If the charset still cannot be
     determined, or the locale name does not fit the internal buffer, the
     result is "iso-8859-1".

     The result is either a string constant or points into a static buffer
     owned by this function: the caller must not free it, and it is
     overwritten by the next call.  */
  static const char *
  get_input_charset (void)
  {
    /* Private copy of the locale name.  The string returned by getenv must
       not be modified, so it is never split in place.  */
    static char locale_name[256];
    const char *charset = NULL;
    const char *env;
    size_t len;

    /* Try to deduce the charset from LC_ALL or LANG variables */
    env = getenv ("LC_ALL");
    if (!env || !*env)
      env = getenv ("LANG");

    len = env ? strlen (env) : 0;
    if (len != 0 && len < sizeof locale_name)
      {
        char *terr;
        char *codeset;
        char *modifier;

        memcpy (locale_name, env, len + 1);

        /* Cut the name from the right, so that each delimiter is only
           searched for in the part that precedes the later fields.  */
        modifier = strchr (locale_name, '@');
        if (modifier)
          *modifier = '\0';
        codeset = strchr (locale_name, '.');
        if (codeset)
          *codeset++ = '\0';
        terr = strchr (locale_name, '_');
        if (terr)
          *terr++ = '\0';

        if (codeset && *codeset)
          charset = codeset;
        else
          charset = charset_lookup (locale_name, terr && *terr ? terr : NULL);
      }

    if (!charset)
      charset = "iso-8859-1";
    return charset;
  }
  254. #else /* !defined HAVE_LIBICONV */
  /* No iconv library is available: replace the three iconv entry points
     with inert stubs so that the conversion code below still compiles.
     The iconv_open stub always yields the invalid descriptor (iconv_t) -1,
     and none of the stubs evaluates its arguments.  */
  # undef iconv_open
  # undef iconv
  # undef iconv_close
  # define iconv_open(to, from) ((iconv_t) -1)
  # define iconv(desc, in, inleft, out, outleft) ((size_t) 0)
  # define iconv_close(desc) 0
  261. #endif /* !defined HAVE_LIBICONV */
  /* Cached conversion descriptors, indexed by direction: element 1 converts
     to UTF-8, element 0 converts from UTF-8.  The value (iconv_t) -1 marks
     a descriptor that has not been opened successfully.  */
  static iconv_t conv_desc[2] = { (iconv_t) -1, (iconv_t) -1 };

  /* Return the conversion descriptor for the direction selected by TO_UTF,
     opening it on first use.  The non-UTF-8 end of the conversion is the
     charset named by get_input_charset.  When iconv_open fails the result
     is (iconv_t) -1 and the open is attempted again on the next call.  */
  static iconv_t
  utf8_init (bool to_utf)
  {
    iconv_t *slot = &conv_desc[(int) to_utf];

    if (*slot == (iconv_t) -1)
      *slot = (to_utf
               ? iconv_open ("UTF-8", get_input_charset ())
               : iconv_open (get_input_charset (), "UTF-8"));

    return *slot;
  }
  275. bool
  276. utf8_convert (bool to_utf, char const *input, char **output)
  277. {
  278. char ICONV_CONST *ib;
  279. char *ob;
  280. size_t inlen;
  281. size_t outlen;
  282. size_t rc;
  283. iconv_t cd = utf8_init (to_utf);
  284. if (cd == 0)
  285. {
  286. *output = xstrdup (input);
  287. return true;
  288. }
  289. else if (cd == (iconv_t)-1)
  290. return false;
  291. inlen = strlen (input) + 1;
  292. outlen = inlen * MB_LEN_MAX + 1;
  293. ob = *output = xmalloc (outlen);
  294. ib = (char ICONV_CONST *) input;
  295. rc = iconv (cd, &ib, &inlen, &ob, &outlen);
  296. *ob = 0;
  297. return rc != -1;
  298. }
  /* Return true if STR consists solely of 7-bit ASCII characters, that is,
     if no byte before its terminating NUL has a value above 127.  An empty
     string qualifies.  */
  bool
  string_ascii_p (const char *str)
  {
    const unsigned char *cursor = (const unsigned char *) str;

    while (*cursor != '\0')
      {
        if (*cursor >= 0x80)
          return false;
        cursor++;
      }

    return true;
  }