/* Charset conversion. Copyright (C) 2001-2006 Free Software Foundation, Inc. Written by Bruno Haible and Simon Josefsson. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include <config.h> /* Specification. */ #include "striconv.h" #include <errno.h> #include <stdlib.h> #include <string.h> #if HAVE_ICONV # include <iconv.h> /* Get MB_LEN_MAX, CHAR_BIT. */ # include <limits.h> #endif #include "strdup.h" #include "c-strcase.h" #ifndef SIZE_MAX # define SIZE_MAX ((size_t) -1) #endif #if HAVE_ICONV int mem_cd_iconv (const char *src, size_t srclen, iconv_t cd, char **resultp, size_t *lengthp) { # define tmpbufsize 4096 size_t length; char *result; /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ # if defined _LIBICONV_VERSION \ || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) /* Set to the initial state. */ iconv (cd, NULL, NULL, NULL, NULL); # endif /* Determine the length we need. */ { size_t count = 0; char tmpbuf[tmpbufsize]; const char *inptr = src; size_t insize = srclen; while (insize > 0) { char *outptr = tmpbuf; size_t outsize = tmpbufsize; size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); if (res == (size_t)(-1)) { if (errno == E2BIG) ; else if (errno == EINVAL) break; else return -1; } # if !defined _LIBICONV_VERSION && !defined __GLIBC__ /* Irix iconv() inserts a NUL byte if it cannot convert. NetBSD iconv() inserts a question mark if it cannot convert. Only GNU libiconv and GNU libc are known to prefer to fail rather than doing a lossy conversion. */ else if (res > 0) { errno = EILSEQ; return -1; } # endif count += outptr - tmpbuf; } /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */ # if defined _LIBICONV_VERSION \ || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) { char *outptr = tmpbuf; size_t outsize = tmpbufsize; size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); if (res == (size_t)(-1)) return -1; count += outptr - tmpbuf; } # endif length = count; } if (length == 0) { *lengthp = 0; return 0; } result = (*resultp != NULL ? realloc (*resultp, length) : malloc (length)); if (result == NULL) { errno = ENOMEM; return -1; } *resultp = result; *lengthp = length; /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ # if defined _LIBICONV_VERSION \ || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) /* Return to the initial state. */ iconv (cd, NULL, NULL, NULL, NULL); # endif /* Do the conversion for real. */ { const char *inptr = src; size_t insize = srclen; char *outptr = result; size_t outsize = length; while (insize > 0) { size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); if (res == (size_t)(-1)) { if (errno == EINVAL) break; else return -1; } # if !defined _LIBICONV_VERSION && !defined __GLIBC__ /* Irix iconv() inserts a NUL byte if it cannot convert. NetBSD iconv() inserts a question mark if it cannot convert. Only GNU libiconv and GNU libc are known to prefer to fail rather than doing a lossy conversion. */ else if (res > 0) { errno = EILSEQ; return -1; } # endif } /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */ # if defined _LIBICONV_VERSION \ || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) { size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); if (res == (size_t)(-1)) return -1; } # endif if (outsize != 0) abort (); } return 0; # undef tmpbufsize } char * str_cd_iconv (const char *src, iconv_t cd) { /* For most encodings, a trailing NUL byte in the input will be converted to a trailing NUL byte in the output. But not for UTF-7. So that this function is usable for UTF-7, we have to exclude the NUL byte from the conversion and add it by hand afterwards. */ # if PROBABLY_SLOWER char *result = NULL; size_t length; int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length); char *final_result; if (retval < 0) { if (result != NULL) { int saved_errno = errno; free (result); errno = saved_errno; } return NULL; } /* Add the terminating NUL byte. */ final_result = (result != NULL ? realloc (result, length + 1) : malloc (length + 1)); if (final_result == NULL) { if (result != NULL) free (result); errno = ENOMEM; return NULL; } final_result[length] = '\0'; return final_result; # else char *result; size_t result_size; size_t length; const char *inptr = src; size_t inbytes_remaining = strlen (src); /* Make a guess for the worst-case output size, in order to avoid a realloc. It's OK if the guess is wrong as long as it is not zero and doesn't lead to an integer overflow. */ result_size = inbytes_remaining; { size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2); if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX) result_size *= MB_LEN_MAX; } result_size += 1; /* for the terminating NUL */ result = (char *) malloc (result_size); if (result == NULL) { errno = ENOMEM; return NULL; } /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ # if defined _LIBICONV_VERSION \ || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) /* Set to the initial state. */ iconv (cd, NULL, NULL, NULL, NULL); # endif /* Do the conversion. */ { char *outptr = result; size_t outbytes_remaining = result_size - 1; for (;;) { /* Here inptr + inbytes_remaining = src + strlen (src), outptr + outbytes_remaining = result + result_size - 1. */ size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &inbytes_remaining, &outptr, &outbytes_remaining); if (res == (size_t)(-1)) { if (errno == EINVAL) break; else if (errno == E2BIG) { size_t used = outptr - result; size_t newsize = result_size * 2; char *newresult; if (!(newsize > result_size)) { errno = ENOMEM; goto failed; } newresult = (char *) realloc (result, newsize); if (newresult == NULL) { errno = ENOMEM; goto failed; } result = newresult; result_size = newsize; outptr = result + used; outbytes_remaining = result_size - 1 - used; } else goto failed; } # if !defined _LIBICONV_VERSION && !defined __GLIBC__ /* Irix iconv() inserts a NUL byte if it cannot convert. NetBSD iconv() inserts a question mark if it cannot convert. Only GNU libiconv and GNU libc are known to prefer to fail rather than doing a lossy conversion. */ else if (res > 0) { errno = EILSEQ; goto failed; } # endif else break; } /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */ # if defined _LIBICONV_VERSION \ || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) for (;;) { /* Here outptr + outbytes_remaining = result + result_size - 1. */ size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining); if (res == (size_t)(-1)) { if (errno == E2BIG) { size_t used = outptr - result; size_t newsize = result_size * 2; char *newresult; if (!(newsize > result_size)) { errno = ENOMEM; goto failed; } newresult = (char *) realloc (result, newsize); if (newresult == NULL) { errno = ENOMEM; goto failed; } result = newresult; result_size = newsize; outptr = result + used; outbytes_remaining = result_size - 1 - used; } else goto failed; } else break; } # endif /* Add the terminating NUL byte. */ *outptr++ = '\0'; length = outptr - result; } /* Give away unused memory. */ if (length < result_size) { char *smaller_result = (char *) realloc (result, length); if (smaller_result != NULL) result = smaller_result; } return result; failed: { int saved_errno = errno; free (result); errno = saved_errno; return NULL; } # endif } #endif char * str_iconv (const char *src, const char *from_codeset, const char *to_codeset) { if (c_strcasecmp (from_codeset, to_codeset) == 0) return strdup (src); else { #if HAVE_ICONV iconv_t cd; char *result; /* Avoid glibc-2.1 bug with EUC-KR. */ # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION if (c_strcasecmp (from_codeset, "EUC-KR") == 0 || c_strcasecmp (to_codeset, "EUC-KR") == 0) { errno = EINVAL; return NULL; } # endif cd = iconv_open (to_codeset, from_codeset); if (cd == (iconv_t) -1) return NULL; result = str_cd_iconv (src, cd); if (result == NULL) { /* Close cd, but preserve the errno from str_cd_iconv. */ int saved_errno = errno; iconv_close (cd); errno = saved_errno; } else { if (iconv_close (cd) < 0) { /* Return NULL, but free the allocated memory, and while doing that, preserve the errno from iconv_close. */ int saved_errno = errno; free (result); errno = saved_errno; return NULL; } } return result; #else /* This is a different error code than if iconv_open existed but didn't support from_codeset and to_codeset, so that the caller can emit an error message such as "iconv() is not supported. Installing GNU libiconv and then reinstalling this package would fix this." */ errno = ENOSYS; return NULL; #endif } }