@@ -18,6 +18,10 @@ extern int winerror_to_errno(int);
18
18
#include <sys/ioctl.h>
19
19
#endif
20
20
21
+ #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
22
+ #include <iconv.h>
23
+ #endif
24
+
21
25
#ifdef HAVE_FCNTL_H
22
26
#include <fcntl.h>
23
27
#endif /* HAVE_FCNTL_H */
@@ -93,6 +97,12 @@ _Py_device_encoding(int fd)
93
97
static size_t
94
98
is_valid_wide_char (wchar_t ch )
95
99
{
100
+ #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
101
+ /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
102
+ for non-Unicode locales, which makes values higher than MAX_UNICODE
103
+ possibly valid. */
104
+ return 1 ;
105
+ #endif
96
106
if (Py_UNICODE_IS_SURROGATE (ch )) {
97
107
// Reject lone surrogate characters
98
108
return 0 ;
@@ -922,6 +932,102 @@ _Py_GetLocaleEncodingObject(void)
922
932
return str ;
923
933
}
924
934
935
+ #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
936
+
937
/* Check whether the current locale uses a non-Unicode internal wchar_t form.

   Return 1 if wchar_t values of the current locale have to be converted
   before they can be treated as Unicode code points, 0 otherwise. */
int
_Py_LocaleUsesNonUnicodeWchar(void)
{
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
       non-Unicode locales and hence needs conversion to UTF first. */
    const char *codeset = nl_langinfo(CODESET);

    /* A missing or empty codeset name carries no information about the
       locale, so do not claim that a conversion is required. */
    if (codeset == NULL || codeset[0] == '\0') {
        return 0;
    }

    /* 646 refers to ISO/IEC 646 standard that corresponds to ASCII encoding */
    if (strcmp(codeset, "UTF-8") == 0 || strcmp(codeset, "646") == 0) {
        return 0;
    }
    return 1;
}
950
+
951
+ static wchar_t *
952
+ _Py_ConvertWCharForm (const wchar_t * source , Py_ssize_t size ,
953
+ const char * tocode , const char * fromcode )
954
+ {
955
+ Py_BUILD_ASSERT (sizeof (wchar_t ) == 4 );
956
+
957
+ /* Ensure we won't overflow the size. */
958
+ if (size > (PY_SSIZE_T_MAX / (Py_ssize_t )sizeof (wchar_t ))) {
959
+ PyErr_NoMemory ();
960
+ return NULL ;
961
+ }
962
+
963
+ /* the string doesn't have to be NULL terminated */
964
+ wchar_t * target = PyMem_Malloc (size * sizeof (wchar_t ));
965
+ if (target == NULL ) {
966
+ PyErr_NoMemory ();
967
+ return NULL ;
968
+ }
969
+
970
+ iconv_t cd = iconv_open (tocode , fromcode );
971
+ if (cd == (iconv_t )- 1 ) {
972
+ PyErr_Format (PyExc_ValueError , "iconv_open() failed" );
973
+ PyMem_Free (target );
974
+ return NULL ;
975
+ }
976
+
977
+ char * inbuf = (char * ) source ;
978
+ char * outbuf = (char * ) target ;
979
+ size_t inbytesleft = sizeof (wchar_t ) * size ;
980
+ size_t outbytesleft = inbytesleft ;
981
+
982
+ size_t ret = iconv (cd , & inbuf , & inbytesleft , & outbuf , & outbytesleft );
983
+ if (ret == DECODE_ERROR ) {
984
+ PyErr_Format (PyExc_ValueError , "iconv() failed" );
985
+ PyMem_Free (target );
986
+ iconv_close (cd );
987
+ return NULL ;
988
+ }
989
+
990
+ iconv_close (cd );
991
+ return target ;
992
+ }
993
+
994
+ /* Convert a wide character string to the UCS-4 encoded string. This
995
+ is necessary on systems where internal form of wchar_t are not Unicode
996
+ code points (e.g. Oracle Solaris).
997
+
998
+ Return a pointer to a newly allocated string, use PyMem_Free() to free
999
+ the memory. Return NULL and raise exception on conversion or memory
1000
+ allocation error. */
1001
+ wchar_t *
1002
+ _Py_DecodeNonUnicodeWchar (const wchar_t * native , Py_ssize_t size )
1003
+ {
1004
+ return _Py_ConvertWCharForm (native , size , "UCS-4-INTERNAL" , "wchar_t" );
1005
+ }
1006
+
1007
+ /* Convert a UCS-4 encoded string to native wide character string. This
1008
+ is necessary on systems where internal form of wchar_t are not Unicode
1009
+ code points (e.g. Oracle Solaris).
1010
+
1011
+ The conversion is done in place. This can be done because both wchar_t
1012
+ and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
1013
+ to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
1014
+ which is currently the only system using these functions; it doesn't have
1015
+ to be for other systems).
1016
+
1017
+ Return 0 on success. Return -1 and raise exception on conversion
1018
+ or memory allocation error. */
1019
+ int
1020
+ _Py_EncodeNonUnicodeWchar_InPlace (wchar_t * unicode , Py_ssize_t size )
1021
+ {
1022
+ wchar_t * result = _Py_ConvertWCharForm (unicode , size , "wchar_t" , "UCS-4-INTERNAL" );
1023
+ if (!result ) {
1024
+ return -1 ;
1025
+ }
1026
+ memcpy (unicode , result , size * sizeof (wchar_t ));
1027
+ PyMem_Free (result );
1028
+ return 0 ;
1029
+ }
1030
+ #endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
925
1031
926
1032
#ifdef MS_WINDOWS
927
1033
static __int64 secs_between_epochs = 11644473600 ; /* Seconds between 1.1.1601 and 1.1.1970 */
0 commit comments