[svn:parrot] r49751 - in branches/string_checks: config/gen/makefiles src/string/encoding t/op

nwellnhof at svn.parrot.org nwellnhof at svn.parrot.org
Sun Oct 31 15:01:56 UTC 2010


Author: nwellnhof
Date: Sun Oct 31 15:01:56 2010
New Revision: 49751
URL: https://trac.parrot.org/parrot/changeset/49751

Log:
[str] Add checks for UCS-2 and UCS-4

Modified:
   branches/string_checks/config/gen/makefiles/root.in
   branches/string_checks/src/string/encoding/shared.c
   branches/string_checks/src/string/encoding/shared.h
   branches/string_checks/src/string/encoding/ucs2.c
   branches/string_checks/src/string/encoding/ucs4.c
   branches/string_checks/t/op/stringu.t

Modified: branches/string_checks/config/gen/makefiles/root.in
==============================================================================
--- branches/string_checks/config/gen/makefiles/root.in	Sun Oct 31 15:01:21 2010	(r49750)
+++ branches/string_checks/config/gen/makefiles/root.in	Sun Oct 31 15:01:56 2010	(r49751)
@@ -1660,11 +1660,14 @@
     src/string/encoding/shared.h \
     src/string/unicode.h
 src/string/encoding/utf16$(O) : $(PARROT_H_HEADERS) \
-    src/string/encoding/shared.h
+    src/string/encoding/shared.h \
+    src/string/unicode.h
 src/string/encoding/ucs2$(O) : $(PARROT_H_HEADERS) \
-    src/string/encoding/shared.h
+    src/string/encoding/shared.h \
+    src/string/unicode.h
 src/string/encoding/ucs4$(O) : $(PARROT_H_HEADERS) \
-    src/string/encoding/shared.h
+    src/string/encoding/shared.h \
+    src/string/unicode.h
 
 src/pbc_merge$(O) : \
     $(INC_DIR)/embed.h \

Modified: branches/string_checks/src/string/encoding/shared.c
==============================================================================
--- branches/string_checks/src/string/encoding/shared.c	Sun Oct 31 15:01:21 2010	(r49750)
+++ branches/string_checks/src/string/encoding/shared.c	Sun Oct 31 15:01:56 2010	(r49751)
@@ -1300,37 +1300,6 @@
 
 /*
 
-=item C<UINTVAL unicode_validate(PARROT_INTERP, const STRING *src)>
-
-Returns 1 if the STRING C<src> is a valid unicode string, returns 0 otherwise.
-
-=cut
-
-*/
-
-UINTVAL
-unicode_validate(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(unicode_validate)
-    String_iter iter;
-    const UINTVAL length = Parrot_str_length(interp, src);
-
-    STRING_ITER_INIT(interp, &iter);
-    while (iter.charpos < length) {
-        const UINTVAL codepoint = STRING_iter_get_and_advance(interp, src, &iter);
-        /* Check for Unicode non-characters */
-        if (codepoint >= 0xfdd0
-        && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
-        &&  codepoint <= 0x10ffff)
-            return 0;
-    }
-
-    return 1;
-}
-
-
-/*
-
 =item C<STRING* unicode_compose(PARROT_INTERP, const STRING *src)>
 
 If Parrot is built with ICU, composes the STRING C<src>. Attempts to

Modified: branches/string_checks/src/string/encoding/shared.h
==============================================================================
--- branches/string_checks/src/string/encoding/shared.h	Sun Oct 31 15:01:21 2010	(r49750)
+++ branches/string_checks/src/string/encoding/shared.h	Sun Oct 31 15:01:56 2010	(r49751)
@@ -292,10 +292,6 @@
 STRING* unicode_upcase_first(PARROT_INTERP, SHIM(const STRING *src))
         __attribute__nonnull__(1);
 
-UINTVAL unicode_validate(PARROT_INTERP, ARGIN(const STRING *src))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2);
-
 #define ASSERT_ARGS_encoding_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(lhs) \
@@ -418,9 +414,6 @@
     , PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_unicode_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_unicode_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src))
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 /* HEADERIZER END: src/string/encoding/shared.c */
 

Modified: branches/string_checks/src/string/encoding/ucs2.c
==============================================================================
--- branches/string_checks/src/string/encoding/ucs2.c	Sun Oct 31 15:01:21 2010	(r49750)
+++ branches/string_checks/src/string/encoding/ucs2.c	Sun Oct 31 15:01:56 2010	(r49751)
@@ -19,6 +19,7 @@
 */
 
 #include "parrot/parrot.h"
+#include "../unicode.h"
 #include "shared.h"
 
 /* HEADERIZER HFILE: none */
@@ -26,6 +27,10 @@
 /* HEADERIZER BEGIN: static */
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 
+PARROT_INLINE
+static void ucs2_check_codepoint(PARROT_INTERP, UINTVAL c)
+        __attribute__nonnull__(1);
+
 static size_t ucs2_hash(SHIM_INTERP,
     ARGIN(const STRING *src),
     size_t hashval)
@@ -45,10 +50,11 @@
         __attribute__nonnull__(3)
         FUNC_MODIFIES(*i);
 
-static void ucs2_iter_set_and_advance(SHIM_INTERP,
+static void ucs2_iter_set_and_advance(PARROT_INTERP,
     ARGMOD(STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL c)
+        __attribute__nonnull__(1)
         __attribute__nonnull__(2)
         __attribute__nonnull__(3)
         FUNC_MODIFIES(*str)
@@ -83,6 +89,8 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+#define ASSERT_ARGS_ucs2_check_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp))
 #define ASSERT_ARGS_ucs2_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_ucs2_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -92,7 +100,8 @@
        PARROT_ASSERT_ARG(str) \
     , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_ucs2_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(str) \
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
     , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_ucs2_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(i))
@@ -137,13 +146,36 @@
     /* conversion to utf16 downgrads to ucs-2 if possible - check result */
     if (result->encoding == Parrot_utf16_encoding_ptr)
         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
-            "can't convert string with surrogates to ucs2");
+            "Lossy conversion to UCS-2\n");
 
     return result;
 }
 
 /*
 
+=item C<static void ucs2_check_codepoint(PARROT_INTERP, UINTVAL c)>
+
+Throws an exception if codepoint C<c> is invalid.
+
+=cut
+
+*/
+
+PARROT_INLINE
+static void
+ucs2_check_codepoint(PARROT_INTERP, UINTVAL c)
+{
+    ASSERT_ARGS(ucs2_check_codepoint)
+
+    if (UNICODE_IS_SURROGATE(c)
+    || (c >= 0xFDD0 && c <= 0xFDEF)
+    ||  c >= 0xFFFE)
+        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                "Invalid character in UCS-2 string\n");
+}
+
+/*
+
 =item C<static UINTVAL ucs2_scan(PARROT_INTERP, const STRING *src)>
 
 Returns the number of codepoints in string C<src>.
@@ -157,7 +189,19 @@
 ucs2_scan(PARROT_INTERP, ARGIN(const STRING *src))
 {
     ASSERT_ARGS(ucs2_scan)
-    return src->bufused >> 1;
+    const utf16_t * const ptr = (utf16_t *)src->strstart;
+    const UINTVAL         len = src->bufused >> 1;
+    UINTVAL               i;
+
+    if (src->bufused & 1)
+        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+            "Unaligned end in UCS-2 string\n");
+
+    for (i = 0; i < len; ++i) {
+        ucs2_check_codepoint(interp, ptr[i]);
+    }
+
+    return len;
 }
 
 /*
@@ -174,8 +218,8 @@
 ucs2_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
 {
     ASSERT_ARGS(ucs2_ord)
-    const UINTVAL len = STRING_length(src);
-    const Parrot_UInt2 *s;
+    const utf16_t * const ptr = (utf16_t *)src->strstart;
+    const UINTVAL         len = STRING_length(src);
 
     if (idx < 0)
         idx += len;
@@ -183,9 +227,7 @@
     if ((UINTVAL)idx >= len)
         encoding_ord_error(interp, src, idx);
 
-    s = (const Parrot_UInt2 *)src->strstart;
-
-    return s[idx];
+    return ptr[idx];
 }
 
 
@@ -205,9 +247,9 @@
     ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
 {
     ASSERT_ARGS(ucs2_iter_get)
-    const Parrot_UInt2 * const s = (const Parrot_UInt2 *)str->strstart;
+    const utf16_t * const ptr = (utf16_t *)str->strstart;
 
-    return s[i->charpos + offset];
+    return ptr[i->charpos + offset];
 }
 
 /*
@@ -247,10 +289,10 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i))
 {
     ASSERT_ARGS(ucs2_iter_get_and_advance)
-    const Parrot_UInt2 * const s = (Parrot_UInt2 *)str->strstart;
-    const UINTVAL c = s[i->charpos];
+    const utf16_t * const ptr = (utf16_t *)str->strstart;
+    const UINTVAL         c   = ptr[i->charpos];
 
-    i->charpos++;
+    i->charpos += 1;
     i->bytepos += 2;
 
     return c;
@@ -269,15 +311,17 @@
 */
 
 static void
-ucs2_iter_set_and_advance(SHIM_INTERP,
+ucs2_iter_set_and_advance(PARROT_INTERP,
     ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
 {
     ASSERT_ARGS(ucs2_iter_set_and_advance)
-    Parrot_UInt2 * const s = (Parrot_UInt2 *) str->strstart;
+    utf16_t * const ptr = (utf16_t *)str->strstart;
 
-    s[i->charpos] = c;
+    ucs2_check_codepoint(interp, c);
 
-    i->charpos++;
+    ptr[i->charpos] = c;
+
+    i->charpos += 1;
     i->bytepos += 2;
 }
 
@@ -318,16 +362,13 @@
 {
     ASSERT_ARGS(ucs2_hash)
     DECL_CONST_CAST;
-    STRING * const s = PARROT_const_cast(STRING *, src);
-    const Parrot_UInt2 *pos;
-    UINTVAL len;
-
-    pos = (const Parrot_UInt2*)s->strstart;
-    len = s->strlen;
+    STRING * const s   = PARROT_const_cast(STRING *, src);
+    const utf16_t *ptr = (utf16_t *)s->strstart;
+    UINTVAL        len = s->strlen;
 
     while (len--) {
         hashval += hashval << 5;
-        hashval += *(pos++);
+        hashval += *(ptr++);
     }
 
     s->hashval = hashval;
@@ -348,8 +389,8 @@
     encoding_compare,
     encoding_index,
     encoding_rindex,
-    encoding_hash,
-    unicode_validate,
+    ucs2_hash,
+    encoding_validate,
 
     ucs2_scan,
     ucs2_ord,

Modified: branches/string_checks/src/string/encoding/ucs4.c
==============================================================================
--- branches/string_checks/src/string/encoding/ucs4.c	Sun Oct 31 15:01:21 2010	(r49750)
+++ branches/string_checks/src/string/encoding/ucs4.c	Sun Oct 31 15:01:56 2010	(r49751)
@@ -19,6 +19,7 @@
 */
 
 #include "parrot/parrot.h"
+#include "../unicode.h"
 #include "shared.h"
 
 /* HEADERIZER HFILE: none */
@@ -45,10 +46,11 @@
         __attribute__nonnull__(3)
         FUNC_MODIFIES(*i);
 
-static void ucs4_iter_set_and_advance(SHIM_INTERP,
+static void ucs4_iter_set_and_advance(PARROT_INTERP,
     ARGMOD(STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL c)
+        __attribute__nonnull__(1)
         __attribute__nonnull__(2)
         __attribute__nonnull__(3)
         FUNC_MODIFIES(*str)
@@ -73,7 +75,8 @@
         __attribute__nonnull__(2);
 
 PARROT_WARN_UNUSED_RESULT
-static UINTVAL ucs4_scan(SHIM_INTERP, ARGIN(const STRING *src))
+static UINTVAL ucs4_scan(PARROT_INTERP, ARGIN(const STRING *src))
+        __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
 PARROT_WARN_UNUSED_RESULT
@@ -91,7 +94,8 @@
        PARROT_ASSERT_ARG(str) \
     , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_ucs4_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(str) \
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
     , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_ucs4_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(i))
@@ -101,7 +105,8 @@
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_ucs4_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(src))
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_ucs4_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
@@ -125,24 +130,23 @@
 ucs4_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
 {
     ASSERT_ARGS(ucs4_to_encoding)
-    UINTVAL       len;
-    STRING       *res;
-    Parrot_UInt4 *buf;
+    const UINTVAL  len = src->strlen;
+    STRING        *res;
+    utf32_t       *ptr;
 
     if (src->encoding == Parrot_ucs4_encoding_ptr)
-        return Parrot_str_clone(interp, src);
+        return Parrot_str_copy(interp, src);
 
-    len = STRING_length(src);
     res = Parrot_str_new_init(interp, NULL, len * 4,
             Parrot_ucs4_encoding_ptr, 0);
-    buf = (Parrot_UInt4 *) res->strstart;
+    ptr = (utf32_t *)res->strstart;
 
     if (STRING_max_bytes_per_codepoint(src) == 1) {
-        const unsigned char *s = (const unsigned char *)src->strstart;
+        const unsigned char *s = (unsigned char *)src->strstart;
         UINTVAL i;
 
         for (i = 0; i < len; i++) {
-            buf[i] = s[i];
+            ptr[i] = s[i];
         }
     }
     else {
@@ -151,7 +155,7 @@
         STRING_ITER_INIT(interp, &iter);
 
         while (iter.charpos < len) {
-            buf[iter.charpos] = STRING_iter_get_and_advance(interp, src, &iter);
+            ptr[iter.charpos] = STRING_iter_get_and_advance(interp, src, &iter);
         }
     }
 
@@ -174,11 +178,26 @@
 
 PARROT_WARN_UNUSED_RESULT
 static UINTVAL
-ucs4_scan(SHIM_INTERP, ARGIN(const STRING *src))
+ucs4_scan(PARROT_INTERP, ARGIN(const STRING *src))
 {
     ASSERT_ARGS(ucs4_scan)
+    const utf32_t * const ptr = (utf32_t *)src->strstart;
+    const UINTVAL         len = src->bufused >> 2;
+    UINTVAL               i;
+
+    if (src->bufused & 3)
+        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+            "Unaligned end in UCS-4 string\n");
+
+    for (i = 0; i < len; ++i) {
+        UINTVAL c = ptr[i];
+
+        if (UNICODE_IS_INVALID(c))
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                    "Invalid character in UCS-4 string\n");
+    }
 
-    return src->bufused >> 2;
+    return len;
 }
 
 
@@ -196,8 +215,8 @@
 ucs4_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
 {
     ASSERT_ARGS(ucs4_ord)
-    const UINTVAL  len = STRING_length(src);
-    const Parrot_UInt4 *s;
+    const utf32_t * const ptr = (utf32_t *)src->strstart;
+    const UINTVAL         len = src->strlen;
 
     if (idx < 0)
         idx += len;
@@ -205,9 +224,7 @@
     if ((UINTVAL)idx >= len)
         encoding_ord_error(interp, src, idx);
 
-    s = (const Parrot_UInt4 *)src->strstart;
-
-    return s[idx];
+    return ptr[idx];
 }
 
 
@@ -227,9 +244,9 @@
     ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
 {
     ASSERT_ARGS(ucs4_iter_get)
-    const Parrot_UInt4 * const s = (const Parrot_UInt4 *)str->strstart;
+    const utf32_t * const ptr = (utf32_t *)str->strstart;
 
-    return s[i->charpos + offset];
+    return ptr[i->charpos + offset];
 }
 
 
@@ -271,10 +288,10 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i))
 {
     ASSERT_ARGS(ucs4_iter_get_and_advance)
-    const Parrot_UInt4 * const s = (const Parrot_UInt4 *)str->strstart;
-    const UINTVAL c = s[i->charpos];
+    const utf32_t * const ptr = (utf32_t *)str->strstart;
+    const UINTVAL         c   = ptr[i->charpos];
 
-    i->charpos++;
+    i->charpos += 1;
     i->bytepos += 4;
 
     return c;
@@ -294,15 +311,19 @@
 */
 
 static void
-ucs4_iter_set_and_advance(SHIM_INTERP,
+ucs4_iter_set_and_advance(PARROT_INTERP,
     ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
 {
     ASSERT_ARGS(ucs4_iter_set_and_advance)
-    Parrot_UInt4 * const s = (Parrot_UInt4 *)str->strstart;
+    utf32_t * const ptr = (utf32_t *)str->strstart;
 
-    s[i->charpos] = c;
+    if (UNICODE_IS_INVALID(c))
+        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                "Invalid character in UCS-4 string\n");
 
-    i->charpos++;
+    ptr[i->charpos] = c;
+
+    i->charpos += 1;
     i->bytepos += 4;
 }
 
@@ -345,16 +366,13 @@
 {
     ASSERT_ARGS(ucs4_hash)
     DECL_CONST_CAST;
-    STRING * const s = PARROT_const_cast(STRING *, src);
-    const Parrot_UInt4 *pos;
-    UINTVAL len;
-
-    pos = (const Parrot_UInt4 *)s->strstart;
-    len = s->strlen;
+    STRING * const  s   = PARROT_const_cast(STRING *, src);
+    const utf32_t  *ptr = (utf32_t *)s->strstart;
+    UINTVAL         len = s->strlen;
 
     while (len--) {
         hashval += hashval << 5;
-        hashval += *(pos++);
+        hashval += *(ptr++);
     }
 
     s->hashval = hashval;
@@ -377,7 +395,7 @@
     encoding_index,
     encoding_rindex,
     ucs4_hash,
-    unicode_validate,
+    encoding_validate,
 
     ucs4_scan,
     ucs4_ord,

Modified: branches/string_checks/t/op/stringu.t
==============================================================================
--- branches/string_checks/t/op/stringu.t	Sun Oct 31 15:01:21 2010	(r49750)
+++ branches/string_checks/t/op/stringu.t	Sun Oct 31 15:01:56 2010	(r49751)
@@ -6,7 +6,7 @@
 use warnings;
 use lib qw( . lib ../lib ../../lib );
 use Test::More;
-use Parrot::Test tests => 36;
+use Parrot::Test tests => 40;
 use Parrot::Config;
 
 =head1 NAME
@@ -696,10 +696,13 @@
 OUT
 
 sub units_to_code {
+    my $bytes_per_unit = shift;
+
+    my $pack_format = $bytes_per_unit == 2 ? 'S*' : 'L*';
     my $code = '';
 
     for my $unit (@_) {
-        my $str = pack('S*', @$unit);
+        my $str = pack($pack_format, @$unit);
         $str =~ s/./sprintf("\\x%02X", ord($&))/egs;
         $code .= qq{    'test_chars'(binary:"$str")\n};
     }
@@ -709,6 +712,7 @@
 
 my $code = qq{    'test_chars'(binary:"\\x41\\x42\\x43")\n};
 $code .= units_to_code(
+    2,
     [ 0xD800 ],
     [ 0xDFFF ],
     [ 0xD800, 0x0041 ],
@@ -768,6 +772,7 @@
 OUT
 
 $code = units_to_code(
+    2,
     [ 0x0041 ],
     [ 0xD7FF ],
     [ 0xE000 ],
@@ -808,6 +813,218 @@
 0x10FFFD
 OUT
 
+$code = qq{    'test_chars'(binary:"\\x41\\x42\\x43")\n};
+$code .= units_to_code(
+    2,
+    [ 0xD800 ],
+    [ 0xDFFF ],
+    [ 0xD800, 0x0041 ],
+    [ 0xD900, 0xDAFF ],
+    [ 0xDBFF, 0xD800 ],
+    [ 0xDC00, 0xD8FF ],
+    [ 0xDDFF, 0xDE00 ],
+    [ 0xDFFF, 0x0041 ],
+    [ 0xFDD0 ],
+    [ 0xFDEF ],
+    [ 0xFFFE ],
+    [ 0xFFFF ],
+    [ 0xD800, 0xDC00 ],
+    [ 0xD912, 0xDE34 ],
+    [ 0xDBFF, 0xDFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'illegal ucs2 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc eh, ex, bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    eh = new 'ExceptionHandler'
+    set_addr eh, handler
+    push_eh eh
+    chars = bb.'get_string'('ucs2')
+    say 'valid'
+    goto end
+  handler:
+    .local pmc ex
+    .get_results (ex)
+    \$S0 = ex['message']
+    print \$S0
+  end:
+    pop_eh
+.end
+CODE
+Unaligned end in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+OUT
+
+$code = units_to_code(
+    2,
+    [ 0x0041 ],
+    [ 0xD7FF ],
+    [ 0xE000 ],
+    [ 0xFDCF ],
+    [ 0xFDF0 ],
+    [ 0xFFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'valid ucs2 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    chars = bb.'get_string'('ucs2')
+    \$I0 = ord chars
+    \$P0 = new 'FixedIntegerArray', 1
+    \$P0[0] = \$I0
+    \$S0 = sprintf '0x%X', \$P0
+    say \$S0
+.end
+CODE
+0x41
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+OUT
+
+$code = <<CODE;
+    'test_chars'(binary:"\\x00\\x00\\x00")
+    'test_chars'(binary:"\\x00\\x00\\x00\\x00\\x00")
+    'test_chars'(binary:"\\x00\\x00\\x00\\x00\\x00\\x00")
+CODE
+$code .= units_to_code(
+    4,
+    [ 0xD800 ],
+    [ 0xDFFF ],
+    [ 0xFDD0 ],
+    [ 0xFDEF ],
+    [ 0xFFFE ],
+    [ 0xFFFF ],
+    [ 0x01FFFE ],
+    [ 0x02FFFF ],
+    [ 0x10FFFE ],
+    [ 0x10FFFF ],
+    [ 0x110000 ],
+    [ 0x12345678 ],
+    [ 0xFFFFFFFF ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'illegal ucs4 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc eh, ex, bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    eh = new 'ExceptionHandler'
+    set_addr eh, handler
+    push_eh eh
+    chars = bb.'get_string'('ucs4')
+    say 'valid'
+    goto end
+  handler:
+    .local pmc ex
+    .get_results (ex)
+    \$S0 = ex['message']
+    print \$S0
+  end:
+    pop_eh
+.end
+CODE
+Unaligned end in UCS-4 string
+Unaligned end in UCS-4 string
+Unaligned end in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+OUT
+
+$code = units_to_code(
+    4,
+    [ 0x0041 ],
+    [ 0xD7FF ],
+    [ 0xE000 ],
+    [ 0xFDCF ],
+    [ 0xFDF0 ],
+    [ 0xFFFD ],
+    [ 0x010000 ],
+    [ 0x01FFFD ],
+    [ 0x020000 ],
+    [ 0x07FFFD ],
+    [ 0x0B0000 ],
+    [ 0x10FFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'valid ucs4 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    chars = bb.'get_string'('ucs4')
+    \$I0 = ord chars
+    \$P0 = new 'FixedIntegerArray', 1
+    \$P0[0] = \$I0
+    \$S0 = sprintf '0x%X', \$P0
+    say \$S0
+.end
+CODE
+0x41
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+0x10000
+0x1FFFD
+0x20000
+0x7FFFD
+0xB0000
+0x10FFFD
+OUT
+
 SKIP: {
     skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
 


More information about the parrot-commits mailing list