[svn:parrot] r49751 - in branches/string_checks: config/gen/makefiles src/string/encoding t/op
nwellnhof at svn.parrot.org
nwellnhof at svn.parrot.org
Sun Oct 31 15:01:56 UTC 2010
Author: nwellnhof
Date: Sun Oct 31 15:01:56 2010
New Revision: 49751
URL: https://trac.parrot.org/parrot/changeset/49751
Log:
[str] Add checks for UCS-2 and UCS-4
Modified:
branches/string_checks/config/gen/makefiles/root.in
branches/string_checks/src/string/encoding/shared.c
branches/string_checks/src/string/encoding/shared.h
branches/string_checks/src/string/encoding/ucs2.c
branches/string_checks/src/string/encoding/ucs4.c
branches/string_checks/t/op/stringu.t
Modified: branches/string_checks/config/gen/makefiles/root.in
==============================================================================
--- branches/string_checks/config/gen/makefiles/root.in Sun Oct 31 15:01:21 2010 (r49750)
+++ branches/string_checks/config/gen/makefiles/root.in Sun Oct 31 15:01:56 2010 (r49751)
@@ -1660,11 +1660,14 @@
src/string/encoding/shared.h \
src/string/unicode.h
src/string/encoding/utf16$(O) : $(PARROT_H_HEADERS) \
- src/string/encoding/shared.h
+ src/string/encoding/shared.h \
+ src/string/unicode.h
src/string/encoding/ucs2$(O) : $(PARROT_H_HEADERS) \
- src/string/encoding/shared.h
+ src/string/encoding/shared.h \
+ src/string/unicode.h
src/string/encoding/ucs4$(O) : $(PARROT_H_HEADERS) \
- src/string/encoding/shared.h
+ src/string/encoding/shared.h \
+ src/string/unicode.h
src/pbc_merge$(O) : \
$(INC_DIR)/embed.h \
Modified: branches/string_checks/src/string/encoding/shared.c
==============================================================================
--- branches/string_checks/src/string/encoding/shared.c Sun Oct 31 15:01:21 2010 (r49750)
+++ branches/string_checks/src/string/encoding/shared.c Sun Oct 31 15:01:56 2010 (r49751)
@@ -1300,37 +1300,6 @@
/*
-=item C<UINTVAL unicode_validate(PARROT_INTERP, const STRING *src)>
-
-Returns 1 if the STRING C<src> is a valid unicode string, returns 0 otherwise.
-
-=cut
-
-*/
-
-UINTVAL
-unicode_validate(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(unicode_validate)
- String_iter iter;
- const UINTVAL length = Parrot_str_length(interp, src);
-
- STRING_ITER_INIT(interp, &iter);
- while (iter.charpos < length) {
- const UINTVAL codepoint = STRING_iter_get_and_advance(interp, src, &iter);
- /* Check for Unicode non-characters */
- if (codepoint >= 0xfdd0
- && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
- && codepoint <= 0x10ffff)
- return 0;
- }
-
- return 1;
-}
-
-
-/*
-
=item C<STRING* unicode_compose(PARROT_INTERP, const STRING *src)>
If Parrot is built with ICU, composes the STRING C<src>. Attempts to
Modified: branches/string_checks/src/string/encoding/shared.h
==============================================================================
--- branches/string_checks/src/string/encoding/shared.h Sun Oct 31 15:01:21 2010 (r49750)
+++ branches/string_checks/src/string/encoding/shared.h Sun Oct 31 15:01:56 2010 (r49751)
@@ -292,10 +292,6 @@
STRING* unicode_upcase_first(PARROT_INTERP, SHIM(const STRING *src))
__attribute__nonnull__(1);
-UINTVAL unicode_validate(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
#define ASSERT_ARGS_encoding_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(lhs) \
@@ -418,9 +414,6 @@
, PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_unicode_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_unicode_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding/shared.c */
Modified: branches/string_checks/src/string/encoding/ucs2.c
==============================================================================
--- branches/string_checks/src/string/encoding/ucs2.c Sun Oct 31 15:01:21 2010 (r49750)
+++ branches/string_checks/src/string/encoding/ucs2.c Sun Oct 31 15:01:56 2010 (r49751)
@@ -19,6 +19,7 @@
*/
#include "parrot/parrot.h"
+#include "../unicode.h"
#include "shared.h"
/* HEADERIZER HFILE: none */
@@ -26,6 +27,10 @@
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+PARROT_INLINE
+static void ucs2_check_codepoint(PARROT_INTERP, UINTVAL c)
+ __attribute__nonnull__(1);
+
static size_t ucs2_hash(SHIM_INTERP,
ARGIN(const STRING *src),
size_t hashval)
@@ -45,10 +50,11 @@
__attribute__nonnull__(3)
FUNC_MODIFIES(*i);
-static void ucs2_iter_set_and_advance(SHIM_INTERP,
+static void ucs2_iter_set_and_advance(PARROT_INTERP,
ARGMOD(STRING *str),
ARGMOD(String_iter *i),
UINTVAL c)
+ __attribute__nonnull__(1)
__attribute__nonnull__(2)
__attribute__nonnull__(3)
FUNC_MODIFIES(*str)
@@ -83,6 +89,8 @@
__attribute__nonnull__(1)
__attribute__nonnull__(2);
+#define ASSERT_ARGS_ucs2_check_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_ucs2_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_ucs2_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -92,7 +100,8 @@
PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_ucs2_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(str) \
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_ucs2_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(i))
@@ -137,13 +146,36 @@
/* conversion to utf16 downgrades to ucs-2 if possible - check result */
if (result->encoding == Parrot_utf16_encoding_ptr)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
- "can't convert string with surrogates to ucs2");
+ "Lossy conversion to UCS-2\n");
return result;
}
/*
+=item C<static void ucs2_check_codepoint(PARROT_INTERP, UINTVAL c)>
+
+Throws an exception if codepoint C<c> is invalid.
+
+=cut
+
+*/
+
+PARROT_INLINE
+static void
+ucs2_check_codepoint(PARROT_INTERP, UINTVAL c)
+{
+ ASSERT_ARGS(ucs2_check_codepoint)
+
+ if (UNICODE_IS_SURROGATE(c)
+ || (c >= 0xFDD0 && c <= 0xFDEF)
+ || c >= 0xFFFE)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character in UCS-2 string\n");
+}
+
+/*
+
=item C<static UINTVAL ucs2_scan(PARROT_INTERP, const STRING *src)>
Returns the number of codepoints in string C<src>.
@@ -157,7 +189,19 @@
ucs2_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(ucs2_scan)
- return src->bufused >> 1;
+ const utf16_t * const ptr = (utf16_t *)src->strstart;
+ const UINTVAL len = src->bufused >> 1;
+ UINTVAL i;
+
+ if (src->bufused & 1)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Unaligned end in UCS-2 string\n");
+
+ for (i = 0; i < len; ++i) {
+ ucs2_check_codepoint(interp, ptr[i]);
+ }
+
+ return len;
}
/*
@@ -174,8 +218,8 @@
ucs2_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
{
ASSERT_ARGS(ucs2_ord)
- const UINTVAL len = STRING_length(src);
- const Parrot_UInt2 *s;
+ const utf16_t * const ptr = (utf16_t *)src->strstart;
+ const UINTVAL len = STRING_length(src);
if (idx < 0)
idx += len;
@@ -183,9 +227,7 @@
if ((UINTVAL)idx >= len)
encoding_ord_error(interp, src, idx);
- s = (const Parrot_UInt2 *)src->strstart;
-
- return s[idx];
+ return ptr[idx];
}
@@ -205,9 +247,9 @@
ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
{
ASSERT_ARGS(ucs2_iter_get)
- const Parrot_UInt2 * const s = (const Parrot_UInt2 *)str->strstart;
+ const utf16_t * const ptr = (utf16_t *)str->strstart;
- return s[i->charpos + offset];
+ return ptr[i->charpos + offset];
}
/*
@@ -247,10 +289,10 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i))
{
ASSERT_ARGS(ucs2_iter_get_and_advance)
- const Parrot_UInt2 * const s = (Parrot_UInt2 *)str->strstart;
- const UINTVAL c = s[i->charpos];
+ const utf16_t * const ptr = (utf16_t *)str->strstart;
+ const UINTVAL c = ptr[i->charpos];
- i->charpos++;
+ i->charpos += 1;
i->bytepos += 2;
return c;
@@ -269,15 +311,17 @@
*/
static void
-ucs2_iter_set_and_advance(SHIM_INTERP,
+ucs2_iter_set_and_advance(PARROT_INTERP,
ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
{
ASSERT_ARGS(ucs2_iter_set_and_advance)
- Parrot_UInt2 * const s = (Parrot_UInt2 *) str->strstart;
+ utf16_t * const ptr = (utf16_t *)str->strstart;
- s[i->charpos] = c;
+ ucs2_check_codepoint(interp, c);
- i->charpos++;
+ ptr[i->charpos] = c;
+
+ i->charpos += 1;
i->bytepos += 2;
}
@@ -318,16 +362,13 @@
{
ASSERT_ARGS(ucs2_hash)
DECL_CONST_CAST;
- STRING * const s = PARROT_const_cast(STRING *, src);
- const Parrot_UInt2 *pos;
- UINTVAL len;
-
- pos = (const Parrot_UInt2*)s->strstart;
- len = s->strlen;
+ STRING * const s = PARROT_const_cast(STRING *, src);
+ const utf16_t *ptr = (utf16_t *)s->strstart;
+ UINTVAL len = s->strlen;
while (len--) {
hashval += hashval << 5;
- hashval += *(pos++);
+ hashval += *(ptr++);
}
s->hashval = hashval;
@@ -348,8 +389,8 @@
encoding_compare,
encoding_index,
encoding_rindex,
- encoding_hash,
- unicode_validate,
+ ucs2_hash,
+ encoding_validate,
ucs2_scan,
ucs2_ord,
Modified: branches/string_checks/src/string/encoding/ucs4.c
==============================================================================
--- branches/string_checks/src/string/encoding/ucs4.c Sun Oct 31 15:01:21 2010 (r49750)
+++ branches/string_checks/src/string/encoding/ucs4.c Sun Oct 31 15:01:56 2010 (r49751)
@@ -19,6 +19,7 @@
*/
#include "parrot/parrot.h"
+#include "../unicode.h"
#include "shared.h"
/* HEADERIZER HFILE: none */
@@ -45,10 +46,11 @@
__attribute__nonnull__(3)
FUNC_MODIFIES(*i);
-static void ucs4_iter_set_and_advance(SHIM_INTERP,
+static void ucs4_iter_set_and_advance(PARROT_INTERP,
ARGMOD(STRING *str),
ARGMOD(String_iter *i),
UINTVAL c)
+ __attribute__nonnull__(1)
__attribute__nonnull__(2)
__attribute__nonnull__(3)
FUNC_MODIFIES(*str)
@@ -73,7 +75,8 @@
__attribute__nonnull__(2);
PARROT_WARN_UNUSED_RESULT
-static UINTVAL ucs4_scan(SHIM_INTERP, ARGIN(const STRING *src))
+static UINTVAL ucs4_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
__attribute__nonnull__(2);
PARROT_WARN_UNUSED_RESULT
@@ -91,7 +94,8 @@
PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_ucs4_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(str) \
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_ucs4_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(i))
@@ -101,7 +105,8 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_ucs4_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_ucs4_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
@@ -125,24 +130,23 @@
ucs4_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(ucs4_to_encoding)
- UINTVAL len;
- STRING *res;
- Parrot_UInt4 *buf;
+ const UINTVAL len = src->strlen;
+ STRING *res;
+ utf32_t *ptr;
if (src->encoding == Parrot_ucs4_encoding_ptr)
- return Parrot_str_clone(interp, src);
+ return Parrot_str_copy(interp, src);
- len = STRING_length(src);
res = Parrot_str_new_init(interp, NULL, len * 4,
Parrot_ucs4_encoding_ptr, 0);
- buf = (Parrot_UInt4 *) res->strstart;
+ ptr = (utf32_t *)res->strstart;
if (STRING_max_bytes_per_codepoint(src) == 1) {
- const unsigned char *s = (const unsigned char *)src->strstart;
+ const unsigned char *s = (unsigned char *)src->strstart;
UINTVAL i;
for (i = 0; i < len; i++) {
- buf[i] = s[i];
+ ptr[i] = s[i];
}
}
else {
@@ -151,7 +155,7 @@
STRING_ITER_INIT(interp, &iter);
while (iter.charpos < len) {
- buf[iter.charpos] = STRING_iter_get_and_advance(interp, src, &iter);
+ ptr[iter.charpos] = STRING_iter_get_and_advance(interp, src, &iter);
}
}
@@ -174,11 +178,26 @@
PARROT_WARN_UNUSED_RESULT
static UINTVAL
-ucs4_scan(SHIM_INTERP, ARGIN(const STRING *src))
+ucs4_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(ucs4_scan)
+ const utf32_t * const ptr = (utf32_t *)src->strstart;
+ const UINTVAL len = src->bufused >> 2;
+ UINTVAL i;
+
+ if (src->bufused & 3)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Unaligned end in UCS-4 string\n");
+
+ for (i = 0; i < len; ++i) {
+ UINTVAL c = ptr[i];
+
+ if (UNICODE_IS_INVALID(c))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character in UCS-4 string\n");
+ }
- return src->bufused >> 2;
+ return len;
}
@@ -196,8 +215,8 @@
ucs4_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
{
ASSERT_ARGS(ucs4_ord)
- const UINTVAL len = STRING_length(src);
- const Parrot_UInt4 *s;
+ const utf32_t * const ptr = (utf32_t *)src->strstart;
+ const UINTVAL len = src->strlen;
if (idx < 0)
idx += len;
@@ -205,9 +224,7 @@
if ((UINTVAL)idx >= len)
encoding_ord_error(interp, src, idx);
- s = (const Parrot_UInt4 *)src->strstart;
-
- return s[idx];
+ return ptr[idx];
}
@@ -227,9 +244,9 @@
ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
{
ASSERT_ARGS(ucs4_iter_get)
- const Parrot_UInt4 * const s = (const Parrot_UInt4 *)str->strstart;
+ const utf32_t * const ptr = (utf32_t *)str->strstart;
- return s[i->charpos + offset];
+ return ptr[i->charpos + offset];
}
@@ -271,10 +288,10 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i))
{
ASSERT_ARGS(ucs4_iter_get_and_advance)
- const Parrot_UInt4 * const s = (const Parrot_UInt4 *)str->strstart;
- const UINTVAL c = s[i->charpos];
+ const utf32_t * const ptr = (utf32_t *)str->strstart;
+ const UINTVAL c = ptr[i->charpos];
- i->charpos++;
+ i->charpos += 1;
i->bytepos += 4;
return c;
@@ -294,15 +311,19 @@
*/
static void
-ucs4_iter_set_and_advance(SHIM_INTERP,
+ucs4_iter_set_and_advance(PARROT_INTERP,
ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
{
ASSERT_ARGS(ucs4_iter_set_and_advance)
- Parrot_UInt4 * const s = (Parrot_UInt4 *)str->strstart;
+ utf32_t * const ptr = (utf32_t *)str->strstart;
- s[i->charpos] = c;
+ if (UNICODE_IS_INVALID(c))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character in UCS-4 string\n");
- i->charpos++;
+ ptr[i->charpos] = c;
+
+ i->charpos += 1;
i->bytepos += 4;
}
@@ -345,16 +366,13 @@
{
ASSERT_ARGS(ucs4_hash)
DECL_CONST_CAST;
- STRING * const s = PARROT_const_cast(STRING *, src);
- const Parrot_UInt4 *pos;
- UINTVAL len;
-
- pos = (const Parrot_UInt4 *)s->strstart;
- len = s->strlen;
+ STRING * const s = PARROT_const_cast(STRING *, src);
+ const utf32_t *ptr = (utf32_t *)s->strstart;
+ UINTVAL len = s->strlen;
while (len--) {
hashval += hashval << 5;
- hashval += *(pos++);
+ hashval += *(ptr++);
}
s->hashval = hashval;
@@ -377,7 +395,7 @@
encoding_index,
encoding_rindex,
ucs4_hash,
- unicode_validate,
+ encoding_validate,
ucs4_scan,
ucs4_ord,
Modified: branches/string_checks/t/op/stringu.t
==============================================================================
--- branches/string_checks/t/op/stringu.t Sun Oct 31 15:01:21 2010 (r49750)
+++ branches/string_checks/t/op/stringu.t Sun Oct 31 15:01:56 2010 (r49751)
@@ -6,7 +6,7 @@
use warnings;
use lib qw( . lib ../lib ../../lib );
use Test::More;
-use Parrot::Test tests => 36;
+use Parrot::Test tests => 40;
use Parrot::Config;
=head1 NAME
@@ -696,10 +696,13 @@
OUT
sub units_to_code {
+ my $bytes_per_unit = shift;
+
+ my $pack_format = $bytes_per_unit == 2 ? 'S*' : 'L*';
my $code = '';
for my $unit (@_) {
- my $str = pack('S*', @$unit);
+ my $str = pack($pack_format, @$unit);
$str =~ s/./sprintf("\\x%02X", ord($&))/egs;
$code .= qq{ 'test_chars'(binary:"$str")\n};
}
@@ -709,6 +712,7 @@
my $code = qq{ 'test_chars'(binary:"\\x41\\x42\\x43")\n};
$code .= units_to_code(
+ 2,
[ 0xD800 ],
[ 0xDFFF ],
[ 0xD800, 0x0041 ],
@@ -768,6 +772,7 @@
OUT
$code = units_to_code(
+ 2,
[ 0x0041 ],
[ 0xD7FF ],
[ 0xE000 ],
@@ -808,6 +813,218 @@
0x10FFFD
OUT
+$code = qq{ 'test_chars'(binary:"\\x41\\x42\\x43")\n};
+$code .= units_to_code(
+ 2,
+ [ 0xD800 ],
+ [ 0xDFFF ],
+ [ 0xD800, 0x0041 ],
+ [ 0xD900, 0xDAFF ],
+ [ 0xDBFF, 0xD800 ],
+ [ 0xDC00, 0xD8FF ],
+ [ 0xDDFF, 0xDE00 ],
+ [ 0xDFFF, 0x0041 ],
+ [ 0xFDD0 ],
+ [ 0xFDEF ],
+ [ 0xFFFE ],
+ [ 0xFFFF ],
+ [ 0xD800, 0xDC00 ],
+ [ 0xD912, 0xDE34 ],
+ [ 0xDBFF, 0xDFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'illegal ucs2 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc eh, ex, bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ eh = new 'ExceptionHandler'
+ set_addr eh, handler
+ push_eh eh
+ chars = bb.'get_string'('ucs2')
+ say 'valid'
+ goto end
+ handler:
+ .local pmc ex
+ .get_results (ex)
+ \$S0 = ex['message']
+ print \$S0
+ end:
+ pop_eh
+.end
+CODE
+Unaligned end in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+Invalid character in UCS-2 string
+OUT
+
+$code = units_to_code(
+ 2,
+ [ 0x0041 ],
+ [ 0xD7FF ],
+ [ 0xE000 ],
+ [ 0xFDCF ],
+ [ 0xFDF0 ],
+ [ 0xFFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'valid ucs2 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ chars = bb.'get_string'('ucs2')
+ \$I0 = ord chars
+ \$P0 = new 'FixedIntegerArray', 1
+ \$P0[0] = \$I0
+ \$S0 = sprintf '0x%X', \$P0
+ say \$S0
+.end
+CODE
+0x41
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+OUT
+
+$code = <<CODE;
+ 'test_chars'(binary:"\\x00\\x00\\x00")
+ 'test_chars'(binary:"\\x00\\x00\\x00\\x00\\x00")
+ 'test_chars'(binary:"\\x00\\x00\\x00\\x00\\x00\\x00")
+CODE
+$code .= units_to_code(
+ 4,
+ [ 0xD800 ],
+ [ 0xDFFF ],
+ [ 0xFDD0 ],
+ [ 0xFDEF ],
+ [ 0xFFFE ],
+ [ 0xFFFF ],
+ [ 0x01FFFE ],
+ [ 0x02FFFF ],
+ [ 0x10FFFE ],
+ [ 0x10FFFF ],
+ [ 0x110000 ],
+ [ 0x12345678 ],
+ [ 0xFFFFFFFF ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'illegal ucs4 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc eh, ex, bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ eh = new 'ExceptionHandler'
+ set_addr eh, handler
+ push_eh eh
+ chars = bb.'get_string'('ucs4')
+ say 'valid'
+ goto end
+ handler:
+ .local pmc ex
+ .get_results (ex)
+ \$S0 = ex['message']
+ print \$S0
+ end:
+ pop_eh
+.end
+CODE
+Unaligned end in UCS-4 string
+Unaligned end in UCS-4 string
+Unaligned end in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+Invalid character in UCS-4 string
+OUT
+
+$code = units_to_code(
+ 4,
+ [ 0x0041 ],
+ [ 0xD7FF ],
+ [ 0xE000 ],
+ [ 0xFDCF ],
+ [ 0xFDF0 ],
+ [ 0xFFFD ],
+ [ 0x010000 ],
+ [ 0x01FFFD ],
+ [ 0x020000 ],
+ [ 0x07FFFD ],
+ [ 0x0B0000 ],
+ [ 0x10FFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'valid ucs4 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ chars = bb.'get_string'('ucs4')
+ \$I0 = ord chars
+ \$P0 = new 'FixedIntegerArray', 1
+ \$P0[0] = \$I0
+ \$S0 = sprintf '0x%X', \$P0
+ say \$S0
+.end
+CODE
+0x41
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+0x10000
+0x1FFFD
+0x20000
+0x7FFFD
+0xB0000
+0x10FFFD
+OUT
+
SKIP: {
skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
More information about the parrot-commits
mailing list