[svn:parrot] r49750 - in branches/string_checks: config/gen/makefiles src/string src/string/encoding t/op t/pmc t/tools
nwellnhof at svn.parrot.org
nwellnhof at svn.parrot.org
Sun Oct 31 15:01:22 UTC 2010
Author: nwellnhof
Date: Sun Oct 31 15:01:21 2010
New Revision: 49750
URL: https://trac.parrot.org/parrot/changeset/49750
Log:
[str] Rewrite UTF-16 encoding to work without ICU
Perform all checks during initial scan.
Modified:
branches/string_checks/config/gen/makefiles/root.in
branches/string_checks/src/string/encoding/shared.c
branches/string_checks/src/string/encoding/shared.h
branches/string_checks/src/string/encoding/utf16.c
branches/string_checks/src/string/encoding/utf8.c
branches/string_checks/src/string/unicode.h
branches/string_checks/t/op/string_cs.t
branches/string_checks/t/op/stringu.t
branches/string_checks/t/pmc/bytebuffer.t
branches/string_checks/t/tools/pbc_disassemble.t
branches/string_checks/t/tools/pbc_dump.t
Modified: branches/string_checks/config/gen/makefiles/root.in
==============================================================================
--- branches/string_checks/config/gen/makefiles/root.in Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/config/gen/makefiles/root.in Sun Oct 31 15:01:21 2010 (r49750)
@@ -1815,7 +1815,7 @@
t/configure/*.t \
t/steps/*.t
PBC_TEST_FILES = \
-#IF(has_icu): t/op/testlib/test_strings.pbc \
+ t/op/testlib/test_strings.pbc \
t/pmc/testlib/annotations.pbc \
t/pmc/testlib/number.pbc
Modified: branches/string_checks/src/string/encoding/shared.c
==============================================================================
--- branches/string_checks/src/string/encoding/shared.c Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/src/string/encoding/shared.c Sun Oct 31 15:01:21 2010 (r49750)
@@ -48,6 +48,70 @@
/*
+=item C<STRING * encoding_to_encoding(PARROT_INTERP, const STRING *src, const
+STR_VTABLE *encoding, double avg_bytes)>
+
+Converts C<src> to C<encoding>; C<avg_bytes> estimates bytes per codepoint.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+encoding_to_encoding(PARROT_INTERP, ARGIN(const STRING *src),
+ ARGIN(const STR_VTABLE *encoding), double avg_bytes)
+{
+ ASSERT_ARGS(encoding_to_encoding)
+ STRING *result;
+ String_iter src_iter, dest_iter;
+ UINTVAL src_len, alloc_bytes;
+ UINTVAL max_bytes = encoding->max_bytes_per_codepoint;
+
+ if (src->encoding == encoding)
+ return Parrot_str_clone(interp, src);
+
+ src_len = src->strlen;
+ result = Parrot_gc_new_string_header(interp, 0);
+ result->encoding = encoding;
+ result->strlen = src_len;
+
+ if (!src_len)
+ return result;
+
+ alloc_bytes = (UINTVAL)(src_len * avg_bytes);
+ if (alloc_bytes < max_bytes)
+ alloc_bytes = max_bytes;
+ Parrot_gc_allocate_string_storage(interp, result, alloc_bytes);
+ result->bufused = alloc_bytes;
+
+ STRING_ITER_INIT(interp, &src_iter);
+ STRING_ITER_INIT(interp, &dest_iter);
+
+ while (src_iter.charpos < src_len) {
+ const UINTVAL c = STRING_iter_get_and_advance(interp, src, &src_iter);
+ const UINTVAL needed = dest_iter.bytepos + max_bytes;
+
+ if (needed > result->bufused) {
+ alloc_bytes = src_len - src_iter.charpos;
+ alloc_bytes = (UINTVAL)(alloc_bytes * avg_bytes);
+ alloc_bytes += needed;
+ Parrot_gc_reallocate_string_storage(interp, result, alloc_bytes);
+ result->bufused = alloc_bytes;
+ }
+
+ STRING_iter_set_and_advance(interp, result, &dest_iter, c);
+ }
+
+ result->bufused = dest_iter.bytepos;
+
+ return result;
+}
+
+
+/*
+
=item C<INTVAL encoding_equal(PARROT_INTERP, const STRING *lhs, const STRING
*rhs)>
Modified: branches/string_checks/src/string/encoding/shared.h
==============================================================================
--- branches/string_checks/src/string/encoding/shared.h Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/src/string/encoding/shared.h Sun Oct 31 15:01:21 2010 (r49750)
@@ -108,6 +108,16 @@
__attribute__nonnull__(1)
__attribute__nonnull__(2);
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * encoding_to_encoding(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ ARGIN(const STR_VTABLE *encoding),
+ double avg_bytes)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
PARROT_WARN_UNUSED_RESULT
UINTVAL encoding_validate(PARROT_INTERP, ARGIN(const STRING *src))
__attribute__nonnull__(1)
@@ -323,6 +333,10 @@
#define ASSERT_ARGS_encoding_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src) \
+ , PARROT_ASSERT_ARG(encoding))
#define ASSERT_ARGS_encoding_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
Modified: branches/string_checks/src/string/encoding/utf16.c
==============================================================================
--- branches/string_checks/src/string/encoding/utf16.c Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/src/string/encoding/utf16.c Sun Oct 31 15:01:21 2010 (r49750)
@@ -8,7 +8,7 @@
=head1 DESCRIPTION
-UTF-16 encoding with the help of the ICU library.
+UTF-16 encoding
=head2 Functions
@@ -19,6 +19,7 @@
*/
#include "parrot/parrot.h"
+#include "../unicode.h"
#include "shared.h"
/* HEADERIZER HFILE: none */
@@ -26,6 +27,20 @@
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL utf16_decode(PARROT_INTERP, ARGIN(const utf16_t *p))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static utf16_t * utf16_encode(PARROT_INTERP,
+ ARGMOD(utf16_t *ptr),
+ UINTVAL c)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ FUNC_MODIFIES(*ptr);
+
static UINTVAL utf16_iter_get(PARROT_INTERP,
ARGIN(const STRING *str),
ARGIN(const String_iter *i),
@@ -56,7 +71,7 @@
static void utf16_iter_set_position(PARROT_INTERP,
ARGIN(const STRING *str),
ARGMOD(String_iter *i),
- UINTVAL n)
+ UINTVAL pos)
__attribute__nonnull__(1)
__attribute__nonnull__(2)
__attribute__nonnull__(3)
@@ -82,14 +97,19 @@
__attribute__nonnull__(1)
__attribute__nonnull__(2);
+PARROT_CANNOT_RETURN_NULL
PARROT_WARN_UNUSED_RESULT
+static const utf16_t * utf16_skip_backward(
+ ARGIN(const utf16_t *p),
+ UINTVAL count)
+ __attribute__nonnull__(1);
+
PARROT_CANNOT_RETURN_NULL
-static STRING * utf16_substr(PARROT_INTERP,
- ARGIN(const STRING *src),
- INTVAL offset,
- INTVAL length)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
+PARROT_WARN_UNUSED_RESULT
+static const utf16_t * utf16_skip_forward(
+ ARGIN(const utf16_t *p),
+ UINTVAL count)
+ __attribute__nonnull__(1);
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
@@ -97,6 +117,12 @@
__attribute__nonnull__(1)
__attribute__nonnull__(2);
+#define ASSERT_ARGS_utf16_decode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(p))
+#define ASSERT_ARGS_utf16_encode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(ptr))
#define ASSERT_ARGS_utf16_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(str) \
@@ -123,32 +149,22 @@
#define ASSERT_ARGS_utf16_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_utf16_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf16_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(p))
+#define ASSERT_ARGS_utf16_skip_forward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(p))
#define ASSERT_ARGS_utf16_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: static */
-#if PARROT_HAS_ICU
-# include <unicode/utf16.h>
-# include <unicode/ustring.h>
-#endif
-
-#define UNIMPL Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, \
- "unimpl utf16")
-
/*
=item C<static STRING * utf16_to_encoding(PARROT_INTERP, const STRING *src)>
-Converts the string C<src> to this particular encoding. If C<dest> is
-provided, it will contain the result. Otherwise this function operates in
-place.
-
+Converts the string C<src> to this particular encoding.
=cut
@@ -160,70 +176,47 @@
utf16_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(utf16_to_encoding)
- int src_len, dest_len;
- Parrot_UInt2 *p;
- STRING *result;
-
- if (src->encoding == Parrot_utf16_encoding_ptr
- || src->encoding == Parrot_ucs2_encoding_ptr)
- /* we have to use clone instead of copy because the Unicode upcase
- * and downcase functions assume to get an unshared buffer */
- return Parrot_str_clone(interp, src);
+ STRING *result;
+ UINTVAL src_len;
- result = Parrot_gc_new_string_header(interp, 0);
src_len = STRING_length(src);
- if (!src_len) {
+ if (STRING_max_bytes_per_codepoint(src) == 1) {
+ result = Parrot_gc_new_string_header(interp, 0);
result->encoding = Parrot_ucs2_encoding_ptr;
- return result;
- }
-
- Parrot_gc_allocate_string_storage(interp, result, 2 * src_len);
- p = (Parrot_UInt2 *)result->strstart;
+ result->bufused = 2 * src_len;
+ result->strlen = src_len;
- if (STRING_max_bytes_per_codepoint(src) == 1) {
- for (dest_len = 0; dest_len < src_len; ++dest_len) {
- p[dest_len] = (unsigned char)src->strstart[dest_len];
- }
- }
- else if (src->encoding == Parrot_utf8_encoding_ptr) {
-#if PARROT_HAS_ICU
- UErrorCode err = U_ZERO_ERROR;
-
- u_strFromUTF8(p, src_len, &dest_len, src->strstart, src->bufused, &err);
-
- if (!U_SUCCESS(err)) {
- /*
- * have to resize - required len in UChars is in dest_len
- */
- result->bufused = 2 * dest_len;
- Parrot_gc_reallocate_string_storage(interp, result, 2 * dest_len);
+ if (src_len) {
+ UINTVAL i;
+ Parrot_UInt2 *p;
+ Parrot_gc_allocate_string_storage(interp, result, 2 * src_len);
p = (Parrot_UInt2 *)result->strstart;
- u_strFromUTF8(p, dest_len, &dest_len, src->strstart, src->bufused, &err);
- PARROT_ASSERT(U_SUCCESS(err));
+
+ for (i = 0; i < src_len; ++i) {
+ p[i] = (unsigned char)src->strstart[i];
+ }
}
-#else
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
}
- else {
- UNIMPL;
+ else if (src->encoding == Parrot_utf16_encoding_ptr
+ || src->encoding == Parrot_ucs2_encoding_ptr) {
+ /* we have to use clone instead of copy because the Unicode upcase
+ * and downcase functions assume to get an unshared buffer */
+ result = Parrot_str_clone(interp, src);
}
+ else {
+ result = encoding_to_encoding(interp, src, Parrot_utf16_encoding_ptr, 2.2);
- result->bufused = 2 * dest_len;
- result->strlen = src_len;
-
- /* downgrade if possible */
- if (dest_len == src_len)
- result->encoding = Parrot_ucs2_encoding_ptr;
- else
- result->encoding = Parrot_utf16_encoding_ptr;
+ /* downgrade if possible */
+ if (result->bufused == result->strlen << 1)
+ result->encoding = Parrot_ucs2_encoding_ptr;
+ }
return result;
}
+
/*
=item C<static UINTVAL utf16_scan(PARROT_INTERP, const STRING *src)>
@@ -240,136 +233,200 @@
utf16_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(utf16_scan)
-#if PARROT_HAS_ICU
- const UChar * const s = (UChar*) src->strstart;
- UINTVAL pos = 0, charpos = 0;
- /*
- * this is used to initially calculate src->strlen,
- * therefore we must scan the whole string
- */
- while (pos * sizeof (UChar) < src->bufused) {
- U16_FWD_1_UNSAFE(s, pos);
- ++charpos;
- }
- return charpos;
-#else
- UNUSED(src);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ const utf16_t *p = (utf16_t *)src->strstart;
+ UINTVAL len = 0;
+ UINTVAL i, n;
+
+ if (src->bufused & 1)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Unaligned end in UTF-16 string\n");
+
+ n = src->bufused >> 1;
+
+ for (i = 0; i < n; ++i) {
+ UINTVAL c = p[i];
+
+ if (UNICODE_IS_HIGH_SURROGATE(c)) {
+ ++i;
+
+ if (i >= n)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Unaligned end in UTF-16 string\n");
+
+ if (!UNICODE_IS_LOW_SURROGATE(p[i]))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Malformed UTF-16 string\n");
+
+ c = UNICODE_DECODE_SURROGATE(c, p[i]);
+ }
+ else {
+ if (UNICODE_IS_LOW_SURROGATE(c))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Malformed UTF-16 string\n");
+ }
+
+ if (UNICODE_IS_NON_CHARACTER(c))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+ "Non-character in UTF-16 string\n");
+
+ ++len;
+ }
+
+ return len;
}
+
/*
-=item C<static UINTVAL utf16_ord(PARROT_INTERP, const STRING *src, INTVAL idx)>
+=item C<static const utf16_t * utf16_skip_forward(const utf16_t *p, UINTVAL
+count)>
-Returns the codepoint in string C<src> at position C<offset>.
+Skips C<count> codepoints starting from C<p>. Returns the new pointer.
=cut
*/
-static UINTVAL
-utf16_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static const utf16_t *
+utf16_skip_forward(ARGIN(const utf16_t *p), UINTVAL count)
{
- ASSERT_ARGS(utf16_ord)
-#if PARROT_HAS_ICU
- const UINTVAL len = STRING_length(src);
- const UChar *s;
- UINTVAL c, pos;
+ ASSERT_ARGS(utf16_skip_forward)
+ UINTVAL i;
- if (idx < 0)
- idx += len;
+ for (i = 0; i < count; ++i) {
+ if (UNICODE_IS_HIGH_SURROGATE(*p))
+ p += 2;
+ else
+ p += 1;
+ }
- if ((UINTVAL)idx >= len)
- encoding_ord_error(interp, src, idx);
+ return p;
+}
- s = (UChar *)src->strstart;
- pos = 0;
- U16_FWD_N_UNSAFE(s, pos, idx);
- U16_GET_UNSAFE(s, pos, c);
- return c;
-#else
- UNUSED(src);
- UNUSED(idx);
+/*
+
+=item C<static const utf16_t * utf16_skip_backward(const utf16_t *p, UINTVAL
+count)>
+
+Skips C<count> codepoints backwards starting from C<p>. Returns the new
+pointer.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static const utf16_t *
+utf16_skip_backward(ARGIN(const utf16_t *p), UINTVAL count)
+{
+ ASSERT_ARGS(utf16_skip_backward)
+ UINTVAL i;
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ for (i = 0; i < count; ++i) {
+ --p;
+ if (UNICODE_IS_LOW_SURROGATE(*p))
+ --p;
+ }
+
+ return p;
}
+
/*
-=item C<static STRING * utf16_substr(PARROT_INTERP, const STRING *src, INTVAL
-offset, INTVAL length)>
+=item C<static UINTVAL utf16_decode(PARROT_INTERP, const utf16_t *p)>
-Returns the codepoints in string C<src> at position C<offset> and length
-C<count>.
+Decodes the codepoint starting at C<p>.
=cut
*/
PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-utf16_substr(PARROT_INTERP, ARGIN(const STRING *src), INTVAL offset, INTVAL length)
+static UINTVAL
+utf16_decode(PARROT_INTERP, ARGIN(const utf16_t *p))
{
- ASSERT_ARGS(utf16_substr)
-#if PARROT_HAS_ICU
- const UChar * const s = (UChar*) src->strstart;
- const UINTVAL strlen = STRING_length(src);
- STRING *return_string;
- UINTVAL pos = 0, start;
+ UINTVAL c = *p;
- if (offset < 0)
- offset += strlen;
+ if (UNICODE_IS_HIGH_SURROGATE(c))
+ c = UNICODE_DECODE_SURROGATE(c, p[1]);
- if ((UINTVAL)offset >= strlen || length <= 0) {
- /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
- if ((UINTVAL)offset == strlen || length <= 0)
- return Parrot_str_new_noinit(interp, 0);
+ return c;
+}
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_SUBSTR_OUT_OF_STRING,
- "Cannot take substr outside string");
- }
- return_string = Parrot_str_copy(interp, src);
+/*
- if (offset == 0 && (UINTVAL)length >= strlen)
- return return_string;
+=item C<static utf16_t * utf16_encode(PARROT_INTERP, utf16_t *ptr, UINTVAL c)>
- U16_FWD_N_UNSAFE(s, pos, offset);
+Encodes the codepoint C<c> as UTF-16 at C<ptr>. Returns the advanced pointer.
- start = pos * sizeof (UChar);
- return_string->strstart += start;
+=cut
- if ((UINTVAL)length >= strlen - (UINTVAL)offset) {
- return_string->bufused -= start;
- return_string->strlen -= offset;
+*/
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static utf16_t *
+utf16_encode(PARROT_INTERP, ARGMOD(utf16_t *ptr), UINTVAL c)
+{
+ ASSERT_ARGS(utf16_encode)
+
+ if (c < 0xFFFE) {
+ if (UNICODE_IS_SURROGATE(c)
+ || (c >= 0xFDD0 && c <= 0xFDEF))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character for UTF-16 encoding\n");
+
+ *ptr++ = c;
}
else {
- U16_FWD_N_UNSAFE(s, pos, length);
- return_string->bufused = pos * sizeof (UChar) - start;
- return_string->strlen = length;
- }
-
- return_string->hashval = 0;
-
- return return_string;
-#else
- UNUSED(src);
- UNUSED(offset);
- UNUSED(length);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ if ((c & 0xFFFE) == 0xFFFE
+ || c > 0x10FFFF)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character for UTF-16 encoding\n");
+
+ *ptr++ = UNICODE_HIGH_SURROGATE(c);
+ *ptr++ = UNICODE_LOW_SURROGATE(c);
+ }
+
+ return ptr;
+}
+
+
+/*
+
+=item C<static UINTVAL utf16_ord(PARROT_INTERP, const STRING *src, INTVAL idx)>
+
+Returns the codepoint in string C<src> at position C<idx>.
+
+=cut
+
+*/
+
+static UINTVAL
+utf16_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
+{
+ ASSERT_ARGS(utf16_ord)
+ const UINTVAL len = STRING_length(src);
+ const utf16_t *start;
+
+ if (idx < 0)
+ idx += len;
+
+ if ((UINTVAL)idx >= len)
+ encoding_ord_error(interp, src, idx);
+
+ start = utf16_skip_forward((const utf16_t *)src->strstart, idx);
+
+ return utf16_decode(interp, start);
}
+
/*
=item C<static UINTVAL utf16_iter_get(PARROT_INTERP, const STRING *str, const
@@ -386,28 +443,16 @@
ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
{
ASSERT_ARGS(utf16_iter_get)
-#if PARROT_HAS_ICU
- const UChar * const s = (UChar*) str->strstart;
- UINTVAL c, pos;
-
- pos = i->bytepos / sizeof (UChar);
- if (offset > 0) {
- U16_FWD_N_UNSAFE(s, pos, offset);
- }
- else if (offset < 0) {
- U16_BACK_N_UNSAFE(s, pos, -offset);
- }
- U16_GET_UNSAFE(s, pos, c);
+ const utf16_t *ptr = (utf16_t *)(str->strstart + i->bytepos);
- return c;
-#else
- UNUSED(str);
- UNUSED(i);
- UNUSED(offset);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ PARROT_ASSERT((UINTVAL)(i->charpos + offset) <= str->strlen);
+
+ if (offset > 0)
+ ptr = utf16_skip_forward(ptr, offset);
+ else if (offset < 0)
+ ptr = utf16_skip_backward(ptr, -offset);
+
+ return utf16_decode(interp, ptr);
}
/*
@@ -426,27 +471,20 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
{
ASSERT_ARGS(utf16_iter_skip)
-#if PARROT_HAS_ICU
- const UChar * const s = (UChar*) str->strstart;
- UINTVAL pos = i->bytepos / sizeof (UChar);
-
- if (skip > 0) {
- U16_FWD_N_UNSAFE(s, pos, skip);
- }
- else if (skip < 0) {
- U16_BACK_N_UNSAFE(s, pos, -skip);
- }
+ const utf16_t *ptr = (utf16_t *)(str->strstart + i->bytepos);
i->charpos += skip;
- i->bytepos = pos * sizeof (UChar);
-#else
- UNUSED(str);
- UNUSED(i);
- UNUSED(skip);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+
+ PARROT_ASSERT(i->charpos <= str->strlen);
+
+ if (skip > 0)
+ ptr = utf16_skip_forward(ptr, skip);
+ else if (skip < 0)
+ ptr = utf16_skip_backward(ptr, -skip);
+
+ i->bytepos = (const char *)ptr - (const char *)str->strstart;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
}
/*
@@ -466,24 +504,19 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i))
{
ASSERT_ARGS(utf16_iter_get_and_advance)
-#if PARROT_HAS_ICU
- const UChar * const s = (UChar*) str->strstart;
- UINTVAL c, pos;
- pos = i->bytepos / sizeof (UChar);
- /* TODO either make sure that we don't go past end or use SAFE
- * iter versions
- */
- U16_NEXT_UNSAFE(s, pos, c);
- i->charpos++;
- i->bytepos = pos * sizeof (UChar);
+ const utf16_t *ptr = (utf16_t *)(str->strstart + i->bytepos);
+ UINTVAL c = utf16_decode(interp, ptr);
+
+ i->charpos += 1;
+
+ if (UNICODE_IS_HIGH_SURROGATE(*ptr))
+ i->bytepos += 4;
+ else
+ i->bytepos += 2;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
+
return c;
-#else
- UNUSED(str);
- UNUSED(i);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
}
/*
@@ -503,29 +536,21 @@
ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
{
ASSERT_ARGS(utf16_iter_set_and_advance)
-#if PARROT_HAS_ICU
- UChar * const s = (UChar*) str->strstart;
- UINTVAL pos;
- pos = i->bytepos / sizeof (UChar);
- U16_APPEND_UNSAFE(s, pos, c);
- i->charpos++;
- i->bytepos = pos * sizeof (UChar);
-#else
- UNUSED(str);
- UNUSED(i);
- UNUSED(c);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ utf16_t * const ptr = (utf16_t *)(str->strstart + i->bytepos);
+ utf16_t * const end = utf16_encode(interp, ptr, c);
+
+ i->charpos += 1;
+ i->bytepos += (char *)end - (char *)ptr;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
}
/*
=item C<static void utf16_iter_set_position(PARROT_INTERP, const STRING *str,
-String_iter *i, UINTVAL n)>
+String_iter *i, UINTVAL pos)>
-Moves the string iterator C<i> to the position C<n> in the string.
+Moves the string iterator C<i> to the position C<pos> in the string.
=cut
@@ -533,24 +558,53 @@
static void
utf16_iter_set_position(PARROT_INTERP,
- ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n)
+ ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos)
{
ASSERT_ARGS(utf16_iter_set_position)
-#if PARROT_HAS_ICU
- UChar * const s = (UChar*) str->strstart;
- UINTVAL pos;
- pos = 0;
- U16_FWD_N_UNSAFE(s, pos, n);
- i->charpos = n;
- i->bytepos = pos * sizeof (UChar);
-#else
- UNUSED(str);
- UNUSED(i);
- UNUSED(n);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ const utf16_t *ptr;
+
+ if (pos == 0) {
+ i->charpos = 0;
+ i->bytepos = 0;
+ return;
+ }
+
+ PARROT_ASSERT(pos <= str->strlen);
+
+ /*
+ * we know the byte offsets of three positions: start, current and end
+ * now find the shortest way to reach pos
+ */
+ if (pos < i->charpos) {
+ if (pos <= (i->charpos >> 1)) {
+ /* go forward from start */
+ ptr = (utf16_t *)str->strstart;
+ ptr = utf16_skip_forward(ptr, pos);
+ }
+ else {
+ /* go backward from current */
+ ptr = (utf16_t *)(str->strstart + i->bytepos);
+ ptr = utf16_skip_backward(ptr, i->charpos - pos);
+ }
+ }
+ else {
+ const UINTVAL len = str->strlen;
+ if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
+ /* go forward from current */
+ ptr = (utf16_t *)(str->strstart + i->bytepos);
+ ptr = utf16_skip_forward(ptr, pos - i->charpos);
+ }
+ else {
+ /* go backward from end */
+ ptr = (utf16_t *)(str->strstart + str->bufused);
+ ptr = utf16_skip_backward(ptr, len - pos);
+ }
+ }
+
+ i->charpos = pos;
+ i->bytepos = (const char *)ptr - (const char *)str->strstart;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
}
@@ -568,11 +622,11 @@
encoding_index,
encoding_rindex,
encoding_hash,
- unicode_validate,
+ encoding_validate,
utf16_scan,
utf16_ord,
- utf16_substr,
+ encoding_substr,
encoding_is_cclass,
encoding_find_cclass,
Modified: branches/string_checks/src/string/encoding/utf8.c
==============================================================================
--- branches/string_checks/src/string/encoding/utf8.c Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/src/string/encoding/utf8.c Sun Oct 31 15:01:21 2010 (r49750)
@@ -32,9 +32,10 @@
__attribute__nonnull__(2);
PARROT_CANNOT_RETURN_NULL
-static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
+static utf8_t * utf8_encode(PARROT_INTERP, ARGMOD(utf8_t *ptr), UINTVAL c)
__attribute__nonnull__(1)
- __attribute__nonnull__(2);
+ __attribute__nonnull__(2)
+ FUNC_MODIFIES(*ptr);
static UINTVAL utf8_iter_get(PARROT_INTERP,
ARGIN(const STRING *str),
@@ -88,11 +89,13 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
-static const utf8_t * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
+static const utf8_t * utf8_skip_backward(
+ ARGIN(const utf8_t *ptr),
+ UINTVAL n)
__attribute__nonnull__(1);
PARROT_CANNOT_RETURN_NULL
-static const utf8_t * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
+static const utf8_t * utf8_skip_forward(ARGIN(const utf8_t *ptr), UINTVAL n)
__attribute__nonnull__(1);
PARROT_CAN_RETURN_NULL
@@ -180,56 +183,14 @@
utf8_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(utf8_to_encoding)
- STRING *result;
- const STR_VTABLE *src_encoding = src->encoding;
- UINTVAL dest_len, dest_pos, src_len;
- unsigned char *p;
-
- if (src_encoding == Parrot_utf8_encoding_ptr)
- return Parrot_str_clone(interp, src);
-
- src_len = src->strlen;
- result = Parrot_gc_new_string_header(interp, 0);
- result->encoding = Parrot_utf8_encoding_ptr;
- result->strlen = src_len;
-
- if (!src_len)
- return result;
-
- Parrot_gc_allocate_string_storage(interp, result, src_len);
- p = (unsigned char *)result->strstart;
-
- if (src_encoding == Parrot_ascii_encoding_ptr) {
- for (dest_len = 0; dest_len < src_len; ++dest_len) {
- p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
- }
- result->bufused = dest_len;
+ STRING *result;
+
+ if (src->encoding == Parrot_ascii_encoding_ptr) {
+ result = Parrot_str_clone(interp, src);
+ result->encoding = Parrot_utf8_encoding_ptr;
}
else {
- String_iter src_iter;
- STRING_ITER_INIT(interp, &src_iter);
- dest_len = src_len;
- dest_pos = 0;
- while (src_iter.charpos < src_len) {
- const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter);
- unsigned char *new_pos;
- unsigned char *pos;
-
- if (dest_len - dest_pos < 6) {
- UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5);
- if (need < 16)
- need = 16;
- dest_len += need;
- result->bufused = dest_pos;
- Parrot_gc_reallocate_string_storage(interp, result, dest_len);
- p = (unsigned char *)result->strstart;
- }
-
- pos = p + dest_pos;
- new_pos = (unsigned char *)utf8_encode(interp, pos, c);
- dest_pos += (new_pos - pos);
- }
- result->bufused = dest_pos;
+ result = encoding_to_encoding(interp, src, Parrot_utf8_encoding_ptr, 1.2);
}
return result;
@@ -322,7 +283,7 @@
if ((UINTVAL)idx >= len)
encoding_ord_error(interp, src, idx);
- start = utf8_skip_forward(src->strstart, idx);
+ start = utf8_skip_forward((utf8_t *)src->strstart, idx);
return utf8_decode(interp, start);
}
@@ -363,7 +324,7 @@
/*
-=item C<static void * utf8_encode(PARROT_INTERP, void *ptr, UINTVAL c)>
+=item C<static utf8_t * utf8_encode(PARROT_INTERP, utf8_t *ptr, UINTVAL c)>
Returns the UTF-8 encoding of integer C<c>.
@@ -372,35 +333,31 @@
*/
PARROT_CANNOT_RETURN_NULL
-static void *
-utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
+static utf8_t *
+utf8_encode(PARROT_INTERP, ARGMOD(utf8_t *ptr), UINTVAL c)
{
ASSERT_ARGS(utf8_encode)
- const UINTVAL len = UNISKIP(c);
-
- /* the const is good on u8ptr, but using ptr on other variables avoids the
- * need to do a yucky cast to remove constness */
- const utf8_t * const u8ptr = (utf8_t *)ptr;
- utf8_t *u8end = (utf8_t *)ptr + len - 1;
+ const UINTVAL len = UNISKIP(c);
+ utf8_t *end = ptr + len - 1;
if (UNICODE_IS_INVALID(c))
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
"Invalid character for UTF-8 encoding\n");
- while (u8end > u8ptr) {
- *u8end-- =
- (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK);
+ while (end > ptr) {
+ *end-- = (c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK;
c >>= UTF8_ACCUMULATION_SHIFT;
}
- *u8end = (utf8_t)((c & UTF8_START_MASK(len)) | UTF8_START_MARK(len));
- return (utf8_t *)ptr + len;
+ *end = (c & UTF8_START_MASK(len)) | UTF8_START_MARK(len);
+
+ return ptr + len;
}
/*
-=item C<static const utf8_t * utf8_skip_forward(const void *ptr, UINTVAL n)>
+=item C<static const utf8_t * utf8_skip_forward(const utf8_t *ptr, UINTVAL n)>
Moves C<ptr> C<n> characters forward.
@@ -410,22 +367,21 @@
PARROT_CANNOT_RETURN_NULL
static const utf8_t *
-utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
+utf8_skip_forward(ARGIN(const utf8_t *ptr), UINTVAL n)
{
ASSERT_ARGS(utf8_skip_forward)
- const utf8_t *u8ptr = (const utf8_t *)ptr;
while (n-- > 0) {
- u8ptr += UTF8SKIP(u8ptr);
+ ptr += UTF8SKIP(ptr);
}
- return u8ptr;
+ return ptr;
}
/*
-=item C<static const utf8_t * utf8_skip_backward(const void *ptr, UINTVAL n)>
+=item C<static const utf8_t * utf8_skip_backward(const utf8_t *ptr, UINTVAL n)>
Moves C<ptr> C<n> characters back.
@@ -436,18 +392,17 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static const utf8_t *
-utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
+utf8_skip_backward(ARGIN(const utf8_t *ptr), UINTVAL n)
{
ASSERT_ARGS(utf8_skip_backward)
- const utf8_t *u8ptr = (const utf8_t *)ptr;
while (n-- > 0) {
- --u8ptr;
- while (UTF8_IS_CONTINUATION(*u8ptr))
- --u8ptr;
+ --ptr;
+ while (UTF8_IS_CONTINUATION(*ptr))
+ --ptr;
}
- return u8ptr;
+ return ptr;
}
@@ -467,18 +422,16 @@
ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
{
ASSERT_ARGS(utf8_iter_get)
- const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+ const utf8_t *ptr = (utf8_t *)(str->strstart + i->bytepos);
PARROT_ASSERT(i->charpos + offset < str->strlen);
- if (offset > 0) {
- u8ptr = utf8_skip_forward(u8ptr, offset);
- }
- else if (offset < 0) {
- u8ptr = utf8_skip_backward(u8ptr, -offset);
- }
+ if (offset > 0)
+ ptr = utf8_skip_forward(ptr, offset);
+ else if (offset < 0)
+ ptr = utf8_skip_backward(ptr, -offset);
- return utf8_decode(interp, u8ptr);
+ return utf8_decode(interp, ptr);
}
@@ -498,20 +451,18 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
{
ASSERT_ARGS(utf8_iter_skip)
- const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+ const utf8_t *ptr = (utf8_t *)(str->strstart + i->bytepos);
i->charpos += skip;
PARROT_ASSERT(i->charpos <= str->strlen);
- if (skip > 0) {
- u8ptr = utf8_skip_forward(u8ptr, skip);
- }
- else if (skip < 0) {
- u8ptr = utf8_skip_backward(u8ptr, -skip);
- }
+ if (skip > 0)
+ ptr = utf8_skip_forward(ptr, skip);
+ else if (skip < 0)
+ ptr = utf8_skip_backward(ptr, -skip);
- i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+ i->bytepos = (const char *)ptr - (const char *)str->strstart;
PARROT_ASSERT(i->bytepos <= str->bufused);
}
@@ -534,11 +485,11 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i))
{
ASSERT_ARGS(utf8_iter_get_and_advance)
- const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
- UINTVAL c = utf8_decode(interp, u8ptr);
+ const utf8_t *ptr = (utf8_t *)(str->strstart + i->bytepos);
+ UINTVAL c = utf8_decode(interp, ptr);
i->charpos += 1;
- i->bytepos += UTF8SKIP(u8ptr);
+ i->bytepos += UTF8SKIP(ptr);
PARROT_ASSERT(i->bytepos <= str->bufused);
@@ -563,11 +514,11 @@
ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
{
ASSERT_ARGS(utf8_iter_set_and_advance)
- unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos;
- unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
+ utf8_t * const ptr = (utf8_t *)(str->strstart + i->bytepos);
+ utf8_t * const end = utf8_encode(interp, ptr, c);
i->charpos += 1;
- i->bytepos += new_pos - pos;
+ i->bytepos += end - ptr;
PARROT_ASSERT(i->bytepos <= str->bufused);
}
@@ -590,7 +541,7 @@
ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos)
{
ASSERT_ARGS(utf8_iter_set_position)
- const utf8_t *u8ptr = (const utf8_t *)str->strstart;
+ const utf8_t *ptr = (utf8_t *)str->strstart;
if (pos == 0) {
i->charpos = 0;
@@ -607,27 +558,27 @@
if (pos < i->charpos) {
if (pos <= (i->charpos >> 1)) {
/* go forward from start */
- u8ptr = utf8_skip_forward(u8ptr, pos);
+ ptr = utf8_skip_forward(ptr, pos);
}
else {
/* go backward from current */
- u8ptr = utf8_skip_backward(u8ptr + i->bytepos, i->charpos - pos);
+ ptr = utf8_skip_backward(ptr + i->bytepos, i->charpos - pos);
}
}
else {
const UINTVAL len = str->strlen;
if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
/* go forward from current */
- u8ptr = utf8_skip_forward(u8ptr + i->bytepos, pos - i->charpos);
+ ptr = utf8_skip_forward(ptr + i->bytepos, pos - i->charpos);
}
else {
/* go backward from end */
- u8ptr = utf8_skip_backward(u8ptr + str->bufused, len - pos);
+ ptr = utf8_skip_backward(ptr + str->bufused, len - pos);
}
}
i->charpos = pos;
- i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+ i->bytepos = (const char *)ptr - (const char *)str->strstart;
PARROT_ASSERT(i->bytepos <= str->bufused);
}
Modified: branches/string_checks/src/string/unicode.h
==============================================================================
--- branches/string_checks/src/string/unicode.h Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/src/string/unicode.h Sun Oct 31 15:01:21 2010 (r49750)
@@ -14,8 +14,8 @@
#define PARROT_UNICODE_H_GUARD
typedef unsigned char utf8_t;
-typedef unsigned short utf16_t;
-typedef unsigned long utf32_t;
+typedef Parrot_UInt2 utf16_t;
+typedef Parrot_Int4 utf32_t;
#define UNICODE_SURROGATE_FIRST 0xD800u
#define UNICODE_SURROGATE_LAST 0xDFFFu
Modified: branches/string_checks/t/op/string_cs.t
==============================================================================
--- branches/string_checks/t/op/string_cs.t Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/t/op/string_cs.t Sun Oct 31 15:01:21 2010 (r49750)
@@ -460,10 +460,7 @@
abcdefgefgefgefghi\xc2\xa9jk
OUTPUT
-SKIP: {
- skip( 'no ICU lib', 19 ) unless $PConfig{has_icu};
-
- pir_output_is( <<'CODE', <<OUTPUT, "literal encoding persistence - TT #468" );
+pir_output_is( <<'CODE', <<OUTPUT, "literal encoding persistence - TT #468" );
.include 'stdio.pasm'
.sub main
# set output encoding to normalize printed strings
@@ -501,7 +498,7 @@
hello(10): ucs2
OUTPUT
- pir_output_is( <<'CODE', <<OUTPUT, "empty literal encoding persistence - TT #1791");
+pir_output_is( <<'CODE', <<OUTPUT, "empty literal encoding persistence - TT #1791");
.sub main
load_bytecode 't/op/testlib/test_strings.pbc'
$P0 = 'get_empties'()
@@ -533,58 +530,14 @@
(0): ucs2
OUTPUT
- pir_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
-.sub main :main
- set $S0, iso-8859-1:"TÖTSCH"
- find_encoding $I0, "utf8"
- trans_encoding $S1, $S0, $I0
- $S1 = downcase $S1
- getstdout $P0 # need to convert back to utf8
- $P0.'encoding'("utf8") # set utf8 output
- print $S1
- print "\n"
- end
-.end
-CODE
-t\xc3\xb6tsch
-OUTPUT
-
- pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_encoding_s_s_i" );
- set S0, iso-8859-1:"TÖTSCH"
- find_encoding I0, "utf8"
- trans_encoding S1, S0, I0
- downcase S1, S1
- find_encoding I0, "iso-8859-1"
- trans_encoding S1, S1, I0
- print S1
- print "\n"
- end
-CODE
-t\xf6tsch
-OUTPUT
-
- pasm_error_output_like( <<'CODE', <<"OUTPUT", "negative encoding number" );
+pasm_error_output_like( <<'CODE', <<"OUTPUT", "negative encoding number" );
trans_encoding S2, 'foo', -1
end
CODE
/encoding #-1 not found/
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transencoding" );
- set S0, iso-8859-1:"TÖTSCH"
- find_encoding I0, "utf8"
- trans_encoding S1, S0, I0
- downcase S1, S1
- find_encoding I0, "utf8"
- trans_encoding S2, S1, I0
- print S2
- print "\n"
- end
-CODE
-t\xc3\xb6tsch
-OUTPUT
-
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
set S1, iso-8859-1:"TÖTSCH"
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
@@ -605,7 +558,7 @@
84_214_84_83_67_72_
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
set S0, iso-8859-1:"TTÖÖ"
find_encoding I0, "utf8"
trans_encoding S1, S0, I0
@@ -624,7 +577,7 @@
TT 2 2
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
set S1, iso-8859-1:"Tötsch"
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
@@ -646,7 +599,7 @@
T\xc3\xb6tsch Leo
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
set S1, iso-8859-1:"Tötsch"
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
@@ -668,7 +621,7 @@
T\xc3\xb6tsch Leo
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
set S1, iso-8859-1:"Tötsch"
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
@@ -682,7 +635,7 @@
\xc3\xb6t
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
set S1, iso-8859-1:"Tötsch"
find_encoding I0, "utf16"
trans_encoding S1, S1, I0
@@ -701,11 +654,10 @@
Toetsch
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
- set S0, iso-8859-1:"TÖTSCH"
- find_encoding I0, "utf8"
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
+ set S0, iso-8859-1:"tötsch"
+ find_encoding I0, "utf16"
trans_encoding S1, S0, I0
- downcase S1, S1
set S2, iso-8859-1:"öt"
index I0, S1, S2
print I0
@@ -715,11 +667,10 @@
1
OUTPUT
- pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
- set S0, iso-8859-1:"TÖTSCH"
- find_encoding I0, "utf8"
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
+ set S0, iso-8859-1:"tötsch"
+ find_encoding I0, "utf16"
trans_encoding S1, S0, I0
- downcase S1, S1
set S2, iso-8859-1:"öt"
index I0, S1, S2
print I0
@@ -734,6 +685,53 @@
6
OUTPUT
+SKIP: {
+ skip( 'no ICU lib', 8 ) unless $PConfig{has_icu};
+
+ pir_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
+.sub main :main
+ set $S0, iso-8859-1:"TÖTSCH"
+ find_encoding $I0, "utf8"
+ trans_encoding $S1, $S0, $I0
+ $S1 = downcase $S1
+ getstdout $P0 # need to convert back to utf8
+ $P0.'encoding'("utf8") # set utf8 output
+ print $S1
+ print "\n"
+ end
+.end
+CODE
+t\xc3\xb6tsch
+OUTPUT
+
+ pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_encoding_s_s_i" );
+ set S0, iso-8859-1:"TÖTSCH"
+ find_encoding I0, "utf8"
+ trans_encoding S1, S0, I0
+ downcase S1, S1
+ find_encoding I0, "iso-8859-1"
+ trans_encoding S1, S1, I0
+ print S1
+ print "\n"
+ end
+CODE
+t\xf6tsch
+OUTPUT
+
+ pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transencoding" );
+ set S0, iso-8859-1:"TÖTSCH"
+ find_encoding I0, "utf8"
+ trans_encoding S1, S0, I0
+ downcase S1, S1
+ find_encoding I0, "utf8"
+ trans_encoding S2, S1, I0
+ print S2
+ print "\n"
+ end
+CODE
+t\xc3\xb6tsch
+OUTPUT
+
pir_output_is( <<'CODE', <<"OUTPUT", "unicode upcase" );
.sub main :main
set $S0, iso-8859-1:"tötsch"
Modified: branches/string_checks/t/op/stringu.t
==============================================================================
--- branches/string_checks/t/op/stringu.t Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/t/op/stringu.t Sun Oct 31 15:01:21 2010 (r49750)
@@ -6,7 +6,7 @@
use warnings;
use lib qw( . lib ../lib ../../lib );
use Test::More;
-use Parrot::Test tests => 35;
+use Parrot::Test tests => 36;
use Parrot::Config;
=head1 NAME
@@ -237,19 +237,6 @@
\xc2\xab
OUTPUT
-pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
- set S0, utf8:unicode:"\xc2\xab"
- length I0, S0
- print I0
- print "\n"
- print S0
- print "\n"
- end
-CODE
-1
-\xc2\xab
-OUTPUT
-
pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
set S0, ascii:"«"
length I0, S0
@@ -508,9 +495,6 @@
hello
OUTPUT
-
-SKIP: {
- skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to int' );
.sub main :main
$S0 = "140"
@@ -555,7 +539,6 @@
140
140
OUT
-}
pir_output_is( <<'CODE', <<'OUT', 'concatenation of utf8 and iso-8859-1 (TT #752)' );
.sub 'main'
@@ -712,6 +695,119 @@
0x10FFFD
OUT
+sub units_to_code {
+ my $code = '';
+
+ for my $unit (@_) {
+ my $str = pack('S*', @$unit);
+ $str =~ s/./sprintf("\\x%02X", ord($&))/egs;
+ $code .= qq{ 'test_chars'(binary:"$str")\n};
+ }
+
+ return $code;
+}
+
+my $code = qq{ 'test_chars'(binary:"\\x41\\x42\\x43")\n};
+$code .= units_to_code(
+ [ 0xD800 ],
+ [ 0xDFFF ],
+ [ 0xD800, 0x0041 ],
+ [ 0xD900, 0xDAFF ],
+ [ 0xDBFF, 0xD800 ],
+ [ 0xDC00, 0xD8FF ],
+ [ 0xDDFF, 0xDE00 ],
+ [ 0xDFFF, 0x0041 ],
+ [ 0xFDD0 ],
+ [ 0xFDEF ],
+ [ 0xFFFE ],
+ [ 0xFFFF ],
+ [ 0xD83F, 0xDFFF ],
+ [ 0xDBFF, 0xDFFE ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'illegal utf16 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc eh, ex, bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ eh = new 'ExceptionHandler'
+ set_addr eh, handler
+ push_eh eh
+ chars = bb.'get_string'('utf16')
+ say 'valid'
+ goto end
+ handler:
+ .local pmc ex
+ .get_results (ex)
+ \$S0 = ex['message']
+ print \$S0
+ end:
+ pop_eh
+.end
+CODE
+Unaligned end in UTF-16 string
+Unaligned end in UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+OUT
+
+$code = units_to_code(
+ [ 0x0041 ],
+ [ 0xD7FF ],
+ [ 0xE000 ],
+ [ 0xFDCF ],
+ [ 0xFDF0 ],
+ [ 0xFFFD ],
+ [ 0xD800, 0xDC00 ],
+ [ 0xD912, 0xDE34 ],
+ [ 0xDBFF, 0xDFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'valid utf16 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ chars = bb.'get_string'('utf16')
+ \$I0 = ord chars
+ \$P0 = new 'FixedIntegerArray', 1
+ \$P0[0] = \$I0
+ \$S0 = sprintf '0x%X', \$P0
+ say \$S0
+.end
+CODE
+0x41
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+0x10000
+0x54A34
+0x10FFFD
+OUT
+
SKIP: {
skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
Modified: branches/string_checks/t/pmc/bytebuffer.t
==============================================================================
--- branches/string_checks/t/pmc/bytebuffer.t Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/t/pmc/bytebuffer.t Sun Oct 31 15:01:21 2010 (r49750)
@@ -152,27 +152,10 @@
is(n, 4, "getting ascii from buffer gives correct length")
is(s, "abcd", "getting ascii from buffer gives correct content")
- $I0 = hasicu()
- unless $I0 goto skip_it
-
bb = new ['ByteBuffer']
# Upper case n tilde: codepoint 0xD1, utf8 encoding 0xC3, 0x91
- #bb = utf16:"\x{D1}"
- # Can't do that, or the program can't be compiled without ICU.
- # Fill the buffer with bytes instead.
-
- # Get endianess to set the bytes in the appropiate order.
- # *** XXX *** Need report from big endian platforms.
- big = isbigendian()
- if big goto isbig
- bb[0] = 0xD1
- bb[1] = 0x00
- goto doit
-isbig:
- bb[0] = 0x00
- bb[1] = 0xD1
-doit:
+ bb = utf16:"\x{D1}"
s = bb.'get_string'('utf16')
n = length s
is(n, 1, "getting utf16 from buffer gives correct length")
@@ -186,10 +169,6 @@
is(n, 1, "getting utf8 from buffer gives correct length")
n = ord s
is(n, 0xD1, "getting utf8 from buffer gives correct codepoint")
- goto end
-skip_it:
- skip(4, "this test needs ICU")
-end:
.end
.sub test_push
@@ -280,9 +259,6 @@
.local pmc bb
.local int i, big, pos, b0, b1, c
- $I0 = hasicu()
- unless $I0 goto skip_it
-
# Get endianess to set the bytes in the appropiate order.
# *** XXX *** Need report from big endian platforms.
big = isbigendian()
@@ -329,9 +305,6 @@
failed:
say i
ok(0, "reallocation")
- goto end
-skip_it:
- skip(1, "this test needs ICU")
end:
.end
Modified: branches/string_checks/t/tools/pbc_disassemble.t
==============================================================================
--- branches/string_checks/t/tools/pbc_disassemble.t Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/t/tools/pbc_disassemble.t Sun Oct 31 15:01:21 2010 (r49750)
@@ -86,16 +86,13 @@
.end
PIR
-SKIP: {
- skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
- my $utf16 = pack('S*', unpack('C*', 'Hello'));
- $utf16 =~ s/\0/\\\\0/g;
- disassemble_output_like( <<PIR, "pir", qr/set_s_sc S0,utf16:"$utf16"/ms, 'pbc_disassemble utf16 string');
+my $utf16 = pack('S*', unpack('C*', 'Hello'));
+$utf16 =~ s/\0/\\\\0/g;
+disassemble_output_like( <<PIR, "pir", qr/set_s_sc S0,utf16:"$utf16"/ms, 'pbc_disassemble utf16 string');
.sub main :main
\$S0 = utf16:"Hello"
.end
PIR
-}
=head1 HELPER SUBROUTINES
Modified: branches/string_checks/t/tools/pbc_dump.t
==============================================================================
--- branches/string_checks/t/tools/pbc_dump.t Sun Oct 31 15:00:38 2010 (r49749)
+++ branches/string_checks/t/tools/pbc_dump.t Sun Oct 31 15:01:21 2010 (r49750)
@@ -73,14 +73,11 @@
PIR
for my $enc qw(binary iso-8859-1 utf8 utf16 ucs2 ucs4) {
- SKIP: {
- skip( 'no ICU lib', 1 ) if $enc eq 'utf16' && !$PConfig{has_icu};
- dump_output_like( <<PIR, "pir", qr/ENCODING.*=>.*$enc/ms, "pbc_dump $enc encoding");
+ dump_output_like( <<PIR, "pir", qr/ENCODING.*=>.*$enc/ms, "pbc_dump $enc encoding");
.sub main :main
\$S0 = $enc:"abc"
.end
PIR
- }
}
my $longcode = ".sub main :main\n";
More information about the parrot-commits mailing list