[svn:parrot] r49750 - in branches/string_checks: config/gen/makefiles src/string src/string/encoding t/op t/pmc t/tools

nwellnhof at svn.parrot.org nwellnhof at svn.parrot.org
Sun Oct 31 15:01:22 UTC 2010


Author: nwellnhof
Date: Sun Oct 31 15:01:21 2010
New Revision: 49750
URL: https://trac.parrot.org/parrot/changeset/49750

Log:
[str] Rewrite UTF-16 encoding to work without ICU

Perform all checks during initial scan.

Modified:
   branches/string_checks/config/gen/makefiles/root.in
   branches/string_checks/src/string/encoding/shared.c
   branches/string_checks/src/string/encoding/shared.h
   branches/string_checks/src/string/encoding/utf16.c
   branches/string_checks/src/string/encoding/utf8.c
   branches/string_checks/src/string/unicode.h
   branches/string_checks/t/op/string_cs.t
   branches/string_checks/t/op/stringu.t
   branches/string_checks/t/pmc/bytebuffer.t
   branches/string_checks/t/tools/pbc_disassemble.t
   branches/string_checks/t/tools/pbc_dump.t

Modified: branches/string_checks/config/gen/makefiles/root.in
==============================================================================
--- branches/string_checks/config/gen/makefiles/root.in	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/config/gen/makefiles/root.in	Sun Oct 31 15:01:21 2010	(r49750)
@@ -1815,7 +1815,7 @@
     t/configure/*.t \
     t/steps/*.t
 PBC_TEST_FILES = \
-#IF(has_icu):    t/op/testlib/test_strings.pbc \
+    t/op/testlib/test_strings.pbc \
     t/pmc/testlib/annotations.pbc \
     t/pmc/testlib/number.pbc
 

Modified: branches/string_checks/src/string/encoding/shared.c
==============================================================================
--- branches/string_checks/src/string/encoding/shared.c	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/src/string/encoding/shared.c	Sun Oct 31 15:01:21 2010	(r49750)
@@ -48,6 +48,70 @@
 
 /*
 
+=item C<STRING * encoding_to_encoding(PARROT_INTERP, const STRING *src, const
+STR_VTABLE *encoding, double avg_bytes)>
+
+Converts the string C<src> to encoding C<encoding>. C<avg_bytes> is the estimated bytes per codepoint.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+encoding_to_encoding(PARROT_INTERP, ARGIN(const STRING *src),
+        ARGIN(const STR_VTABLE *encoding), double avg_bytes)
+{
+    ASSERT_ARGS(encoding_to_encoding)
+    STRING           *result;
+    String_iter       src_iter, dest_iter;
+    UINTVAL           src_len, alloc_bytes;
+    UINTVAL           max_bytes = encoding->max_bytes_per_codepoint;
+
+    if (src->encoding == encoding)
+        return Parrot_str_clone(interp, src);
+
+    src_len          = src->strlen;
+    result           = Parrot_gc_new_string_header(interp, 0);
+    result->encoding = encoding;
+    result->strlen   = src_len;
+
+    if (!src_len)
+        return result;
+
+    alloc_bytes = (UINTVAL)(src_len * avg_bytes);
+    if (alloc_bytes < max_bytes)
+        alloc_bytes = max_bytes;
+    Parrot_gc_allocate_string_storage(interp, result, alloc_bytes);
+    result->bufused = alloc_bytes;
+
+    STRING_ITER_INIT(interp, &src_iter);
+    STRING_ITER_INIT(interp, &dest_iter);
+
+    while (src_iter.charpos < src_len) {
+        const UINTVAL c      = STRING_iter_get_and_advance(interp, src, &src_iter);
+        const UINTVAL needed = dest_iter.bytepos + max_bytes;
+
+        if (needed > result->bufused) {
+            alloc_bytes  = src_len - src_iter.charpos;
+            alloc_bytes  = (UINTVAL)(alloc_bytes * avg_bytes);
+            alloc_bytes += needed;
+            Parrot_gc_reallocate_string_storage(interp, result, alloc_bytes);
+            result->bufused = alloc_bytes;
+        }
+
+        STRING_iter_set_and_advance(interp, result, &dest_iter, c);
+    }
+
+    result->bufused = dest_iter.bytepos;
+
+    return result;
+}
+
+
+/*
+
 =item C<INTVAL encoding_equal(PARROT_INTERP, const STRING *lhs, const STRING
 *rhs)>
 

Modified: branches/string_checks/src/string/encoding/shared.h
==============================================================================
--- branches/string_checks/src/string/encoding/shared.h	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/src/string/encoding/shared.h	Sun Oct 31 15:01:21 2010	(r49750)
@@ -108,6 +108,16 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * encoding_to_encoding(PARROT_INTERP,
+    ARGIN(const STRING *src),
+    ARGIN(const STR_VTABLE *encoding),
+    double avg_bytes)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+
 PARROT_WARN_UNUSED_RESULT
 UINTVAL encoding_validate(PARROT_INTERP, ARGIN(const STRING *src))
         __attribute__nonnull__(1)
@@ -323,6 +333,10 @@
 #define ASSERT_ARGS_encoding_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(src) \
+    , PARROT_ASSERT_ARG(encoding))
 #define ASSERT_ARGS_encoding_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))

Modified: branches/string_checks/src/string/encoding/utf16.c
==============================================================================
--- branches/string_checks/src/string/encoding/utf16.c	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/src/string/encoding/utf16.c	Sun Oct 31 15:01:21 2010	(r49750)
@@ -8,7 +8,7 @@
 
 =head1 DESCRIPTION
 
-UTF-16 encoding with the help of the ICU library.
+UTF-16 encoding
 
 =head2 Functions
 
@@ -19,6 +19,7 @@
 */
 
 #include "parrot/parrot.h"
+#include "../unicode.h"
 #include "shared.h"
 
 /* HEADERIZER HFILE: none */
@@ -26,6 +27,20 @@
 /* HEADERIZER BEGIN: static */
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL utf16_decode(PARROT_INTERP, ARGIN(const utf16_t *p))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static utf16_t * utf16_encode(PARROT_INTERP,
+    ARGMOD(utf16_t *ptr),
+    UINTVAL c)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        FUNC_MODIFIES(*ptr);
+
 static UINTVAL utf16_iter_get(PARROT_INTERP,
     ARGIN(const STRING *str),
     ARGIN(const String_iter *i),
@@ -56,7 +71,7 @@
 static void utf16_iter_set_position(PARROT_INTERP,
     ARGIN(const STRING *str),
     ARGMOD(String_iter *i),
-    UINTVAL n)
+    UINTVAL pos)
         __attribute__nonnull__(1)
         __attribute__nonnull__(2)
         __attribute__nonnull__(3)
@@ -82,14 +97,19 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+PARROT_CANNOT_RETURN_NULL
 PARROT_WARN_UNUSED_RESULT
+static const utf16_t * utf16_skip_backward(
+    ARGIN(const utf16_t *p),
+    UINTVAL count)
+        __attribute__nonnull__(1);
+
 PARROT_CANNOT_RETURN_NULL
-static STRING * utf16_substr(PARROT_INTERP,
-    ARGIN(const STRING *src),
-    INTVAL offset,
-    INTVAL length)
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2);
+PARROT_WARN_UNUSED_RESULT
+static const utf16_t * utf16_skip_forward(
+    ARGIN(const utf16_t *p),
+    UINTVAL count)
+        __attribute__nonnull__(1);
 
 PARROT_WARN_UNUSED_RESULT
 PARROT_CANNOT_RETURN_NULL
@@ -97,6 +117,12 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+#define ASSERT_ARGS_utf16_decode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(p))
+#define ASSERT_ARGS_utf16_encode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(ptr))
 #define ASSERT_ARGS_utf16_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(str) \
@@ -123,32 +149,22 @@
 #define ASSERT_ARGS_utf16_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_utf16_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf16_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(p))
+#define ASSERT_ARGS_utf16_skip_forward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(p))
 #define ASSERT_ARGS_utf16_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 /* HEADERIZER END: static */
 
-#if PARROT_HAS_ICU
-#  include <unicode/utf16.h>
-#  include <unicode/ustring.h>
-#endif
-
-#define UNIMPL Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, \
-    "unimpl utf16")
-
 
 /*
 
 =item C<static STRING * utf16_to_encoding(PARROT_INTERP, const STRING *src)>
 
-Converts the string C<src> to this particular encoding.  If C<dest> is
-provided, it will contain the result.  Otherwise this function operates in
-place.
-
+Converts the string C<src> to this particular encoding.
 
 =cut
 
@@ -160,70 +176,47 @@
 utf16_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
 {
     ASSERT_ARGS(utf16_to_encoding)
-    int           src_len, dest_len;
-    Parrot_UInt2 *p;
-    STRING       *result;
-
-    if (src->encoding == Parrot_utf16_encoding_ptr
-    ||  src->encoding == Parrot_ucs2_encoding_ptr)
-        /* we have to use clone instead of copy because the Unicode upcase
-         * and downcase functions assume to get an unshared buffer */
-        return Parrot_str_clone(interp, src);
+    STRING  *result;
+    UINTVAL  src_len;
 
-    result  = Parrot_gc_new_string_header(interp, 0);
     src_len = STRING_length(src);
 
-    if (!src_len) {
+    if (STRING_max_bytes_per_codepoint(src) == 1) {
+        result           = Parrot_gc_new_string_header(interp, 0);
         result->encoding = Parrot_ucs2_encoding_ptr;
-        return result;
-    }
-
-    Parrot_gc_allocate_string_storage(interp, result, 2 * src_len);
-    p = (Parrot_UInt2 *)result->strstart;
+        result->bufused  = 2 * src_len;
+        result->strlen   = src_len;
 
-    if (STRING_max_bytes_per_codepoint(src) == 1) {
-        for (dest_len = 0; dest_len < src_len; ++dest_len) {
-            p[dest_len] = (unsigned char)src->strstart[dest_len];
-        }
-    }
-    else if (src->encoding == Parrot_utf8_encoding_ptr) {
-#if PARROT_HAS_ICU
-        UErrorCode err = U_ZERO_ERROR;
-
-        u_strFromUTF8(p, src_len, &dest_len, src->strstart, src->bufused, &err);
-
-        if (!U_SUCCESS(err)) {
-            /*
-             * have to resize - required len in UChars is in dest_len
-             */
-            result->bufused = 2 * dest_len;
-            Parrot_gc_reallocate_string_storage(interp, result, 2 * dest_len);
+        if (src_len) {
+            UINTVAL       i;
+            Parrot_UInt2 *p;
 
+            Parrot_gc_allocate_string_storage(interp, result, 2 * src_len);
             p = (Parrot_UInt2 *)result->strstart;
-            u_strFromUTF8(p, dest_len, &dest_len, src->strstart, src->bufused, &err);
-            PARROT_ASSERT(U_SUCCESS(err));
+
+            for (i = 0; i < src_len; ++i) {
+                p[i] = (unsigned char)src->strstart[i];
+            }
         }
-#else
-        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-            "no ICU lib loaded");
-#endif
     }
-    else {
-        UNIMPL;
+    else if (src->encoding == Parrot_utf16_encoding_ptr
+         ||  src->encoding == Parrot_ucs2_encoding_ptr) {
+        /* we have to use clone instead of copy because the Unicode upcase
+         * and downcase functions assume to get an unshared buffer */
+        result = Parrot_str_clone(interp, src);
     }
+    else {
+        result = encoding_to_encoding(interp, src, Parrot_utf16_encoding_ptr, 2.2);
 
-    result->bufused  = 2 * dest_len;
-    result->strlen   = src_len;
-
-    /* downgrade if possible */
-    if (dest_len == src_len)
-        result->encoding = Parrot_ucs2_encoding_ptr;
-    else
-        result->encoding = Parrot_utf16_encoding_ptr;
+        /* downgrade if possible */
+        if (result->bufused == result->strlen << 1)
+            result->encoding = Parrot_ucs2_encoding_ptr;
+    }
 
     return result;
 }
 
+
 /*
 
 =item C<static UINTVAL utf16_scan(PARROT_INTERP, const STRING *src)>
@@ -240,136 +233,200 @@
 utf16_scan(PARROT_INTERP, ARGIN(const STRING *src))
 {
     ASSERT_ARGS(utf16_scan)
-#if PARROT_HAS_ICU
-    const UChar * const s = (UChar*) src->strstart;
-    UINTVAL pos = 0, charpos = 0;
-    /*
-     * this is used to initially calculate src->strlen,
-     * therefore we must scan the whole string
-     */
-    while (pos * sizeof (UChar) < src->bufused) {
-        U16_FWD_1_UNSAFE(s, pos);
-        ++charpos;
-    }
-    return charpos;
-#else
-    UNUSED(src);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+    const utf16_t *p   = (utf16_t *)src->strstart;
+    UINTVAL        len = 0;
+    UINTVAL        i, n;
+
+    if (src->bufused & 1)
+        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+            "Unaligned end in UTF-16 string\n");
+
+    n = src->bufused >> 1;
+
+    for (i = 0; i < n; ++i) {
+        UINTVAL c = p[i];
+
+        if (UNICODE_IS_HIGH_SURROGATE(c)) {
+            ++i;
+
+            if (i >= n)
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+                    "Unaligned end in UTF-16 string\n");
+
+            if (!UNICODE_IS_LOW_SURROGATE(p[i]))
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+                    "Malformed UTF-16 string\n");
+
+            c = UNICODE_DECODE_SURROGATE(c, p[i]);
+        }
+        else {
+            if (UNICODE_IS_LOW_SURROGATE(c))
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+                    "Malformed UTF-16 string\n");
+        }
+
+        if (UNICODE_IS_NON_CHARACTER(c))
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF16,
+                "Non-character in UTF-16 string\n");
+
+        ++len;
+    }
+
+    return len;
 }
 
+
 /*
 
-=item C<static UINTVAL utf16_ord(PARROT_INTERP, const STRING *src, INTVAL idx)>
+=item C<static const utf16_t * utf16_skip_forward(const utf16_t *p, UINTVAL
+count)>
 
-Returns the codepoint in string C<src> at position C<offset>.
+Skips C<count> codepoints starting from C<p>. Returns the new pointer.
 
 =cut
 
 */
 
-static UINTVAL
-utf16_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static const utf16_t *
+utf16_skip_forward(ARGIN(const utf16_t *p), UINTVAL count)
 {
-    ASSERT_ARGS(utf16_ord)
-#if PARROT_HAS_ICU
-    const UINTVAL len = STRING_length(src);
-    const UChar  *s;
-    UINTVAL       c, pos;
+    ASSERT_ARGS(utf16_skip_forward)
+    UINTVAL i;
 
-    if (idx < 0)
-        idx += len;
+    for (i = 0; i < count; ++i) {
+        if (UNICODE_IS_HIGH_SURROGATE(*p))
+            p += 2;
+        else
+            p += 1;
+    }
 
-    if ((UINTVAL)idx >= len)
-        encoding_ord_error(interp, src, idx);
+    return p;
+}
 
-    s   = (UChar *)src->strstart;
-    pos = 0;
-    U16_FWD_N_UNSAFE(s, pos, idx);
-    U16_GET_UNSAFE(s, pos, c);
 
-    return c;
-#else
-    UNUSED(src);
-    UNUSED(idx);
+/*
+
+=item C<static const utf16_t * utf16_skip_backward(const utf16_t *p, UINTVAL
+count)>
+
+Skips C<count> codepoints backwards starting from C<p>. Returns the new
+pointer.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static const utf16_t *
+utf16_skip_backward(ARGIN(const utf16_t *p), UINTVAL count)
+{
+    ASSERT_ARGS(utf16_skip_backward)
+    UINTVAL i;
 
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+    for (i = 0; i < count; ++i) {
+        --p;
+        if (UNICODE_IS_LOW_SURROGATE(*p))
+            --p;
+    }
+
+    return p;
 }
 
+
 /*
 
-=item C<static STRING * utf16_substr(PARROT_INTERP, const STRING *src, INTVAL
-offset, INTVAL length)>
+=item C<static UINTVAL utf16_decode(PARROT_INTERP, const utf16_t *p)>
 
-Returns the codepoints in string C<src> at position C<offset> and length
-C<count>.
+Decodes the codepoint starting at C<p>.
 
 =cut
 
 */
 
 PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-utf16_substr(PARROT_INTERP, ARGIN(const STRING *src), INTVAL offset, INTVAL length)
+static UINTVAL
+utf16_decode(PARROT_INTERP, ARGIN(const utf16_t *p))
 {
-    ASSERT_ARGS(utf16_substr)
-#if PARROT_HAS_ICU
-    const UChar * const s = (UChar*) src->strstart;
-    const UINTVAL  strlen = STRING_length(src);
-    STRING        *return_string;
-    UINTVAL        pos = 0, start;
+    UINTVAL c = *p;
 
-    if (offset < 0)
-        offset += strlen;
+    if (UNICODE_IS_HIGH_SURROGATE(c))
+        c = UNICODE_DECODE_SURROGATE(c, p[1]);
 
-    if ((UINTVAL)offset >= strlen || length <= 0) {
-        /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
-        if ((UINTVAL)offset == strlen || length <= 0)
-            return Parrot_str_new_noinit(interp, 0);
+    return c;
+}
 
-        Parrot_ex_throw_from_c_args(interp, NULL,
-            EXCEPTION_SUBSTR_OUT_OF_STRING,
-            "Cannot take substr outside string");
-    }
 
-    return_string = Parrot_str_copy(interp, src);
+/*
 
-    if (offset == 0 && (UINTVAL)length >= strlen)
-        return return_string;
+=item C<static utf16_t * utf16_encode(PARROT_INTERP, utf16_t *ptr, UINTVAL c)>
 
-    U16_FWD_N_UNSAFE(s, pos, offset);
+Encodes codepoint C<c> at C<ptr>. Returns a pointer just past the encoded data.
 
-    start = pos * sizeof (UChar);
-    return_string->strstart += start;
+=cut
 
-    if ((UINTVAL)length >= strlen - (UINTVAL)offset) {
-        return_string->bufused -= start;
-        return_string->strlen  -= offset;
+*/
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static utf16_t *
+utf16_encode(PARROT_INTERP, ARGMOD(utf16_t *ptr), UINTVAL c)
+{
+    ASSERT_ARGS(utf16_encode)
+
+    if (c < 0xFFFE) {
+        if (UNICODE_IS_SURROGATE(c)
+        || (c >= 0xFDD0 && c <= 0xFDEF))
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                    "Invalid character for UTF-16 encoding\n");
+
+        *ptr++ = c;
     }
     else {
-        U16_FWD_N_UNSAFE(s, pos, length);
-        return_string->bufused = pos * sizeof (UChar) - start;
-        return_string->strlen  = length;
-    }
-
-    return_string->hashval = 0;
-
-    return return_string;
-#else
-    UNUSED(src);
-    UNUSED(offset);
-    UNUSED(length);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+        if ((c & 0xFFFE) == 0xFFFE
+        ||   c > 0x10FFFF)
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                    "Invalid character for UTF-16 encoding\n");
+
+        *ptr++ = UNICODE_HIGH_SURROGATE(c);
+        *ptr++ = UNICODE_LOW_SURROGATE(c);
+    }
+
+    return ptr;
+}
+
+
+/*
+
+=item C<static UINTVAL utf16_ord(PARROT_INTERP, const STRING *src, INTVAL idx)>
+
+Returns the codepoint in string C<src> at position C<idx>.
+
+=cut
+
+*/
+
+static UINTVAL
+utf16_ord(PARROT_INTERP, ARGIN(const STRING *src), INTVAL idx)
+{
+    ASSERT_ARGS(utf16_ord)
+    const UINTVAL  len = STRING_length(src);
+    const utf16_t *start;
+
+    if (idx < 0)
+        idx += len;
+
+    if ((UINTVAL)idx >= len)
+        encoding_ord_error(interp, src, idx);
+
+    start = utf16_skip_forward((const utf16_t *)src->strstart, idx);
+
+    return utf16_decode(interp, start);
 }
 
+
 /*
 
 =item C<static UINTVAL utf16_iter_get(PARROT_INTERP, const STRING *str, const
@@ -386,28 +443,16 @@
     ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
 {
     ASSERT_ARGS(utf16_iter_get)
-#if PARROT_HAS_ICU
-    const UChar * const s = (UChar*) str->strstart;
-    UINTVAL c, pos;
-
-    pos = i->bytepos / sizeof (UChar);
-    if (offset > 0) {
-        U16_FWD_N_UNSAFE(s, pos, offset);
-    }
-    else if (offset < 0) {
-        U16_BACK_N_UNSAFE(s, pos, -offset);
-    }
-    U16_GET_UNSAFE(s, pos, c);
+    const utf16_t *ptr = (utf16_t *)(str->strstart + i->bytepos);
 
-    return c;
-#else
-    UNUSED(str);
-    UNUSED(i);
-    UNUSED(offset);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+    PARROT_ASSERT((UINTVAL)(i->charpos + offset) <= str->strlen);
+
+    if (offset > 0)
+        ptr = utf16_skip_forward(ptr, offset);
+    else if (offset < 0)
+        ptr = utf16_skip_backward(ptr, -offset);
+
+    return utf16_decode(interp, ptr);
 }
 
 /*
@@ -426,27 +471,20 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
 {
     ASSERT_ARGS(utf16_iter_skip)
-#if PARROT_HAS_ICU
-    const UChar * const s = (UChar*) str->strstart;
-    UINTVAL pos = i->bytepos / sizeof (UChar);
-
-    if (skip > 0) {
-        U16_FWD_N_UNSAFE(s, pos, skip);
-    }
-    else if (skip < 0) {
-        U16_BACK_N_UNSAFE(s, pos, -skip);
-    }
+    const utf16_t *ptr = (utf16_t *)(str->strstart + i->bytepos);
 
     i->charpos += skip;
-    i->bytepos = pos * sizeof (UChar);
-#else
-    UNUSED(str);
-    UNUSED(i);
-    UNUSED(skip);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+
+    PARROT_ASSERT(i->charpos <= str->strlen);
+
+    if (skip > 0)
+        ptr = utf16_skip_forward(ptr, skip);
+    else if (skip < 0)
+        ptr = utf16_skip_backward(ptr, -skip);
+
+    i->bytepos = (const char *)ptr - (const char *)str->strstart;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 }
 
 /*
@@ -466,24 +504,19 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i))
 {
     ASSERT_ARGS(utf16_iter_get_and_advance)
-#if PARROT_HAS_ICU
-    const UChar * const s = (UChar*) str->strstart;
-    UINTVAL c, pos;
-    pos = i->bytepos / sizeof (UChar);
-    /* TODO either make sure that we don't go past end or use SAFE
-     *      iter versions
-     */
-    U16_NEXT_UNSAFE(s, pos, c);
-    i->charpos++;
-    i->bytepos = pos * sizeof (UChar);
+    const utf16_t *ptr = (utf16_t *)(str->strstart + i->bytepos);
+    UINTVAL        c   = utf16_decode(interp, ptr);
+
+    i->charpos += 1;
+
+    if (UNICODE_IS_HIGH_SURROGATE(*ptr))
+        i->bytepos += 4;
+    else
+        i->bytepos += 2;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
+
     return c;
-#else
-    UNUSED(str);
-    UNUSED(i);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
 }
 
 /*
@@ -503,29 +536,21 @@
     ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
 {
     ASSERT_ARGS(utf16_iter_set_and_advance)
-#if PARROT_HAS_ICU
-    UChar * const s = (UChar*) str->strstart;
-    UINTVAL pos;
-    pos = i->bytepos / sizeof (UChar);
-    U16_APPEND_UNSAFE(s, pos, c);
-    i->charpos++;
-    i->bytepos = pos * sizeof (UChar);
-#else
-    UNUSED(str);
-    UNUSED(i);
-    UNUSED(c);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+    utf16_t * const ptr = (utf16_t *)(str->strstart + i->bytepos);
+    utf16_t * const end = utf16_encode(interp, ptr, c);
+
+    i->charpos += 1;
+    i->bytepos += (char *)end - (char *)ptr;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 }
 
 /*
 
 =item C<static void utf16_iter_set_position(PARROT_INTERP, const STRING *str,
-String_iter *i, UINTVAL n)>
+String_iter *i, UINTVAL pos)>
 
-Moves the string iterator C<i> to the position C<n> in the string.
+Moves the string iterator C<i> to the position C<pos> in the string.
 
 =cut
 
@@ -533,24 +558,53 @@
 
 static void
 utf16_iter_set_position(PARROT_INTERP,
-    ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n)
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos)
 {
     ASSERT_ARGS(utf16_iter_set_position)
-#if PARROT_HAS_ICU
-    UChar * const s = (UChar*) str->strstart;
-    UINTVAL pos;
-    pos = 0;
-    U16_FWD_N_UNSAFE(s, pos, n);
-    i->charpos = n;
-    i->bytepos = pos * sizeof (UChar);
-#else
-    UNUSED(str);
-    UNUSED(i);
-    UNUSED(n);
-
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
+    const utf16_t *ptr;
+
+    if (pos == 0) {
+        i->charpos = 0;
+        i->bytepos = 0;
+        return;
+    }
+
+    PARROT_ASSERT(pos <= str->strlen);
+
+    /*
+     * we know the byte offsets of three positions: start, current and end
+     * now find the shortest way to reach pos
+     */
+    if (pos < i->charpos) {
+        if (pos <= (i->charpos >> 1)) {
+            /* go forward from start */
+            ptr = (utf16_t *)str->strstart;
+            ptr = utf16_skip_forward(ptr, pos);
+        }
+        else {
+            /* go backward from current */
+            ptr = (utf16_t *)(str->strstart + i->bytepos);
+            ptr = utf16_skip_backward(ptr, i->charpos - pos);
+        }
+    }
+    else {
+        const UINTVAL  len = str->strlen;
+        if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
+            /* go forward from current */
+            ptr = (utf16_t *)(str->strstart + i->bytepos);
+            ptr = utf16_skip_forward(ptr, pos - i->charpos);
+        }
+        else {
+            /* go backward from end */
+            ptr = (utf16_t *)(str->strstart + str->bufused);
+            ptr = utf16_skip_backward(ptr, len - pos);
+        }
+    }
+
+    i->charpos = pos;
+    i->bytepos = (const char *)ptr - (const char *)str->strstart;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 }
 
 
@@ -568,11 +622,11 @@
     encoding_index,
     encoding_rindex,
     encoding_hash,
-    unicode_validate,
+    encoding_validate,
 
     utf16_scan,
     utf16_ord,
-    utf16_substr,
+    encoding_substr,
 
     encoding_is_cclass,
     encoding_find_cclass,

Modified: branches/string_checks/src/string/encoding/utf8.c
==============================================================================
--- branches/string_checks/src/string/encoding/utf8.c	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/src/string/encoding/utf8.c	Sun Oct 31 15:01:21 2010	(r49750)
@@ -32,9 +32,10 @@
         __attribute__nonnull__(2);
 
 PARROT_CANNOT_RETURN_NULL
-static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
+static utf8_t * utf8_encode(PARROT_INTERP, ARGMOD(utf8_t *ptr), UINTVAL c)
         __attribute__nonnull__(1)
-        __attribute__nonnull__(2);
+        __attribute__nonnull__(2)
+        FUNC_MODIFIES(*ptr);
 
 static UINTVAL utf8_iter_get(PARROT_INTERP,
     ARGIN(const STRING *str),
@@ -88,11 +89,13 @@
 
 PARROT_WARN_UNUSED_RESULT
 PARROT_CANNOT_RETURN_NULL
-static const utf8_t * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
+static const utf8_t * utf8_skip_backward(
+    ARGIN(const utf8_t *ptr),
+    UINTVAL n)
         __attribute__nonnull__(1);
 
 PARROT_CANNOT_RETURN_NULL
-static const utf8_t * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
+static const utf8_t * utf8_skip_forward(ARGIN(const utf8_t *ptr), UINTVAL n)
         __attribute__nonnull__(1);
 
 PARROT_CAN_RETURN_NULL
@@ -180,56 +183,14 @@
 utf8_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
 {
     ASSERT_ARGS(utf8_to_encoding)
-    STRING *result;
-    const STR_VTABLE *src_encoding = src->encoding;
-    UINTVAL dest_len, dest_pos, src_len;
-    unsigned char *p;
-
-    if (src_encoding == Parrot_utf8_encoding_ptr)
-        return Parrot_str_clone(interp, src);
-
-    src_len          = src->strlen;
-    result           = Parrot_gc_new_string_header(interp, 0);
-    result->encoding = Parrot_utf8_encoding_ptr;
-    result->strlen   = src_len;
-
-    if (!src_len)
-        return result;
-
-    Parrot_gc_allocate_string_storage(interp, result, src_len);
-    p = (unsigned char *)result->strstart;
-
-    if (src_encoding == Parrot_ascii_encoding_ptr) {
-        for (dest_len = 0; dest_len < src_len; ++dest_len) {
-            p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
-        }
-        result->bufused = dest_len;
+    STRING  *result;
+
+    if (src->encoding == Parrot_ascii_encoding_ptr) {
+        result           = Parrot_str_clone(interp, src);
+        result->encoding = Parrot_utf8_encoding_ptr;
     }
     else {
-        String_iter src_iter;
-        STRING_ITER_INIT(interp, &src_iter);
-        dest_len = src_len;
-        dest_pos = 0;
-        while (src_iter.charpos < src_len) {
-            const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter);
-            unsigned char *new_pos;
-            unsigned char *pos;
-
-            if (dest_len - dest_pos < 6) {
-                UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5);
-                if (need < 16)
-                    need = 16;
-                dest_len += need;
-                result->bufused = dest_pos;
-                Parrot_gc_reallocate_string_storage(interp, result, dest_len);
-                p = (unsigned char *)result->strstart;
-            }
-
-            pos = p + dest_pos;
-            new_pos = (unsigned char *)utf8_encode(interp, pos, c);
-            dest_pos += (new_pos - pos);
-        }
-        result->bufused = dest_pos;
+        result = encoding_to_encoding(interp, src, Parrot_utf8_encoding_ptr, 1.2);
     }
 
     return result;
@@ -322,7 +283,7 @@
     if ((UINTVAL)idx >= len)
         encoding_ord_error(interp, src, idx);
 
-    start = utf8_skip_forward(src->strstart, idx);
+    start = utf8_skip_forward((utf8_t *)src->strstart, idx);
 
     return utf8_decode(interp, start);
 }
@@ -363,7 +324,7 @@
 
 /*
 
-=item C<static void * utf8_encode(PARROT_INTERP, void *ptr, UINTVAL c)>
+=item C<static utf8_t * utf8_encode(PARROT_INTERP, utf8_t *ptr, UINTVAL c)>
 
 Returns the UTF-8 encoding of integer C<c>.
 
@@ -372,35 +333,31 @@
 */
 
 PARROT_CANNOT_RETURN_NULL
-static void *
-utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
+static utf8_t *
+utf8_encode(PARROT_INTERP, ARGMOD(utf8_t *ptr), UINTVAL c)
 {
     ASSERT_ARGS(utf8_encode)
-    const UINTVAL        len   = UNISKIP(c);
-
-    /* the const is good on u8ptr, but using ptr on other variables avoids the
-     * need to do a yucky cast to remove constness */
-    const utf8_t * const u8ptr = (utf8_t *)ptr;
-    utf8_t              *u8end = (utf8_t *)ptr + len - 1;
+    const UINTVAL  len = UNISKIP(c);
+    utf8_t        *end = ptr + len - 1;
 
     if (UNICODE_IS_INVALID(c))
         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                 "Invalid character for UTF-8 encoding\n");
 
-    while (u8end > u8ptr) {
-        *u8end-- =
-            (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK);
+    while (end > ptr) {
+        *end-- = (c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK;
         c >>= UTF8_ACCUMULATION_SHIFT;
     }
-    *u8end = (utf8_t)((c & UTF8_START_MASK(len)) | UTF8_START_MARK(len));
 
-    return (utf8_t *)ptr + len;
+    *end = (c & UTF8_START_MASK(len)) | UTF8_START_MARK(len);
+
+    return ptr + len;
 }
 
 
 /*
 
-=item C<static const utf8_t * utf8_skip_forward(const void *ptr, UINTVAL n)>
+=item C<static const utf8_t * utf8_skip_forward(const utf8_t *ptr, UINTVAL n)>
 
 Moves C<ptr> C<n> characters forward.
 
@@ -410,22 +367,21 @@
 
 PARROT_CANNOT_RETURN_NULL
 static const utf8_t *
-utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
+utf8_skip_forward(ARGIN(const utf8_t *ptr), UINTVAL n)
 {
     ASSERT_ARGS(utf8_skip_forward)
-    const utf8_t *u8ptr = (const utf8_t *)ptr;
 
     while (n-- > 0) {
-        u8ptr += UTF8SKIP(u8ptr);
+        ptr += UTF8SKIP(ptr);
     }
 
-    return u8ptr;
+    return ptr;
 }
 
 
 /*
 
-=item C<static const utf8_t * utf8_skip_backward(const void *ptr, UINTVAL n)>
+=item C<static const utf8_t * utf8_skip_backward(const utf8_t *ptr, UINTVAL n)>
 
 Moves C<ptr> C<n> characters back.
 
@@ -436,18 +392,17 @@
 PARROT_WARN_UNUSED_RESULT
 PARROT_CANNOT_RETURN_NULL
 static const utf8_t *
-utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
+utf8_skip_backward(ARGIN(const utf8_t *ptr), UINTVAL n)
 {
     ASSERT_ARGS(utf8_skip_backward)
-    const utf8_t *u8ptr = (const utf8_t *)ptr;
 
     while (n-- > 0) {
-        --u8ptr;
-        while (UTF8_IS_CONTINUATION(*u8ptr))
-            --u8ptr;
+        --ptr;
+        while (UTF8_IS_CONTINUATION(*ptr))
+            --ptr;
     }
 
-    return u8ptr;
+    return ptr;
 }
 
 
@@ -467,18 +422,16 @@
     ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
 {
     ASSERT_ARGS(utf8_iter_get)
-    const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+    const utf8_t *ptr = (utf8_t *)(str->strstart + i->bytepos);
 
     PARROT_ASSERT(i->charpos + offset < str->strlen);
 
-    if (offset > 0) {
-        u8ptr = utf8_skip_forward(u8ptr, offset);
-    }
-    else if (offset < 0) {
-        u8ptr = utf8_skip_backward(u8ptr, -offset);
-    }
+    if (offset > 0)
+        ptr = utf8_skip_forward(ptr, offset);
+    else if (offset < 0)
+        ptr = utf8_skip_backward(ptr, -offset);
 
-    return utf8_decode(interp, u8ptr);
+    return utf8_decode(interp, ptr);
 }
 
 
@@ -498,20 +451,18 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
 {
     ASSERT_ARGS(utf8_iter_skip)
-    const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+    const utf8_t *ptr = (utf8_t *)(str->strstart + i->bytepos);
 
     i->charpos += skip;
 
     PARROT_ASSERT(i->charpos <= str->strlen);
 
-    if (skip > 0) {
-        u8ptr = utf8_skip_forward(u8ptr, skip);
-    }
-    else if (skip < 0) {
-        u8ptr = utf8_skip_backward(u8ptr, -skip);
-    }
+    if (skip > 0)
+        ptr = utf8_skip_forward(ptr, skip);
+    else if (skip < 0)
+        ptr = utf8_skip_backward(ptr, -skip);
 
-    i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+    i->bytepos = (const char *)ptr - (const char *)str->strstart;
 
     PARROT_ASSERT(i->bytepos <= str->bufused);
 }
@@ -534,11 +485,11 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i))
 {
     ASSERT_ARGS(utf8_iter_get_and_advance)
-    const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
-    UINTVAL c = utf8_decode(interp, u8ptr);
+    const utf8_t *ptr = (utf8_t *)(str->strstart + i->bytepos);
+    UINTVAL       c   = utf8_decode(interp, ptr);
 
     i->charpos += 1;
-    i->bytepos += UTF8SKIP(u8ptr);
+    i->bytepos += UTF8SKIP(ptr);
 
     PARROT_ASSERT(i->bytepos <= str->bufused);
 
@@ -563,11 +514,11 @@
     ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
 {
     ASSERT_ARGS(utf8_iter_set_and_advance)
-    unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos;
-    unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
+    utf8_t * const ptr = (utf8_t *)(str->strstart + i->bytepos);
+    utf8_t * const end = utf8_encode(interp, ptr, c);
 
     i->charpos += 1;
-    i->bytepos += new_pos - pos;
+    i->bytepos += end - ptr;
 
     PARROT_ASSERT(i->bytepos <= str->bufused);
 }
@@ -590,7 +541,7 @@
     ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos)
 {
     ASSERT_ARGS(utf8_iter_set_position)
-    const utf8_t *u8ptr = (const utf8_t *)str->strstart;
+    const utf8_t *ptr = (utf8_t *)str->strstart;
 
     if (pos == 0) {
         i->charpos = 0;
@@ -607,27 +558,27 @@
     if (pos < i->charpos) {
         if (pos <= (i->charpos >> 1)) {
             /* go forward from start */
-            u8ptr = utf8_skip_forward(u8ptr, pos);
+            ptr = utf8_skip_forward(ptr, pos);
         }
         else {
             /* go backward from current */
-            u8ptr = utf8_skip_backward(u8ptr + i->bytepos, i->charpos - pos);
+            ptr = utf8_skip_backward(ptr + i->bytepos, i->charpos - pos);
         }
     }
     else {
         const UINTVAL  len = str->strlen;
         if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
             /* go forward from current */
-            u8ptr = utf8_skip_forward(u8ptr + i->bytepos, pos - i->charpos);
+            ptr = utf8_skip_forward(ptr + i->bytepos, pos - i->charpos);
         }
         else {
             /* go backward from end */
-            u8ptr = utf8_skip_backward(u8ptr + str->bufused, len - pos);
+            ptr = utf8_skip_backward(ptr + str->bufused, len - pos);
         }
     }
 
     i->charpos = pos;
-    i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+    i->bytepos = (const char *)ptr - (const char *)str->strstart;
 
     PARROT_ASSERT(i->bytepos <= str->bufused);
 }

Modified: branches/string_checks/src/string/unicode.h
==============================================================================
--- branches/string_checks/src/string/unicode.h	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/src/string/unicode.h	Sun Oct 31 15:01:21 2010	(r49750)
@@ -14,8 +14,8 @@
 #define PARROT_UNICODE_H_GUARD
 
 typedef unsigned char utf8_t;
-typedef unsigned short utf16_t;
-typedef unsigned long utf32_t;
+typedef Parrot_UInt2  utf16_t;
+typedef Parrot_Int4   utf32_t;
 
 #define UNICODE_SURROGATE_FIRST         0xD800u
 #define UNICODE_SURROGATE_LAST          0xDFFFu

Modified: branches/string_checks/t/op/string_cs.t
==============================================================================
--- branches/string_checks/t/op/string_cs.t	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/t/op/string_cs.t	Sun Oct 31 15:01:21 2010	(r49750)
@@ -460,10 +460,7 @@
 abcdefgefgefgefghi\xc2\xa9jk
 OUTPUT
 
-SKIP: {
-    skip( 'no ICU lib', 19 ) unless $PConfig{has_icu};
-
-    pir_output_is( <<'CODE', <<OUTPUT, "literal encoding persistence - TT #468" );
+pir_output_is( <<'CODE', <<OUTPUT, "literal encoding persistence - TT #468" );
 .include 'stdio.pasm'
 .sub main
     # set output encoding to normalize printed strings
@@ -501,7 +498,7 @@
 hello(10): ucs2
 OUTPUT
 
-    pir_output_is( <<'CODE', <<OUTPUT, "empty literal encoding persistence - TT #1791");
+pir_output_is( <<'CODE', <<OUTPUT, "empty literal encoding persistence - TT #1791");
 .sub main
     load_bytecode 't/op/testlib/test_strings.pbc'
     $P0 = 'get_empties'()
@@ -533,58 +530,14 @@
 (0): ucs2
 OUTPUT
 
-    pir_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
-.sub main :main
-    set $S0, iso-8859-1:"TÖTSCH"
-    find_encoding $I0, "utf8"
-    trans_encoding $S1, $S0, $I0
-    $S1 = downcase $S1
-    getstdout $P0           # need to convert back to utf8
-    $P0.'encoding'("utf8")  # set utf8 output
-    print $S1
-    print "\n"
-    end
-.end
-CODE
-t\xc3\xb6tsch
-OUTPUT
-
-    pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_encoding_s_s_i" );
-    set S0, iso-8859-1:"TÖTSCH"
-    find_encoding I0, "utf8"
-    trans_encoding S1, S0, I0
-    downcase S1, S1
-    find_encoding I0, "iso-8859-1"
-    trans_encoding S1, S1, I0
-    print S1
-    print "\n"
-    end
-CODE
-t\xf6tsch
-OUTPUT
-
-    pasm_error_output_like( <<'CODE', <<"OUTPUT", "negative encoding number" );
+pasm_error_output_like( <<'CODE', <<"OUTPUT", "negative encoding number" );
     trans_encoding S2, 'foo', -1
     end
 CODE
 /encoding #-1 not found/
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transencoding" );
-    set S0, iso-8859-1:"TÖTSCH"
-    find_encoding I0, "utf8"
-    trans_encoding S1, S0, I0
-    downcase S1, S1
-    find_encoding I0, "utf8"
-    trans_encoding S2, S1, I0
-    print S2
-    print "\n"
-    end
-CODE
-t\xc3\xb6tsch
-OUTPUT
-
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
     set S1, iso-8859-1:"TÖTSCH"
     find_encoding I0, "utf16"
     trans_encoding S1, S1, I0
@@ -605,7 +558,7 @@
 84_214_84_83_67_72_
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
     set S0, iso-8859-1:"TTÖÖ"
     find_encoding I0, "utf8"
     trans_encoding S1, S0, I0
@@ -624,7 +577,7 @@
 TT 2 2
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
     set S1, iso-8859-1:"Tötsch"
     find_encoding I0, "utf16"
     trans_encoding S1, S1, I0
@@ -646,7 +599,7 @@
 T\xc3\xb6tsch Leo
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
     set S1, iso-8859-1:"Tötsch"
     find_encoding I0, "utf16"
     trans_encoding S1, S1, I0
@@ -668,7 +621,7 @@
 T\xc3\xb6tsch Leo
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
     set S1, iso-8859-1:"Tötsch"
     find_encoding I0, "utf16"
     trans_encoding S1, S1, I0
@@ -682,7 +635,7 @@
 \xc3\xb6t
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
     set S1, iso-8859-1:"Tötsch"
     find_encoding I0, "utf16"
     trans_encoding S1, S1, I0
@@ -701,11 +654,10 @@
 Toetsch
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
-    set S0, iso-8859-1:"TÖTSCH"
-    find_encoding I0, "utf8"
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
+    set S0, iso-8859-1:"tötsch"
+    find_encoding I0, "utf16"
     trans_encoding S1, S0, I0
-    downcase S1, S1
     set S2, iso-8859-1:"öt"
     index I0, S1, S2
     print I0
@@ -715,11 +667,10 @@
 1
 OUTPUT
 
-    pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
-    set S0, iso-8859-1:"TÖTSCH"
-    find_encoding I0, "utf8"
+pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
+    set S0, iso-8859-1:"tötsch"
+    find_encoding I0, "utf16"
     trans_encoding S1, S0, I0
-    downcase S1, S1
     set S2, iso-8859-1:"öt"
     index I0, S1, S2
     print I0
@@ -734,6 +685,53 @@
 6
 OUTPUT
 
+SKIP: {
+    skip( 'no ICU lib', 8 ) unless $PConfig{has_icu};
+
+    pir_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
+.sub main :main
+    set $S0, iso-8859-1:"TÖTSCH"
+    find_encoding $I0, "utf8"
+    trans_encoding $S1, $S0, $I0
+    $S1 = downcase $S1
+    getstdout $P0           # need to convert back to utf8
+    $P0.'encoding'("utf8")  # set utf8 output
+    print $S1
+    print "\n"
+    end
+.end
+CODE
+t\xc3\xb6tsch
+OUTPUT
+
+    pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_encoding_s_s_i" );
+    set S0, iso-8859-1:"TÖTSCH"
+    find_encoding I0, "utf8"
+    trans_encoding S1, S0, I0
+    downcase S1, S1
+    find_encoding I0, "iso-8859-1"
+    trans_encoding S1, S1, I0
+    print S1
+    print "\n"
+    end
+CODE
+t\xf6tsch
+OUTPUT
+
+    pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transencoding" );
+    set S0, iso-8859-1:"TÖTSCH"
+    find_encoding I0, "utf8"
+    trans_encoding S1, S0, I0
+    downcase S1, S1
+    find_encoding I0, "utf8"
+    trans_encoding S2, S1, I0
+    print S2
+    print "\n"
+    end
+CODE
+t\xc3\xb6tsch
+OUTPUT
+
     pir_output_is( <<'CODE', <<"OUTPUT", "unicode upcase" );
 .sub main :main
     set $S0, iso-8859-1:"tötsch"

Modified: branches/string_checks/t/op/stringu.t
==============================================================================
--- branches/string_checks/t/op/stringu.t	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/t/op/stringu.t	Sun Oct 31 15:01:21 2010	(r49750)
@@ -6,7 +6,7 @@
 use warnings;
 use lib qw( . lib ../lib ../../lib );
 use Test::More;
-use Parrot::Test tests => 35;
+use Parrot::Test tests => 36;
 use Parrot::Config;
 
 =head1 NAME
@@ -237,19 +237,6 @@
 \xc2\xab
 OUTPUT
 
-pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
-    set S0, utf8:unicode:"\xc2\xab"
-    length I0, S0
-    print I0
-    print "\n"
-    print S0
-    print "\n"
-    end
-CODE
-1
-\xc2\xab
-OUTPUT
-
 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
     set S0, ascii:"«"
     length I0, S0
@@ -508,9 +495,6 @@
 hello
 OUTPUT
 
-
-SKIP: {
-    skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to int' );
 .sub main :main
      $S0 = "140"
@@ -555,7 +539,6 @@
 140
 140
 OUT
-}
 
 pir_output_is( <<'CODE', <<'OUT', 'concatenation of utf8 and iso-8859-1 (TT #752)' );
 .sub 'main'
@@ -712,6 +695,119 @@
 0x10FFFD
 OUT
 
+sub units_to_code {
+    my $code = '';
+
+    for my $unit (@_) {
+        my $str = pack('S*', @$unit);
+        $str =~ s/./sprintf("\\x%02X", ord($&))/egs;
+        $code .= qq{    'test_chars'(binary:"$str")\n};
+    }
+
+    return $code;
+}
+
+my $code = qq{    'test_chars'(binary:"\\x41\\x42\\x43")\n};
+$code .= units_to_code(
+    [ 0xD800 ],
+    [ 0xDFFF ],
+    [ 0xD800, 0x0041 ],
+    [ 0xD900, 0xDAFF ],
+    [ 0xDBFF, 0xD800 ],
+    [ 0xDC00, 0xD8FF ],
+    [ 0xDDFF, 0xDE00 ],
+    [ 0xDFFF, 0x0041 ],
+    [ 0xFDD0 ],
+    [ 0xFDEF ],
+    [ 0xFFFE ],
+    [ 0xFFFF ],
+    [ 0xD83F, 0xDFFF ],
+    [ 0xDBFF, 0xDFFE ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'illegal utf16 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc eh, ex, bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    eh = new 'ExceptionHandler'
+    set_addr eh, handler
+    push_eh eh
+    chars = bb.'get_string'('utf16')
+    say 'valid'
+    goto end
+  handler:
+    .local pmc ex
+    .get_results (ex)
+    \$S0 = ex['message']
+    print \$S0
+  end:
+    pop_eh
+.end
+CODE
+Unaligned end in UTF-16 string
+Unaligned end in UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Malformed UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+Non-character in UTF-16 string
+OUT
+
+$code = units_to_code(
+    [ 0x0041 ],
+    [ 0xD7FF ],
+    [ 0xE000 ],
+    [ 0xFDCF ],
+    [ 0xFDF0 ],
+    [ 0xFFFD ],
+    [ 0xD800, 0xDC00 ],
+    [ 0xD912, 0xDE34 ],
+    [ 0xDBFF, 0xDFFD ],
+);
+
+pir_output_is( <<CODE, <<'OUT', 'valid utf16 chars' );
+.sub 'main'
+$code
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    chars = bb.'get_string'('utf16')
+    \$I0 = ord chars
+    \$P0 = new 'FixedIntegerArray', 1
+    \$P0[0] = \$I0
+    \$S0 = sprintf '0x%X', \$P0
+    say \$S0
+.end
+CODE
+0x41
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+0x10000
+0x54A34
+0x10FFFD
+OUT
+
 SKIP: {
     skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
 

Modified: branches/string_checks/t/pmc/bytebuffer.t
==============================================================================
--- branches/string_checks/t/pmc/bytebuffer.t	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/t/pmc/bytebuffer.t	Sun Oct 31 15:01:21 2010	(r49750)
@@ -152,27 +152,10 @@
     is(n, 4, "getting ascii from buffer gives correct length")
     is(s, "abcd", "getting ascii from buffer gives correct content")
 
-    $I0 = hasicu()
-    unless $I0 goto skip_it
-
     bb = new ['ByteBuffer']
 
     # Upper case n tilde: codepoint 0xD1, utf8 encoding 0xC3, 0x91
-    #bb = utf16:"\x{D1}"
-    # Can't do that, or the program can't be compiled without ICU.
-    # Fill the buffer with bytes instead.
-
-    # Get endianess to set the bytes in the appropiate order.
-    # *** XXX *** Need report from big endian platforms.
-    big = isbigendian()
-    if big goto isbig
-    bb[0] = 0xD1
-    bb[1] = 0x00
-    goto doit
-isbig:
-    bb[0] = 0x00
-    bb[1] = 0xD1
-doit:
+    bb = utf16:"\x{D1}"
     s = bb.'get_string'('utf16')
     n = length s
     is(n, 1, "getting utf16 from buffer gives correct length")
@@ -186,10 +169,6 @@
     is(n, 1, "getting utf8 from buffer gives correct length")
     n = ord s
     is(n, 0xD1, "getting utf8 from buffer gives correct codepoint")
-    goto end
-skip_it:
-    skip(4, "this test needs ICU")
-end:
 .end
 
 .sub test_push
@@ -280,9 +259,6 @@
     .local pmc bb
     .local int i, big, pos, b0, b1, c
 
-    $I0 = hasicu()
-    unless $I0 goto skip_it
-
     # Get endianess to set the bytes in the appropiate order.
     # *** XXX *** Need report from big endian platforms.
     big = isbigendian()
@@ -329,9 +305,6 @@
 failed:
     say i
     ok(0, "reallocation")
-    goto end
-skip_it:
-    skip(1, "this test needs ICU")
 end:
 .end
 

Modified: branches/string_checks/t/tools/pbc_disassemble.t
==============================================================================
--- branches/string_checks/t/tools/pbc_disassemble.t	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/t/tools/pbc_disassemble.t	Sun Oct 31 15:01:21 2010	(r49750)
@@ -86,16 +86,13 @@
 .end
 PIR
 
-SKIP: {
-    skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
-    my $utf16 = pack('S*', unpack('C*', 'Hello'));
-    $utf16 =~ s/\0/\\\\0/g;
-    disassemble_output_like( <<PIR, "pir", qr/set_s_sc S0,utf16:"$utf16"/ms, 'pbc_disassemble utf16 string');
+my $utf16 = pack('S*', unpack('C*', 'Hello'));
+$utf16 =~ s/\0/\\\\0/g;
+disassemble_output_like( <<PIR, "pir", qr/set_s_sc S0,utf16:"$utf16"/ms, 'pbc_disassemble utf16 string');
 .sub main :main
     \$S0 = utf16:"Hello"
 .end
 PIR
-}
 
 =head1 HELPER SUBROUTINES
 

Modified: branches/string_checks/t/tools/pbc_dump.t
==============================================================================
--- branches/string_checks/t/tools/pbc_dump.t	Sun Oct 31 15:00:38 2010	(r49749)
+++ branches/string_checks/t/tools/pbc_dump.t	Sun Oct 31 15:01:21 2010	(r49750)
@@ -73,14 +73,11 @@
 PIR
 
 for my $enc qw(binary iso-8859-1 utf8 utf16 ucs2 ucs4) {
-    SKIP: {
-        skip( 'no ICU lib', 1 ) if $enc eq 'utf16' && !$PConfig{has_icu};
-        dump_output_like( <<PIR, "pir", qr/ENCODING.*=>.*$enc/ms, "pbc_dump $enc encoding");
+    dump_output_like( <<PIR, "pir", qr/ENCODING.*=>.*$enc/ms, "pbc_dump $enc encoding");
 .sub main :main
     \$S0 = $enc:"abc"
 .end
 PIR
-    }
 }
 
 my $longcode = ".sub main :main\n";


More information about the parrot-commits mailing list