[svn:parrot] r48565 - in trunk: include/parrot src/io src/pmc src/string src/string/charset src/string/encoding

chromatic at svn.parrot.org chromatic at svn.parrot.org
Thu Aug 19 05:53:14 UTC 2010

Author: chromatic
Date: Thu Aug 19 05:53:12 2010
New Revision: 48565
URL: https://trac.parrot.org/parrot/changeset/48565

[str] Revised STRING iterator interface (TT #1456)

These two patches from Nick Wellnhofer improve the algorithmic performance of
STRING iteration, especially for variable-width encodings.


Modified: trunk/include/parrot/encoding.h
--- trunk/include/parrot/encoding.h	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/include/parrot/encoding.h	Thu Aug 19 05:53:12 2010	(r48565)
@@ -30,8 +30,16 @@
 struct string_iterator_t;       /* s. parrot/string.h */
-typedef void (*encoding_iter_init_t)(PARROT_INTERP, const STRING *src,
-        struct string_iterator_t *);
+typedef UINTVAL (*encoding_iter_get_t)(
+    PARROT_INTERP, const STRING *str, const String_iter *i, INTVAL  offset);
+typedef void    (*encoding_iter_skip_t)(
+    PARROT_INTERP, const STRING *str,       String_iter *i, INTVAL  skip);
+typedef UINTVAL (*encoding_iter_get_and_advance_t)(
+    PARROT_INTERP, const STRING *str,       String_iter *i);
+typedef void    (*encoding_iter_set_and_advance_t)(
+    PARROT_INTERP,       STRING *str,       String_iter *i, UINTVAL c);
+typedef void    (*encoding_iter_set_position_t)(
+    PARROT_INTERP, const STRING *str,       String_iter *i, UINTVAL pos);
 struct _encoding {
     ARGIN(const char *name);
@@ -44,9 +52,13 @@
     encoding_get_bytes_t                get_bytes;
     encoding_codepoints_t               codepoints;
     encoding_bytes_t                    bytes;
-    encoding_iter_init_t                iter_init;
     encoding_find_cclass_t              find_cclass;
     encoding_hash_t                     hash;
+    encoding_iter_get_t                 iter_get;
+    encoding_iter_skip_t                iter_skip;
+    encoding_iter_get_and_advance_t     iter_get_and_advance;
+    encoding_iter_set_and_advance_t     iter_set_and_advance;
+    encoding_iter_set_position_t        iter_set_position;
 typedef struct _encoding ENCODING;
@@ -209,8 +221,6 @@
     ((src)->encoding)->codepoints((i), (src))
 #define ENCODING_BYTES(i, src) \
     ((src)->encoding)->bytes((i), (src))
-#define ENCODING_ITER_INIT(i, src, iter) \
-    ((src)->encoding)->iter_init((i), (src), (iter))
 #define ENCODING_FIND_CCLASS(i, src, typetable, flags, pos, end) \
     ((src)->encoding)->find_cclass((i), (src), (typetable), (flags), (pos), (end))
 #define ENCODING_HASH(i, src, seed) \

Modified: trunk/include/parrot/string.h
--- trunk/include/parrot/string.h	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/include/parrot/string.h	Thu Aug 19 05:53:12 2010	(r48565)
@@ -30,14 +30,23 @@
 /* String iterator */
 typedef struct string_iterator_t {
-    const STRING *str;
     UINTVAL bytepos;
     UINTVAL charpos;
-    UINTVAL (*get_and_advance)(PARROT_INTERP, struct string_iterator_t *i);
-    void (*set_and_advance)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL c);
-    void (*set_position)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL pos);
 } String_iter;
+#define STRING_ITER_INIT(i, iter) \
+    (iter)->charpos = (iter)->bytepos = 0
+#define STRING_ITER_GET(i, str, iter, offset) \
+    ((str)->encoding)->iter_get((i), (str), (iter), (offset))
+#define STRING_ITER_SKIP(i, str, iter, skip) \
+    ((str)->encoding)->iter_skip((i), (str), (iter), (skip))
+#define STRING_ITER_GET_AND_ADVANCE(i, str, iter) \
+    ((str)->encoding)->iter_get_and_advance((i), (str), (iter))
+#define STRING_ITER_SET_AND_ADVANCE(i, str, iter, c) \
+    ((str)->encoding)->iter_set_and_advance((i), (str), (iter), (c))
+#define STRING_ITER_SET_POSITION(i, str, iter, pos) \
+    ((str)->encoding)->iter_set_position((i), (str), (iter), (pos))
 #define STREQ(x, y)  (strcmp((x), (y))==0)
 #define STRNEQ(x, y) (strcmp((x), (y))!=0)

Modified: trunk/include/parrot/string_funcs.h
--- trunk/include/parrot/string_funcs.h	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/include/parrot/string_funcs.h	Thu Aug 19 05:53:12 2010	(r48565)
@@ -226,6 +226,31 @@
 INTVAL Parrot_str_is_null(SHIM_INTERP, ARGIN_NULLOK(const STRING *s));
+INTVAL Parrot_str_iter_index(PARROT_INTERP,
+    ARGIN(const STRING *src),
+    ARGMOD(String_iter *start),
+    ARGOUT(String_iter *end),
+    ARGIN(const STRING *search))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
+        __attribute__nonnull__(4)
+        __attribute__nonnull__(5)
+        FUNC_MODIFIES(*start)
+        FUNC_MODIFIES(*end);
+STRING * Parrot_str_iter_substr(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *l),
+    ARGIN_NULLOK(const String_iter *r))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
 STRING* Parrot_str_join(PARROT_INTERP,
@@ -559,6 +584,16 @@
        PARROT_ASSERT_ARG(interp) \
 #define ASSERT_ARGS_Parrot_str_is_null __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
+#define ASSERT_ARGS_Parrot_str_iter_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(src) \
+    , PARROT_ASSERT_ARG(start) \
+    , PARROT_ASSERT_ARG(end) \
+    , PARROT_ASSERT_ARG(search))
+#define ASSERT_ARGS_Parrot_str_iter_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
 #define ASSERT_ARGS_Parrot_str_join __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \

Modified: trunk/src/io/utf8.c
--- trunk/src/io/utf8.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/io/utf8.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -57,7 +57,7 @@
     s->encoding = Parrot_utf8_encoding_ptr;
     /* count chars, verify utf8 */
-    Parrot_utf8_encoding_ptr->iter_init(interp, s, &iter);
+    STRING_ITER_INIT(interp, &iter);
     while (iter.bytepos < s->bufused) {
         if (iter.bytepos + 4 > s->bufused) {
@@ -84,8 +84,6 @@
                 s->strlen    = iter.charpos;
                 s            = Parrot_str_concat(interp, s, s2);
-                /* String is updated. Poke into iterator to replace old string */
-                iter.str     = s;
                 *buf         = s;
                 len         += len2 + 1;
@@ -93,7 +91,7 @@
-        iter.get_and_advance(interp, &iter);
+        Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, *buf, &iter);
     s->strlen = iter.charpos;
     return len;

Modified: trunk/src/pmc/stringiterator.pmc
--- trunk/src/pmc/stringiterator.pmc	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/pmc/stringiterator.pmc	Thu Aug 19 05:53:12 2010	(r48565)
@@ -27,11 +27,9 @@
 /* HEADERIZER END: static */
 pmclass StringIterator auto_attrs extends Iterator {
-    ATTR PMC    *string;    /* String to iterate over */
-    ATTR INTVAL  pos;       /* Current position of iterator for forward iterator */
-                            /* Previous position of iterator for reverse iterator */
-    ATTR INTVAL  length;    /* Length of C<string> */
-    ATTR INTVAL  reverse;   /* Direction of iteration. 1 - for reverse iteration */
+    ATTR STRING      *str_val;   /* String to iterate over */
+    ATTR String_iter  iter;      /* String iterator */
+    ATTR INTVAL       reverse;   /* Direction of iteration. 1 - for reverse iteration */
@@ -43,10 +41,13 @@
     VTABLE void init_pmc(PMC *string) {
-        SET_ATTR_string(INTERP, SELF, string);
+        String_iter * const iter    = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING      * const str_val = VTABLE_get_string(INTERP, string);
+        SET_ATTR_str_val(INTERP, SELF, str_val);
+        STRING_ITER_INIT(INTERP, iter);
-        /* by default, iterate from start */
-        SELF.set_integer_native(ITERATE_FROM_START);
@@ -61,9 +62,10 @@
     VTABLE void mark() {
-        PMC *string;
-        GET_ATTR_string(INTERP, SELF, string);
-        Parrot_gc_mark_PMC_alive(INTERP, string);
+        STRING *str_val;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        Parrot_gc_mark_STRING_alive(INTERP, str_val);
@@ -74,15 +76,21 @@
     VTABLE PMC* clone() {
-        Parrot_StringIterator_attributes * const attrs =
-        PMC                              * const clone =
-                Parrot_pmc_new_init(INTERP, enum_class_StringIterator, attrs->string);
-        Parrot_StringIterator_attributes * const clone_attrs =
-                PARROT_STRINGITERATOR(clone);
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        PMC         *clone, *str_pmc;
+        String_iter *clone_iter;
+        STRING      *str_val;
+        INTVAL       reverse;
+        str_pmc = Parrot_pmc_new(INTERP, enum_class_String);
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        VTABLE_set_string_native(INTERP, str_pmc, str_val);
+        clone = Parrot_pmc_new_init(INTERP, enum_class_StringIterator, str_pmc);
+        clone_iter = &PARROT_STRINGITERATOR(clone)->iter;
+        *clone_iter = *iter;
+        GET_ATTR_reverse(INTERP, SELF, reverse);
+        SET_ATTR_reverse(INTERP, clone, reverse);
-        clone_attrs->pos     = attrs->pos;
-        clone_attrs->reverse = attrs->reverse;
         return clone;
@@ -111,12 +119,17 @@
     VTABLE INTVAL elements() {
-        Parrot_StringIterator_attributes * const attrs =
-        if (attrs->reverse)
-            return attrs->pos;
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val;
+        INTVAL  reverse;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        GET_ATTR_reverse(INTERP, SELF, reverse);
+        if (reverse)
+            return iter->charpos;
-            return attrs->length - attrs->pos;
+            return str_val->strlen - iter->charpos;
     VTABLE INTVAL get_integer() {
@@ -137,20 +150,19 @@
     VTABLE void set_integer_native(INTVAL value) {
-        Parrot_StringIterator_attributes * const attrs =
-        switch (value) {
-          case ITERATE_FROM_START:
-            attrs->reverse   = 0;
-            attrs->pos       = 0;
-            attrs->length    = VTABLE_elements(INTERP, attrs->string);
-            break;
-          case ITERATE_FROM_END:
-            attrs->reverse   = 1;
-            attrs->pos       = attrs->length
-                             = VTABLE_elements(INTERP, attrs->string);
-            break;
-          default:
+        STRING *str_val;
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        if (value == ITERATE_FROM_START) {
+            SET_ATTR_reverse(INTERP, SELF, 0);
+            STRING_ITER_SET_POSITION(INTERP, str_val, iter, 0);
+        }
+        else if (value == ITERATE_FROM_END) {
+            SET_ATTR_reverse(INTERP, SELF, 1);
+            STRING_ITER_SET_POSITION(INTERP, str_val, iter, str_val->strlen);
+        }
+        else {
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_INVALID_OPERATION,
                     "Wrong direction for StringIterator");
@@ -167,9 +179,13 @@
     VTABLE PMC *get_pmc() {
-        PMC *string;
-        GET_ATTR_string(INTERP, SELF, string);
-        return string ? string : PMCNULL;
+        PMC * const string = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(
+            interp, enum_class_String));
+        STRING *str_val;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        VTABLE_set_string_native(interp, string, str_val);
+        return string;
@@ -182,17 +198,20 @@
     VTABLE PMC *shift_pmc() {
-        Parrot_StringIterator_attributes * const attrs =
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
         PMC *ret;
+        STRING *str_val, *substr;
+        const String_iter old_iter = *iter;
-        if (attrs->pos >= attrs->length)
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        if (iter->charpos >= str_val->strlen)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
         ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String));
-        VTABLE_set_string_native(INTERP, ret,
-                VTABLE_get_string_keyed_int(INTERP, attrs->string, attrs->pos++));
+        STRING_ITER_SKIP(INTERP, str_val, iter, 1);
+        substr = Parrot_str_iter_substr(INTERP, str_val, &old_iter, iter);
+        VTABLE_set_string_native(INTERP, ret, substr);
         return ret;
@@ -206,14 +225,17 @@
     VTABLE STRING *shift_string() {
-        Parrot_StringIterator_attributes * const attrs =
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val;
+        const String_iter old_iter = *iter;
-        if (attrs->pos >= attrs->length)
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        if (iter->charpos >= str_val->strlen)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
-        return VTABLE_get_string_keyed_int(INTERP, attrs->string, attrs->pos++);
+        STRING_ITER_SKIP(INTERP, str_val, iter, 1);
+        return Parrot_str_iter_substr(INTERP, str_val, &old_iter, iter);
@@ -226,14 +248,15 @@
     VTABLE INTVAL shift_integer() {
-        Parrot_StringIterator_attributes * const attrs =
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val;
-        if (attrs->pos >= attrs->length)
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        if (iter->charpos >= str_val->strlen)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
-        return VTABLE_get_integer_keyed_int(INTERP, attrs->string, attrs->pos++);
+        return STRING_ITER_GET_AND_ADVANCE(INTERP, str_val, iter);
@@ -246,17 +269,21 @@
     VTABLE PMC *pop_pmc() {
-        Parrot_StringIterator_attributes * const attrs =
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val, *substr;
         PMC *ret;
+        const String_iter old_iter = *iter;
-        if (!STATICSELF.get_bool())
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        /* Shouldn't this test be (iter->charpos <= 0) ? */
+        if (SELF.elements() <= 0)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
         ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String));
-        VTABLE_set_string_native(INTERP, ret,
-                VTABLE_get_string_keyed_int(INTERP, attrs->string, --attrs->pos));
+        STRING_ITER_SKIP(INTERP, str_val, iter, -1);
+        substr = Parrot_str_iter_substr(INTERP, str_val, iter, &old_iter);
+        VTABLE_set_string_native(INTERP, ret, substr);
         return ret;
@@ -270,14 +297,18 @@
     VTABLE STRING *pop_string() {
-        Parrot_StringIterator_attributes * const attrs =
-        if (!STATICSELF.get_bool())
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val;
+        const String_iter old_iter = *iter;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        /* Shouldn't this test be (iter->charpos <= 0) ? */
+        if (SELF.elements() <= 0)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
-        return VTABLE_get_string_keyed_int(INTERP, attrs->string, --attrs->pos);
+        STRING_ITER_SKIP(INTERP, str_val, iter, -1);
+        return Parrot_str_iter_substr(INTERP, str_val, iter, &old_iter);
@@ -290,14 +321,17 @@
     VTABLE INTVAL pop_integer() {
-        Parrot_StringIterator_attributes * const attrs =
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val;
-        if (!STATICSELF.get_bool())
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        /* Shouldn't this test be (iter->charpos <= 0) ? */
+        if (SELF.elements() <= 0)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
-        return VTABLE_get_integer_keyed_int(INTERP, attrs->string, --attrs->pos);
+        STRING_ITER_SKIP(INTERP, str_val, iter, -1);
+        return STRING_ITER_GET(INTERP, str_val, iter, 0);
@@ -311,8 +345,16 @@
     VTABLE INTVAL get_integer_keyed_int(INTVAL idx) {
-        return VTABLE_get_integer_keyed_int(INTERP, STATICSELF.get_pmc(),
-                PARROT_STRINGITERATOR(SELF)->pos + idx);
+        String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter;
+        STRING *str_val;
+        const UINTVAL offset = iter->charpos + idx;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        if (offset >= str_val->strlen)
+            Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
+                "StopIteration");
+        return STRING_ITER_GET(INTERP, str_val, iter, idx);
@@ -326,8 +368,22 @@
     VTABLE STRING *get_string_keyed_int(INTVAL idx) {
-        return VTABLE_get_string_keyed_int(INTERP, STATICSELF.get_pmc(),
-                PARROT_STRINGITERATOR(SELF)->pos + idx);
+        String_iter iter = PARROT_STRINGITERATOR(SELF)->iter;
+        String_iter next_iter;
+        STRING *str_val;
+        const UINTVAL offset = iter.charpos + idx;
+        GET_ATTR_str_val(INTERP, SELF, str_val);
+        if (offset >= str_val->strlen)
+            Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
+                "StopIteration");
+        if (idx != 0)
+            STRING_ITER_SKIP(INTERP, str_val, &iter, idx);
+        next_iter = iter;
+        STRING_ITER_SKIP(INTERP, str_val, &next_iter, 1);
+        return Parrot_str_iter_substr(INTERP, str_val, &iter, &next_iter);

Modified: trunk/src/string/api.c
--- trunk/src/string/api.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/api.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -1104,6 +1104,104 @@
     return CHARSET_GET_CODEPOINTS(interp, src, true_offset, true_length);
+=item C<STRING * Parrot_str_iter_substr(PARROT_INTERP, const STRING *str, const
+String_iter *l, const String_iter *r)>
+Returns the substring between iterators C<l> and C<r>.
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *l), ARGIN_NULLOK(const String_iter *r))
+    ASSERT_ARGS(Parrot_str_iter_substr)
+    STRING *dest = Parrot_str_copy(interp, str);
+    dest->strstart = (char *)dest->strstart + l->bytepos;
+    if (r == NULL) {
+        dest->bufused = str->bufused - l->bytepos;
+        dest->strlen  = str->strlen  - l->charpos;
+    }
+    else {
+        dest->bufused = r->bytepos - l->bytepos;
+        dest->strlen  = r->charpos - l->charpos;
+    }
+    dest->hashval = 0;
+    return dest;
+=item C<INTVAL Parrot_str_iter_index(PARROT_INTERP, const STRING *src,
+String_iter *start, String_iter *end, const STRING *search)>
+Find the next occurrence of STRING C<search> in STRING C<src> starting at
+String_iter C<start>. If C<search> is found, C<start> is modified to mark the
+beginning of C<search> and String_iter C<end> is set to the character after
+C<search> in C<src>.  Returns the character position where C<search> was found,
+or -1 if it wasn't found.
+    ARGIN(const STRING *src),
+    ARGMOD(String_iter *start), ARGOUT(String_iter *end),
+    ARGIN(const STRING *search))
+    ASSERT_ARGS(Parrot_str_iter_index)
+    String_iter search_iter, search_start, next_start;
+    const UINTVAL len = search->strlen;
+    UINTVAL c0;
+    if (len == 0) {
+        *end = *start;
+        return start->charpos;
+    }
+    STRING_ITER_INIT(interp, &search_iter);
+    c0 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter);
+    search_start = search_iter;
+    next_start = *start;
+    while (start->charpos + len <= src->strlen) {
+        UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, &next_start);
+        if (c1 == c0) {
+            UINTVAL c2;
+            *end = next_start;
+            do {
+                if (search_iter.charpos >= len)
+                    return start->charpos;
+                c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, end);
+                c2 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter);
+            } while (c1 == c2);
+            search_iter = search_start;
+        }
+        *start = next_start;
+    }
+    return -1;
@@ -1145,7 +1243,7 @@
     UINTVAL         true_offset = (UINTVAL)offset;
     UINTVAL         true_length = (UINTVAL)length;
-    UINTVAL         start_byte, end_byte;
+    UINTVAL         start_byte, end_byte, start_char, end_char;
     INTVAL          buf_size;
     if (STRING_IS_NULL(src)) {
@@ -1181,13 +1279,15 @@
     /* get byte position of the part that will be replaced */
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
-    iter.set_position(interp, &iter, true_offset);
+    STRING_ITER_SET_POSITION(interp, src, &iter, true_offset);
     start_byte = iter.bytepos;
+    start_char = iter.charpos;
-    iter.set_position(interp, &iter, true_offset + true_length);
+    STRING_ITER_SKIP(interp, src, &iter, true_length);
     end_byte   = iter.bytepos;
+    end_char   = iter.charpos;
     /* not possible.... */
     if (end_byte < start_byte)
@@ -1226,7 +1326,7 @@
             (char *)src->strstart + end_byte,
             src->bufused - end_byte);
-    dest->strlen  = CHARSET_CODEPOINTS(interp, dest);
+    dest->strlen  = src->strlen - (end_char - start_char) + rep->strlen;
     dest->hashval = 0;
     return dest;
@@ -1252,7 +1352,7 @@
     STRING * const chopped = Parrot_str_copy(interp, s);
-    UINTVAL new_length, uchar_size;
+    UINTVAL new_length;
     if (n < 0) {
         new_length = -n;
@@ -1273,23 +1373,23 @@
         return chopped;
-    uchar_size      = chopped->bufused / chopped->strlen;
-    chopped->strlen = new_length;
     if (chopped->encoding == Parrot_fixed_8_encoding_ptr) {
         chopped->bufused = new_length;
     else if (chopped->encoding == Parrot_ucs2_encoding_ptr) {
+        const UINTVAL uchar_size = chopped->bufused / chopped->strlen;
         chopped->bufused = new_length * uchar_size;
     else {
         String_iter iter;
-        ENCODING_ITER_INIT(interp, s, &iter);
-        iter.set_position(interp, &iter, new_length);
+        STRING_ITER_INIT(interp, &iter);
+        STRING_ITER_SET_POSITION(interp, s, &iter, new_length);
         chopped->bufused = iter.bytepos;
+    chopped->strlen = new_length;
     return chopped;
@@ -1860,13 +1960,12 @@
         int                 sign      = 1;
         UINTVAL             i         = 0;
         String_iter         iter;
-        UINTVAL             offs;
         number_parse_state  state = parse_start;
-        ENCODING_ITER_INIT(interp, s, &iter);
+        STRING_ITER_INIT(interp, &iter);
-        for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
-            const UINTVAL c = iter.get_and_advance(interp, &iter);
+        while (state != parse_end && iter.charpos < s->strlen) {
+            const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter);
             /* Check for overflow */
             if (c > 255)
@@ -1956,17 +2055,16 @@
     int           d_length  = 0;
     int           check_nan = 0;    /* Check for NaN and Inf after main loop */
     String_iter iter;
-    UINTVAL     offs;
     number_parse_state state = parse_start;
     if (STRING_IS_NULL(s))
         return 0.0;
-    ENCODING_ITER_INIT(interp, s, &iter);
+    STRING_ITER_INIT(interp, &iter);
-    /* Handcrafter FSM to read float value */
-    for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    /* Handcrafted FSM to read float value */
+    while (state != parse_end && iter.charpos < s->strlen) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter);
         /* Check for overflow */
         if (c > 255)
@@ -2415,11 +2513,11 @@
             Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
     /* more work TODO */
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
     dp = (unsigned char *)result->strstart;
     for (i = 0; len > 0; --len) {
-        UINTVAL c = iter.get_and_advance(interp, &iter);
+        UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         if (c < 0x7f) {
             /* process ASCII chars */
             if (i >= charlen - 2) {
@@ -2559,17 +2657,17 @@
     Parrot_gc_allocate_string_storage(interp, result, reserved);
     result->bufused = reserved;
-    src->encoding->iter_init(interp, src, &itersrc);
-    encoding->iter_init(interp, result, &iterdest);
+    STRING_ITER_INIT(interp, &itersrc);
+    STRING_ITER_INIT(interp, &iterdest);
     while (itersrc.bytepos < srclen) {
-        INTVAL c = itersrc.get_and_advance(interp, &itersrc);
+        INTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
         INTVAL next;
         do {
             pending = 0;
             next = c;
             if (c == '\\') {
-                c = itersrc.get_and_advance(interp, &itersrc);
+                c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                 switch (c) {
                 /* Common one char sequences */
                 case 'a': next = '\a'; break;
@@ -2582,7 +2680,7 @@
                 case 'e': next = '\e'; break;
                 /* Escape character */
                 case 'c':
-                    c = itersrc.get_and_advance(interp, &itersrc);
+                    c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                     /* This assumes ascii-alike encoding */
                     if (c < 'A' || c > 'Z')
@@ -2590,11 +2688,11 @@
                 case 'x':
                     digcount = 0;
-                    c = itersrc.get_and_advance(interp, &itersrc);
+                    c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                     if (c == '{') {
                         /* \x{h..h} 1..8 hex digits */
                         while (itersrc.bytepos < srclen) {
-                            c = itersrc.get_and_advance(interp, &itersrc);
+                            c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                             if (c == '}')
                             if (!isxdigit(c))
@@ -2618,7 +2716,7 @@
                                 pending = 0;
-                            c = itersrc.get_and_advance(interp, &itersrc);
+                            c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                     if (digcount == 0)
@@ -2629,7 +2727,7 @@
                 case 'u':
                     /* \uhhhh 4 hex digits */
                     for (digcount = 0; digcount < 4; ++digcount) {
-                        c = itersrc.get_and_advance(interp, &itersrc);
+                        c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                         if (!isxdigit(c))
                         digbuf[digcount] = c;
@@ -2640,7 +2738,7 @@
                 case 'U':
                     /* \Uhhhhhhhh 8 hex digits */
                     for (digcount = 0; digcount < 8; ++digcount) {
-                        c = itersrc.get_and_advance(interp, &itersrc);
+                        c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                         if (!isxdigit(c))
                         digbuf[digcount] = c;
@@ -2653,7 +2751,7 @@
                     /* \ooo 1..3 oct digits */
                     digbuf[0] = c;
                     for (digcount = 1; digcount < 3; ++digcount) {
-                        c = itersrc.get_and_advance(interp, &itersrc);
+                        c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
                         if (c < '0' || c > '7')
                         digbuf[digcount] = c;
@@ -2667,7 +2765,7 @@
                     next = c;
-            iterdest.set_and_advance(interp, &iterdest, next);
+            STRING_ITER_SET_AND_ADVANCE(interp, result, &iterdest, next);
         } while (pending);
     result->bufused = iterdest.bytepos;
@@ -2748,7 +2846,7 @@
         encoding = result->encoding;
-    encoding->iter_init(interp, result, &iter);
+    STRING_ITER_INIT(interp, &iter);
     for (offs = d = 0; offs < clength; ++offs) {
         r = (Parrot_UInt4)((unsigned char *)result->strstart)[offs];
@@ -2771,7 +2869,7 @@
         PARROT_ASSERT(d < offs);
-        iter.set_and_advance(interp, &iter, r);
+        encoding->iter_set_and_advance(interp, result, &iter, r);
@@ -3269,8 +3367,10 @@
-    PMC    *res;
-    INTVAL  slen, dlen, ps, pe;
+    PMC     *res;
+    STRING  *tstr;
+    UINTVAL  slen, dlen;
+    String_iter iter;
     if (STRING_IS_NULL(delim) || STRING_IS_NULL(str))
         return PMCNULL;
@@ -3282,44 +3382,38 @@
     if (!slen)
         return res;
+    STRING_ITER_INIT(interp, &iter);
     dlen = Parrot_str_length(interp, delim);
     if (dlen == 0) {
-        int i;
         VTABLE_set_integer_native(interp, res, slen);
-        for (i = 0; i < slen; ++i) {
-            STRING * const p = Parrot_str_substr(interp, str, i, 1);
-            VTABLE_set_string_keyed_int(interp, res, i, p);
-        }
-        return res;
-    }
+        do {
+            const String_iter old_iter = iter;
-    pe = Parrot_str_find_index(interp, str, delim, 0);
+            STRING_ITER_SKIP(interp, str, &iter, 1);
+            tstr = Parrot_str_iter_substr(interp, str, &old_iter, &iter);
+            VTABLE_set_string_keyed_int(interp, res, old_iter.charpos, tstr);
+        } while (iter.charpos < slen);
-    if (pe < 0) {
-        VTABLE_push_string(interp, res, str);
         return res;
-    ps = 0;
-    while (ps <= slen) {
-        const int      pl   = pe - ps;
-        STRING * const tstr = Parrot_str_substr(interp, str, ps, pl);
-        VTABLE_push_string(interp, res, tstr);
-        ps = pe + Parrot_str_length(interp, delim);
+    do {
+        String_iter start, end;
+        INTVAL pos;
-        if (ps > slen)
+        start = iter;
+        if (Parrot_str_iter_index(interp, str, &start, &end, delim) < 0)
-        pe = Parrot_str_find_index(interp, str, delim, ps);
+        tstr = Parrot_str_iter_substr(interp, str, &iter, &start);
+        VTABLE_push_string(interp, res, tstr);
+        iter = end;
+    } while (iter.charpos < slen);
-        if (pe < 0)
-            pe = slen;
-    }
+    tstr = Parrot_str_iter_substr(interp, str, &iter, NULL);
+    VTABLE_push_string(interp, res, tstr);
     return res;

Modified: trunk/src/string/charset/ascii.c
--- trunk/src/string/charset/ascii.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/charset/ascii.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -201,7 +201,6 @@
     String_iter iter;
-    UINTVAL offs;
     unsigned char *p;
     const UINTVAL len = src->strlen;
@@ -209,9 +208,9 @@
     STRING * const dest = Parrot_str_clone(interp, src);
     p = (unsigned char *)dest->strstart;
-    ENCODING_ITER_INIT(interp, src, &iter);
-    for (offs = 0; offs < len; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    STRING_ITER_INIT(interp, &iter);
+    while (iter.charpos < len) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         if (c >= 128)
             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
                     "can't convert unicode string to ascii");
@@ -493,11 +492,10 @@
             return ret_val < 0 ? -1 : 1;
     else {
-        UINTVAL offs;
-        ENCODING_ITER_INIT(interp, rhs, &iter);
-        for (offs = 0; offs < min_len; ++offs) {
-            const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, offs);
-            const UINTVAL cr = iter.get_and_advance(interp, &iter);
+        STRING_ITER_INIT(interp, &iter);
+        while (iter.charpos < min_len) {
+            const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, iter.charpos);
+            const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &iter);
             if (cl != cr)
                 return cl < cr ? -1 : 1;
@@ -531,35 +529,12 @@
     UINTVAL offs)
+    String_iter start, end;
-    if (search->strlen <= src->strlen) {
-        String_iter src_iter, search_iter;
-        const UINTVAL maxpos = src->strlen - search->strlen + 1;
-        const UINTVAL cfirst = Parrot_str_indexed(interp, search, 0);
-        ENCODING_ITER_INIT(interp, src, &src_iter);
-        src_iter.set_position(interp, &src_iter, offs);
-        ENCODING_ITER_INIT(interp, search, &search_iter);
-        while (src_iter.charpos < maxpos) {
-            if (cfirst == src_iter.get_and_advance(interp, &src_iter)) {
-                const INTVAL next_pos = src_iter.charpos;
-                const INTVAL next_byte = src_iter.bytepos;
-                UINTVAL len;
-                search_iter.set_position(interp, &search_iter, 1);
-                for (len = search->strlen - 1; len; --len) {
-                    if ((src_iter.get_and_advance(interp, &src_iter)) !=
-                            (search_iter.get_and_advance(interp, &search_iter)))
-                        break;
-                }
-                if (len == 0)
-                    return next_pos - 1;
-                src_iter.charpos = next_pos;
-                src_iter.bytepos = next_byte;
-            }
-        }
-    }
-    return -1;
+    STRING_ITER_INIT(interp, &start);
+    STRING_ITER_SET_POSITION(interp, src, &start, offs);
+    return Parrot_str_iter_index(interp, src, &start, &end, search);
@@ -638,13 +613,12 @@
 validate(PARROT_INTERP, ARGIN(const STRING *src))
-    INTVAL      offset;
     String_iter iter;
     const INTVAL length = Parrot_str_length(interp, src);
-    ENCODING_ITER_INIT(interp, src, &iter);
-    for (offset = 0; offset < length; ++offset) {
-        const UINTVAL codepoint = iter.get_and_advance(interp, &iter);
+    STRING_ITER_INIT(interp, &iter);
+    while (iter.charpos < length) {
+        const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         if (codepoint >= 0x80)
             return 0;

Modified: trunk/src/string/charset/iso-8859-1.c
--- trunk/src/string/charset/iso-8859-1.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/charset/iso-8859-1.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -178,24 +178,24 @@
 to_iso_8859_1(PARROT_INTERP, ARGIN(const STRING *src))
-    UINTVAL offs, src_len;
+    UINTVAL src_len;
     String_iter iter;
     /* iso-8859-1 is never bigger then source */
     STRING * dest = Parrot_str_clone(interp, src);
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
     src_len = src->strlen;
     dest->bufused = src_len;
-    dest->charset = Parrot_iso_8859_1_charset_ptr;
-    dest->encoding = Parrot_fixed_8_encoding_ptr;
-    for (offs = 0; offs < src_len; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < src_len) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         if (c >= 0x100)
             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
                 "lossy conversion to iso-8559-1");
-        ENCODING_SET_BYTE(interp, dest, offs, c);
+        Parrot_fixed_8_encoding_ptr->set_byte(interp, dest, iter.charpos - 1, c);
+    dest->charset = Parrot_iso_8859_1_charset_ptr;
+    dest->encoding = Parrot_fixed_8_encoding_ptr;
     return dest;
@@ -221,18 +221,18 @@
     dest->charset = Parrot_unicode_charset_ptr;
     dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest);
     Parrot_gc_reallocate_string_storage(interp, dest, src->strlen);
-    ENCODING_ITER_INIT(interp, dest, &iter);
-    for (offs = 0; offs < src->strlen; ++offs) {
-        const UINTVAL c = ENCODING_GET_BYTE(interp, src, offs);
+    STRING_ITER_INIT(interp, &iter);
+    while (iter.charpos < src->strlen) {
+        const UINTVAL c = ENCODING_GET_BYTE(interp, src, iter.charpos);
         if (iter.bytepos >= Buffer_buflen(dest) - 4) {
-            UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
+            UINTVAL need = (UINTVAL)((src->strlen - iter.charpos) * 1.5);
             if (need < 16)
                 need = 16;
             Parrot_gc_reallocate_string_storage(interp, dest,
                     Buffer_buflen(dest) + need);
-        iter.set_and_advance(interp, &iter, c);
+        STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, c);
     dest->bufused = iter.bytepos;
     dest->strlen  = iter.charpos;

Modified: trunk/src/string/charset/unicode.c
--- trunk/src/string/charset/unicode.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/charset/unicode.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -651,20 +651,20 @@
     String_iter l_iter, r_iter;
-    UINTVAL offs, cl, cr, min_len, l_len, r_len;
+    UINTVAL min_len, l_len, r_len;
     /* TODO make optimized equal - strings are equal length then already */
-    ENCODING_ITER_INIT(interp, lhs, &l_iter);
-    ENCODING_ITER_INIT(interp, rhs, &r_iter);
+    STRING_ITER_INIT(interp, &l_iter);
+    STRING_ITER_INIT(interp, &r_iter);
     l_len = lhs->strlen;
     r_len = rhs->strlen;
     min_len = l_len > r_len ? r_len : l_len;
-    for (offs = 0; offs < min_len; ++offs) {
-        cl = l_iter.get_and_advance(interp, &l_iter);
-        cr = r_iter.get_and_advance(interp, &r_iter);
+    while (l_iter.charpos < min_len) {
+        const UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, lhs, &l_iter);
+        const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &r_iter);
         if (cl != cr)
             return cl < cr ? -1 : 1;
@@ -716,13 +716,12 @@
 validate(PARROT_INTERP, ARGIN(const STRING *src))
-    INTVAL      offset;
     String_iter iter;
     const INTVAL length = Parrot_str_length(interp, src);
-    ENCODING_ITER_INIT(interp, src, &iter);
-    for (offset = 0; offset < length; ++offset) {
-        const UINTVAL codepoint = iter.get_and_advance(interp, &iter);
+    STRING_ITER_INIT(interp, &iter);
+    while (iter.charpos < length) {
+        const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         /* Check for Unicode non-characters */
         if (codepoint >= 0xfdd0
         && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
@@ -877,24 +876,22 @@
     String_iter iter;
     UINTVAL     codepoint;
-    UINTVAL     pos = offset;
     UINTVAL     end = offset + count;
-    ENCODING_ITER_INIT(interp, src, &iter);
-    iter.set_position(interp, &iter, pos);
+    STRING_ITER_INIT(interp, &iter);
+    STRING_ITER_SET_POSITION(interp, src, &iter, offset);
     end = src->strlen < end ? src->strlen : end;
-    for (; pos < end; ++pos) {
-        codepoint = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < end) {
+        codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         if (codepoint >= 256) {
             if (u_iscclass(interp, codepoint, flags))
-                    return pos;
+                    return iter.charpos - 1;
         else {
             if (Parrot_iso_8859_1_typetable[codepoint] & flags)
-                return pos;
+                return iter.charpos - 1;
@@ -920,37 +917,36 @@
     String_iter iter;
     UINTVAL     codepoint;
-    UINTVAL     pos = offset;
     UINTVAL     end = offset + count;
     int         bit;
-    if (pos > src->strlen) {
+    if (offset > src->strlen) {
         /* XXX: Throw in this case? */
         return offset + count;
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
-    if (pos)
-        iter.set_position(interp, &iter, pos);
+    if (offset)
+        STRING_ITER_SET_POSITION(interp, src, &iter, offset);
     end = src->strlen < end ? src->strlen : end;
     if (flags == enum_cclass_any)
         return end;
-    for (; pos < end; ++pos) {
-        codepoint = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < end) {
+        codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         if (codepoint >= 256) {
             for (bit = enum_cclass_uppercase;
                     bit <= enum_cclass_word ; bit <<= 1) {
                 if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
-                    return pos;
+                    return iter.charpos - 1;
         else {
             if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
-                return pos;
+                return iter.charpos - 1;
@@ -978,8 +974,8 @@
     dest->strlen = 1;
-    ENCODING_ITER_INIT(interp, dest, &iter);
-    iter.set_and_advance(interp, &iter, codepoint);
+    STRING_ITER_INIT(interp, &iter);
+    STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, codepoint);
     dest->bufused = iter.bytepos;
     return dest;
@@ -1002,13 +998,12 @@
     String_iter iter;
-    UINTVAL     offs;
     size_t      hashval = seed;
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
-    for (offs = 0; offs < src->strlen; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < src->strlen) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
         hashval += hashval << 5;
         hashval += c;

Modified: trunk/src/string/encoding/fixed_8.c
--- trunk/src/string/encoding/fixed_8.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/encoding/fixed_8.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -41,22 +41,46 @@
-static UINTVAL fixed8_get_next(PARROT_INTERP, ARGMOD(String_iter *iter))
+static UINTVAL fixed8_iter_get(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *iter),
+    INTVAL offset)
+        __attribute__nonnull__(3);
+static UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *iter))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
-static void fixed8_set_next(PARROT_INTERP,
+static void fixed8_iter_set_and_advance(PARROT_INTERP,
+    ARGMOD(STRING *str),
     ARGMOD(String_iter *iter),
     UINTVAL c)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*str)
-static void fixed8_set_position(SHIM_INTERP,
+static void fixed8_iter_set_position(SHIM_INTERP,
+    ARGIN(const STRING *str),
     ARGMOD(String_iter *iter),
     UINTVAL pos)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*iter);
+static void fixed8_iter_skip(SHIM_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *iter),
+    INTVAL skip)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
 static size_t fixed_8_hash(SHIM_INTERP,
@@ -95,13 +119,6 @@
-static void iter_init(SHIM_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
 static void set_byte(PARROT_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset,
@@ -122,14 +139,24 @@
 #define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(s) \
     , PARROT_ASSERT_ARG(typetable))
-#define ASSERT_ARGS_fixed8_get_next __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_fixed8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+    , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
     , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed8_set_next __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_fixed8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+    , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(str) \
+    , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(str) \
     , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed8_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_fixed_8_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
 #define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -143,9 +170,6 @@
 #define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
@@ -375,85 +399,105 @@
-=item C<static UINTVAL fixed8_get_next(PARROT_INTERP, String_iter *iter)>
+=item C<static UINTVAL fixed8_iter_get(PARROT_INTERP, const STRING *str, const
+String_iter *iter, INTVAL offset)>
-Moves the string iterator C<i> to the next codepoint.
+Get the character at C<iter> plus C<offset>.
 static UINTVAL
-fixed8_get_next(PARROT_INTERP, ARGMOD(String_iter *iter))
+    ARGIN(const STRING *str), ARGIN(const String_iter *iter), INTVAL offset)
-    ASSERT_ARGS(fixed8_get_next)
-    const UINTVAL c = get_byte(interp, iter->str, iter->charpos++);
-    ++iter->bytepos;
-    return c;
+    ASSERT_ARGS(fixed8_iter_get)
+    return get_byte(interp, str, iter->charpos + offset);
-=item C<static void fixed8_set_next(PARROT_INTERP, String_iter *iter, UINTVAL
+=item C<static void fixed8_iter_skip(PARROT_INTERP, const STRING *str,
+String_iter *iter, INTVAL skip)>
-With the string iterator C<i>, appends the codepoint C<c> and advances to the
-next position in the string.
+Moves the string iterator C<iter> by C<skip> characters.
 static void
-fixed8_set_next(PARROT_INTERP, ARGMOD(String_iter *iter), UINTVAL c)
+    ARGIN(const STRING *str), ARGMOD(String_iter *iter), INTVAL skip)
-    ASSERT_ARGS(fixed8_set_next)
-    set_byte(interp, iter->str, iter->charpos++, c);
-    ++iter->bytepos;
+    ASSERT_ARGS(fixed8_iter_skip)
+    iter->bytepos += skip;
+    iter->charpos += skip;
+    PARROT_ASSERT(iter->bytepos <= Buffer_buflen(str));
-=item C<static void fixed8_set_position(PARROT_INTERP, String_iter *iter,
-UINTVAL pos)>
+=item C<static UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP, const STRING
+*str, String_iter *iter)>
-Moves the string iterator C<i> to the position C<n> in the string.
+Returns the codepoint at C<iter> and advances C<iter> to the next codepoint.
-static void
-fixed8_set_position(SHIM_INTERP, ARGMOD(String_iter *iter), UINTVAL pos)
+static UINTVAL
+    ARGIN(const STRING *str), ARGMOD(String_iter *iter))
-    ASSERT_ARGS(fixed8_set_position)
-    iter->bytepos = iter->charpos = pos;
-    PARROT_ASSERT(pos <= Buffer_buflen(iter->str));
+    ASSERT_ARGS(fixed8_iter_get_and_advance)
+    const UINTVAL c = get_byte(interp, str, iter->charpos++);
+    iter->bytepos++;
+    return c;
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
+=item C<static void fixed8_iter_set_and_advance(PARROT_INTERP, STRING *str,
+String_iter *iter, UINTVAL c)>
-Initializes for string C<src> the string iterator C<iter>.
+With the string iterator C<iter>, appends the codepoint C<c> and advances to
+the next position in the string.
 static void
-iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
+    ARGMOD(STRING *str), ARGMOD(String_iter *iter), UINTVAL c)
-    ASSERT_ARGS(iter_init)
-    iter->str             = src;
-    iter->bytepos         = iter->charpos        = 0;
-    iter->get_and_advance = fixed8_get_next;
-    iter->set_and_advance = fixed8_set_next;
-    iter->set_position    = fixed8_set_position;
+    ASSERT_ARGS(fixed8_iter_set_and_advance)
+    set_byte(interp, str, iter->charpos++, c);
+    iter->bytepos++;
+=item C<static void fixed8_iter_set_position(PARROT_INTERP, const STRING *str,
+String_iter *iter, UINTVAL pos)>
+Moves the string iterator C<iter> to the position C<pos> in the string.
+static void
+    ARGIN(const STRING *str), ARGMOD(String_iter *iter), UINTVAL pos)
+    ASSERT_ARGS(fixed8_iter_set_position)
+    iter->bytepos = iter->charpos = pos;
+    PARROT_ASSERT(pos <= Buffer_buflen(str));
@@ -509,9 +553,13 @@
-        iter_init,
-        fixed_8_hash
+        fixed_8_hash,
+        fixed8_iter_get,
+        fixed8_iter_skip,
+        fixed8_iter_get_and_advance,
+        fixed8_iter_set_and_advance,
+        fixed8_iter_set_position
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);

Modified: trunk/src/string/encoding/ucs2.c
--- trunk/src/string/encoding/ucs2.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/encoding/ucs2.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -84,14 +84,6 @@
-static void iter_init(PARROT_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
 static void set_byte(PARROT_INTERP,
     SHIM(const STRING *src),
     SHIM(UINTVAL offset),
@@ -104,26 +96,54 @@
-static UINTVAL ucs2_decode_and_advance(SHIM_INTERP, ARGMOD(String_iter *i))
+static size_t ucs2_hash(PARROT_INTERP,
+    ARGIN(const STRING *s),
+    size_t hashval)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2);
+static UINTVAL ucs2_iter_get(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *i),
+    INTVAL offset)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+static UINTVAL ucs2_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(3)
-static void ucs2_encode_and_advance(SHIM_INTERP,
+static void ucs2_iter_set_and_advance(PARROT_INTERP,
+    ARGMOD(STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL c)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*str)
-static size_t ucs2_hash(PARROT_INTERP,
-    ARGIN(const STRING *s),
-    size_t hashval)
+static void ucs2_iter_set_position(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i),
+    UINTVAL n)
-        __attribute__nonnull__(2);
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*i);
-static void ucs2_set_position(SHIM_INTERP,
+static void ucs2_iter_skip(PARROT_INTERP,
+    ARGIN(const STRING *str),
     ARGMOD(String_iter *i),
-    UINTVAL n)
+    INTVAL skip)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(3)
 #define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -145,24 +165,34 @@
 #define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
 #define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_ucs2_decode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-#define ASSERT_ARGS_ucs2_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
 #define ASSERT_ARGS_ucs2_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
-#define ASSERT_ARGS_ucs2_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_ucs2_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_ucs2_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_ucs2_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_ucs2_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_ucs2_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 /* HEADERIZER END: static */
@@ -323,11 +353,11 @@
         String_iter iter;
         UINTVAL start;
-        iter_init(interp, src, &iter);
-        iter.set_position(interp, &iter, offset);
+        STRING_ITER_INIT(interp, &iter);
+        ucs2_iter_set_position(interp, src, &iter, offset);
         start = iter.bytepos;
         return_string->strstart = (char *)return_string->strstart + start;
-        iter.set_position(interp, &iter, offset + count);
+        ucs2_iter_set_position(interp, src, &iter, offset + count);
         return_string->bufused = iter.bytepos - start;
@@ -402,7 +432,55 @@
-=item C<static UINTVAL ucs2_decode_and_advance(PARROT_INTERP, String_iter *i)>
+=item C<static UINTVAL ucs2_iter_get(PARROT_INTERP, const STRING *str, const
+String_iter *i, INTVAL offset)>
+Get the character at C<i> + C<offset>.
+static UINTVAL
+    ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
+    ASSERT_ARGS(ucs2_iter_get)
+    return get_codepoint(interp, str, i->charpos + offset);
+=item C<static void ucs2_iter_skip(PARROT_INTERP, const STRING *str, String_iter
+*i, INTVAL skip)>
+Moves the string iterator C<i> by C<skip> characters.
+static void
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
+    ASSERT_ARGS(ucs2_iter_skip)
+    UNUSED(str);
+    i->charpos += skip;
+    i->bytepos += skip * sizeof (UChar);
+    UNUSED(i);
+    UNUSED(skip);
+    no_ICU_lib(interp);
+=item C<static UINTVAL ucs2_iter_get_and_advance(PARROT_INTERP, const STRING
+*str, String_iter *i)>
 Moves the string iterator C<i> to the next UCS-2 codepoint.
@@ -411,35 +489,34 @@
 static UINTVAL
-ucs2_decode_and_advance(SHIM_INTERP, ARGMOD(String_iter *i))
+    ARGIN(const STRING *str), ARGMOD(String_iter *i))
-    ASSERT_ARGS(ucs2_decode_and_advance)
+    ASSERT_ARGS(ucs2_iter_get_and_advance)
-    const UChar * const s = (const UChar*) i->str->strstart;
+    UChar * const s = (UChar*) str->strstart;
     size_t pos = i->bytepos / sizeof (UChar);
     /* TODO either make sure that we don't go past end or use SAFE
      *      iter versions
     const UChar c = s[pos++];
-    ++i->charpos;
+    i->charpos++;
     i->bytepos = pos * sizeof (UChar);
     return c;
-    /* This function must never be called if compiled without ICU.
-     * See TT #557
-     */
+    UNUSED(str);
+    no_ICU_lib(interp);
     return (UINTVAL)0; /* Stop the static analyzers from panicing */
-=item C<static void ucs2_encode_and_advance(PARROT_INTERP, String_iter *i,
+=item C<static void ucs2_iter_set_and_advance(PARROT_INTERP, STRING *str,
+String_iter *i, UINTVAL c)>
 With the string iterator C<i>, appends the codepoint C<c> and advances to the
 next position in the string.
@@ -449,64 +526,29 @@
 static void
-ucs2_encode_and_advance(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL c)
+    ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
-    ASSERT_ARGS(ucs2_encode_and_advance)
+    ASSERT_ARGS(ucs2_iter_set_and_advance)
-    UChar    *s = (UChar*) i->str->strstart;
+    UChar * const s = (UChar*) str->strstart;
     UINTVAL pos = i->bytepos / sizeof (UChar);
-    s[pos++]    = (UChar)c;
-    ++i->charpos;
+    s[pos++] = (UChar)c;
+    i->charpos++;
     i->bytepos = pos * sizeof (UChar);
-    /* This function must never be called if compiled without ICU.
-     * See TT #557
-     */
+    UNUSED(str);
-=item C<static size_t ucs2_hash(PARROT_INTERP, const STRING *s, size_t hashval)>
-Returns the hashed value of the string, given a seed in hashval.
-static size_t
-ucs2_hash(PARROT_INTERP, ARGIN(const STRING *s), size_t hashval)
-    ASSERT_ARGS(ucs2_hash)
-    const UChar *pos = (const UChar*) s->strstart;
-    UINTVAL len = s->strlen;
-    UNUSED(interp);
-    while (len--) {
-        hashval += hashval << 5;
-        hashval += *(pos++);
-    }
-    return hashval;
-    UNUSED(s);
-    UNUSED(hashval);
-=item C<static void ucs2_set_position(PARROT_INTERP, String_iter *i, UINTVAL n)>
+=item C<static void ucs2_iter_set_position(PARROT_INTERP, const STRING *str,
+String_iter *i, UINTVAL n)>
 Moves the string iterator C<i> to the position C<n> in the string.
@@ -515,50 +557,52 @@
 static void
-ucs2_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL n)
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n)
-    ASSERT_ARGS(ucs2_set_position)
+    ASSERT_ARGS(ucs2_iter_set_position)
+    UNUSED(str);
     i->charpos = n;
     i->bytepos = n * sizeof (UChar);
-    /* This function must never be called if compiled without ICU.
-     * See TT #557
-     */
+    no_ICU_lib(interp);
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
+=item C<static size_t ucs2_hash(PARROT_INTERP, const STRING *s, size_t hashval)>
-Initializes for string C<src> the string iterator C<iter>.
+Returns the hashed value of the string, given a seed in hashval.
-static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
+static size_t
+ucs2_hash(PARROT_INTERP, ARGIN(const STRING *s), size_t hashval)
-    ASSERT_ARGS(iter_init)
+    ASSERT_ARGS(ucs2_hash)
+    const UChar *pos = (const UChar*) s->strstart;
+    UINTVAL len = s->strlen;
-    iter->str             = src;
-    iter->bytepos         = 0;
-    iter->charpos         = 0;
-    iter->get_and_advance = ucs2_decode_and_advance;
-    iter->set_and_advance = ucs2_encode_and_advance;
-    iter->set_position    = ucs2_set_position;
+    while (len--) {
+        hashval += hashval << 5;
+        hashval += *(pos++);
+    }
+    return hashval;
-    UNUSED(src);
-    UNUSED(iter);
+    UNUSED(s);
+    UNUSED(hashval);
@@ -590,9 +634,13 @@
-        iter_init,
-        ucs2_hash
+        ucs2_hash,
+        ucs2_iter_get,
+        ucs2_iter_skip,
+        ucs2_iter_get_and_advance,
+        ucs2_iter_set_and_advance,
+        ucs2_iter_set_position
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "ucs2", return_encoding);

Modified: trunk/src/string/encoding/ucs4.c
--- trunk/src/string/encoding/ucs4.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/encoding/ucs4.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -84,14 +84,6 @@
-static void iter_init(PARROT_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
 static void set_byte(PARROT_INTERP,
     SHIM(const STRING *src),
     SHIM(UINTVAL offset),
@@ -104,30 +96,54 @@
-static UINTVAL ucs4_decode_and_advance(PARROT_INTERP,
+static size_t ucs4_hash(PARROT_INTERP,
+    ARGIN(const STRING *s),
+    size_t hashval)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2);
+static UINTVAL ucs4_iter_get(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *i),
+    INTVAL offset)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+static UINTVAL ucs4_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(const STRING *str),
     ARGMOD(String_iter *i))
+        __attribute__nonnull__(3)
-static void ucs4_encode_and_advance(PARROT_INTERP,
+static void ucs4_iter_set_and_advance(PARROT_INTERP,
+    ARGMOD(STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL c)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*str)
-static size_t ucs4_hash(PARROT_INTERP,
-    ARGIN(const STRING *s),
-    size_t hashval)
+static void ucs4_iter_set_position(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i),
+    UINTVAL n)
-        __attribute__nonnull__(2);
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*i);
-static void ucs4_set_position(PARROT_INTERP,
+static void ucs4_iter_skip(PARROT_INTERP,
+    ARGIN(const STRING *str),
     ARGMOD(String_iter *i),
-    UINTVAL n)
+    INTVAL skip)
+        __attribute__nonnull__(3)
 #define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -149,26 +165,33 @@
 #define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
 #define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_ucs4_decode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_ucs4_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+#define ASSERT_ARGS_ucs4_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
-#define ASSERT_ARGS_ucs4_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_ucs4_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
-#define ASSERT_ARGS_ucs4_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_ucs4_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
-#define ASSERT_ARGS_ucs4_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_ucs4_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_ucs4_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 /* HEADERIZER END: static */
@@ -414,90 +437,115 @@
-=item C<static UINTVAL ucs4_decode_and_advance(PARROT_INTERP, String_iter *i)>
+=item C<static UINTVAL ucs4_iter_get(PARROT_INTERP, const STRING *str, const
+String_iter *i, INTVAL offset)>
-Moves the string iterator C<i> to the next UCS-4 codepoint.
+Get the character at C<i> + C<offset>.
 static UINTVAL
-ucs4_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
+    ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
+    ASSERT_ARGS(ucs4_iter_get)
+    return get_codepoint(interp, str, i->charpos + offset);
+=item C<static void ucs4_iter_skip(PARROT_INTERP, const STRING *str, String_iter
+*i, INTVAL skip)>
+Moves the string iterator C<i> by C<skip> characters.
+static void
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
-    ASSERT_ARGS(ucs4_decode_and_advance)
+    ASSERT_ARGS(ucs4_iter_skip)
+    UNUSED(str);
-    const UChar32 * const s = (const UChar32 *) i->str->strstart;
-    size_t pos              = i->bytepos / sizeof (UChar32);
-    const UChar32         c = s[pos++];
-    ++i->charpos;
-    i->bytepos = pos * sizeof (UChar32);
-    return c;
+    i->charpos += skip;
+    i->bytepos += skip * sizeof (UChar32);
+    UNUSED(skip);
-=item C<static void ucs4_encode_and_advance(PARROT_INTERP, String_iter *i,
+=item C<static UINTVAL ucs4_iter_get_and_advance(PARROT_INTERP, const STRING
+*str, String_iter *i)>
-With the string iterator C<i>, appends the codepoint C<c> and advances to the
-next position in the string.
+Moves the string iterator C<i> to the next codepoint.
-static void
-ucs4_encode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL c)
+static UINTVAL
+    ARGIN(const STRING *str), ARGMOD(String_iter *i))
-    ASSERT_ARGS(ucs4_encode_and_advance)
+    ASSERT_ARGS(ucs4_iter_get_and_advance)
-    UChar32 *s   = (UChar32 *) i->str->strstart;
-    size_t   pos = i->bytepos / sizeof (UChar32);
-    s[pos++] = (UChar32) c;
-    ++i->charpos;
-    i->bytepos = pos * sizeof (UChar32);
+    const UChar32 * const s = (const UChar32*) str->strstart;
+    const UChar32 c = s[i->charpos++];
+    i->bytepos += sizeof (UChar32);
+    return c;
+    UNUSED(str);
+    return (UINTVAL)0; /* Stop the static analyzers from panicking */
-=item C<static size_t ucs4_hash(PARROT_INTERP, const STRING *s, size_t hashval)>
+=item C<static void ucs4_iter_set_and_advance(PARROT_INTERP, STRING *str,
+String_iter *i, UINTVAL c)>
-Returns the hashed value of the string, given a seed in hashval.
+With the string iterator C<i>, appends the codepoint C<c> and advances to the
+next position in the string.
-static size_t
-ucs4_hash(PARROT_INTERP, ARGIN(const STRING *s), size_t hashval)
+static void
+    ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
-    ASSERT_ARGS(ucs4_hash)
-    const UChar32 *pos = (const UChar32*) s->strstart;
-    UINTVAL len = s->strlen;
-    UNUSED(interp);
-    while (len--) {
-        hashval += hashval << 5;
-        hashval += *(pos++);
-    }
+    ASSERT_ARGS(ucs4_iter_set_and_advance)
-    return hashval;
+    UChar32 * const s = (UChar32*) str->strstart;
+    s[i->charpos++] = (UChar32)c;
+    i->bytepos += sizeof (UChar32);
+    UNUSED(str);
+    UNUSED(i);
+    UNUSED(c);
+    no_ICU_lib(interp);
-=item C<static void ucs4_set_position(PARROT_INTERP, String_iter *i, UINTVAL n)>
+=item C<static void ucs4_iter_set_position(PARROT_INTERP, const STRING *str,
+String_iter *i, UINTVAL n)>
 Moves the string iterator C<i> to the position C<n> in the string.
@@ -506,9 +554,12 @@
 static void
-ucs4_set_position(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL n)
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n)
-    ASSERT_ARGS(ucs4_set_position)
+    ASSERT_ARGS(ucs4_iter_set_position)
+    UNUSED(str);
     i->charpos = n;
     i->bytepos = n * sizeof (UChar32);
@@ -519,36 +570,33 @@
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
+=item C<static size_t ucs4_hash(PARROT_INTERP, const STRING *s, size_t hashval)>
-Initializes for string C<src> the string iterator C<iter>.
+Returns the hashed value of the string, given a seed in hashval.
-static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
+static size_t
+ucs4_hash(PARROT_INTERP, ARGIN(const STRING *s), size_t hashval)
-    ASSERT_ARGS(iter_init)
+    ASSERT_ARGS(ucs4_hash)
+    const UChar32 *pos = (const UChar32*) s->strstart;
+    UINTVAL len = s->strlen;
-    iter->str             = src;
-    iter->bytepos         = 0;
-    iter->charpos         = 0;
-    iter->get_and_advance = ucs4_decode_and_advance;
-    iter->set_and_advance = ucs4_encode_and_advance;
-    iter->set_position    = ucs4_set_position;
-    UNUSED(src);
-    UNUSED(iter);
-    no_ICU_lib(interp);
+    while (len--) {
+        hashval += hashval << 5;
+        hashval += *(pos++);
+    }
+    return hashval;
@@ -577,13 +625,17 @@
-        iter_init,
-        ucs4_hash
+        ucs4_hash,
-        NULL
+        NULL,
+        ucs4_iter_get,
+        ucs4_iter_skip,
+        ucs4_iter_get_and_advance,
+        ucs4_iter_set_and_advance,
+        ucs4_iter_set_position
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "ucs4", return_encoding);

Modified: trunk/src/string/encoding/utf16.c
--- trunk/src/string/encoding/utf16.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/encoding/utf16.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -75,14 +75,6 @@
-static void iter_init(PARROT_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
 static void set_byte(PARROT_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset,
@@ -96,21 +88,49 @@
+static UINTVAL utf16_iter_get(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *i),
+    INTVAL offset)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
-static UINTVAL utf16_decode_and_advance(SHIM_INTERP, ARGMOD(String_iter *i))
+static UINTVAL utf16_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(3)
-static void utf16_encode_and_advance(SHIM_INTERP,
+static void utf16_iter_set_and_advance(PARROT_INTERP,
+    ARGMOD(STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL c)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*str)
-static void utf16_set_position(SHIM_INTERP,
+static void utf16_iter_set_position(PARROT_INTERP,
+    ARGIN(const STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL n)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*i);
+static void utf16_iter_skip(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i),
+    INTVAL skip)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(3)
 #define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -133,22 +153,32 @@
 #define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_utf16_decode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-#define ASSERT_ARGS_utf16_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-#define ASSERT_ARGS_utf16_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_utf16_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf16_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf16_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf16_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf16_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
 /* HEADERIZER END: static */
@@ -377,19 +407,27 @@
 get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-    String_iter iter;
-    UINTVAL start;
+    UINTVAL pos = 0, start;
+    const UChar * const s = (UChar*) src->strstart;
     STRING * const return_string = Parrot_str_copy(interp, src);
-    iter_init(interp, src, &iter);
-    iter.set_position(interp, &iter, offset);
-    start = iter.bytepos;
-    return_string->strstart = (char *)return_string->strstart + start ;
-    iter.set_position(interp, &iter, offset + count);
-    return_string->bufused = iter.bytepos - start;
+    U16_FWD_N_UNSAFE(s, pos, offset);
+    start = pos * sizeof (UChar);
+    return_string->strstart = (char *)return_string->strstart + start;
+    U16_FWD_N_UNSAFE(s, pos, count);
+    return_string->bufused = pos * sizeof (UChar) - start;
     return_string->strlen = count;
     return_string->hashval = 0;
     return return_string;
+    UNUSED(src);
+    UNUSED(offset);
+    UNUSED(count);
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+        "no ICU lib loaded");
@@ -432,15 +470,24 @@
 codepoints(PARROT_INTERP, ARGIN(const STRING *src))
-    String_iter iter;
+    const UChar * const s = (UChar*) src->strstart;
+    UINTVAL pos = 0, charpos = 0;
      * this is used to initially calculate src->strlen,
      * therefore we must scan the whole string
-    iter_init(interp, src, &iter);
-    while (iter.bytepos < src->bufused)
-        iter.get_and_advance(interp, &iter);
-    return iter.charpos;
+    while (pos * sizeof (UChar) < src->bufused) {
+        U16_FWD_1_UNSAFE(s, pos);
+        ++charpos;
+    }
+    return charpos;
+    UNUSED(src);
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+        "no ICU lib loaded");
@@ -461,106 +508,184 @@
     return src->bufused;
-=item C<static UINTVAL utf16_decode_and_advance(PARROT_INTERP, String_iter *i)>
+=item C<static UINTVAL utf16_iter_get(PARROT_INTERP, const STRING *str, const
+String_iter *i, INTVAL offset)>
-Moves the string iterator C<i> to the next UTF-16 codepoint.
+Get the character at C<i> plus C<offset>.
 static UINTVAL
-utf16_decode_and_advance(SHIM_INTERP, ARGMOD(String_iter *i))
+    ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
-    ASSERT_ARGS(utf16_decode_and_advance)
-    const UChar * const s = (const UChar*) i->str->strstart;
-    UINTVAL pos = i->bytepos / sizeof (UChar);
-    UINTVAL c;
+    ASSERT_ARGS(utf16_iter_get)
+    const UChar * const s = (UChar*) str->strstart;
+    UINTVAL c, pos;
+    pos = i->bytepos / sizeof (UChar);
+    if (offset > 0) {
+        U16_FWD_N_UNSAFE(s, pos, offset);
+    }
+    else if (offset < 0) {
+        U16_BACK_N_UNSAFE(s, pos, -offset);
+    }
+    U16_GET_UNSAFE(s, pos, c);
-    /* TODO either make sure that we don't go past end or use SAFE
-     *      iter versions
-     */
-    U16_NEXT_UNSAFE(s, pos, c);
-    ++i->charpos;
-    i->bytepos = pos * sizeof (UChar);
     return c;
+    UNUSED(str);
+    UNUSED(i);
+    UNUSED(offset);
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+        "no ICU lib loaded");
-=item C<static void utf16_encode_and_advance(PARROT_INTERP, String_iter *i,
+=item C<static void utf16_iter_skip(PARROT_INTERP, const STRING *str,
+String_iter *i, INTVAL skip)>
-With the string iterator C<i>, appends the codepoint C<c> and advances to the
-next position in the string.
+Moves the string iterator C<i> by C<skip> characters.
 static void
-utf16_encode_and_advance(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL c)
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
-    ASSERT_ARGS(utf16_encode_and_advance)
-    UChar * const s = (UChar*) i->str->strstart;
+    ASSERT_ARGS(utf16_iter_skip)
+    const UChar * const s = (UChar*) str->strstart;
     UINTVAL pos = i->bytepos / sizeof (UChar);
-    U16_APPEND_UNSAFE(s, pos, c);
-    ++i->charpos;
+    if (skip > 0) {
+        U16_FWD_N_UNSAFE(s, pos, skip);
+    }
+    else if (skip < 0) {
+        U16_BACK_N_UNSAFE(s, pos, -skip);
+    }
+    i->charpos += skip;
     i->bytepos = pos * sizeof (UChar);
+    UNUSED(str);
+    UNUSED(i);
+    UNUSED(skip);
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+        "no ICU lib loaded");
-=item C<static void utf16_set_position(PARROT_INTERP, String_iter *i, UINTVAL
+=item C<static UINTVAL utf16_iter_get_and_advance(PARROT_INTERP, const STRING
+*str, String_iter *i)>
-Moves the string iterator C<i> to the position C<n> in the string.
+Moves the string iterator C<i> to the next UTF-16 codepoint.
+static UINTVAL
+    ARGIN(const STRING *str), ARGMOD(String_iter *i))
+    ASSERT_ARGS(utf16_iter_get_and_advance)
+    const UChar * const s = (UChar*) str->strstart;
+    UINTVAL c, pos;
+    pos = i->bytepos / sizeof (UChar);
+    /* TODO either make sure that we don't go past end or use SAFE
+     *      iter versions
+     */
+    U16_NEXT_UNSAFE(s, pos, c);
+    i->charpos++;
+    i->bytepos = pos * sizeof (UChar);
+    return c;
+    UNUSED(str);
+    UNUSED(i);
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+        "no ICU lib loaded");
+=item C<static void utf16_iter_set_and_advance(PARROT_INTERP, STRING *str,
+String_iter *i, UINTVAL c)>
+With the string iterator C<i>, appends the codepoint C<c> and advances to the
+next position in the string.
 static void
-utf16_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL n)
+    ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
-    ASSERT_ARGS(utf16_set_position)
-    UChar * const s = (UChar*) i->str->strstart;
+    ASSERT_ARGS(utf16_iter_set_and_advance)
+    UChar * const s = (UChar*) str->strstart;
     UINTVAL pos;
-    pos = 0;
-    U16_FWD_N_UNSAFE(s, pos, n);
-    i->charpos = n;
+    pos = i->bytepos / sizeof (UChar);
+    U16_APPEND_UNSAFE(s, pos, c);
+    i->charpos++;
     i->bytepos = pos * sizeof (UChar);
+    UNUSED(str);
+    UNUSED(i);
+    UNUSED(c);
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+        "no ICU lib loaded");
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
+=item C<static void utf16_iter_set_position(PARROT_INTERP, const STRING *str,
+String_iter *i, UINTVAL n)>
-Initializes for string C<src> the string iterator C<iter>.
+Moves the string iterator C<i> to the position C<n> in the string.
 static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n)
-    ASSERT_ARGS(iter_init)
-    iter->str = src;
-    iter->bytepos = iter->charpos = 0;
+    ASSERT_ARGS(utf16_iter_set_position)
-    UNUSED(interp);
-    iter->get_and_advance = utf16_decode_and_advance;
-    iter->set_and_advance = utf16_encode_and_advance;
-    iter->set_position =    utf16_set_position;
+    UChar * const s = (UChar*) str->strstart;
+    UINTVAL pos;
+    pos = 0;
+    U16_FWD_N_UNSAFE(s, pos, n);
+    i->charpos = n;
+    i->bytepos = pos * sizeof (UChar);
+    UNUSED(str);
+    UNUSED(i);
+    UNUSED(n);
     Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
         "no ICU lib loaded");
@@ -593,9 +718,13 @@
-        iter_init,
-        NULL
+        NULL,
+        utf16_iter_get,
+        utf16_iter_skip,
+        utf16_iter_get_and_advance,
+        utf16_iter_set_and_advance,
+        utf16_iter_set_position
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "utf16", return_encoding);

Modified: trunk/src/string/encoding/utf8.c
--- trunk/src/string/encoding/utf8.c	Thu Aug 19 02:03:11 2010	(r48564)
+++ trunk/src/string/encoding/utf8.c	Thu Aug 19 05:53:12 2010	(r48565)
@@ -71,13 +71,6 @@
-static void iter_init(SHIM_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
 static void set_byte(PARROT_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset,
@@ -100,28 +93,51 @@
-static UINTVAL utf8_decode_and_advance(PARROT_INTERP,
-    ARGMOD(String_iter *i))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2)
-        FUNC_MODIFIES(*i);
 static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
-static void utf8_encode_and_advance(PARROT_INTERP,
+static UINTVAL utf8_iter_get(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGIN(const String_iter *i),
+    INTVAL offset)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+static UINTVAL utf8_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*i);
+static void utf8_iter_set_and_advance(PARROT_INTERP,
+    ARGMOD(STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL c)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*str)
-static void utf8_set_position(SHIM_INTERP,
+static void utf8_iter_set_position(SHIM_INTERP,
+    ARGIN(const STRING *str),
     ARGMOD(String_iter *i),
     UINTVAL pos)
+        __attribute__nonnull__(3)
+        FUNC_MODIFIES(*i);
+static void utf8_iter_skip(SHIM_INTERP,
+    ARGIN(const STRING *str),
+    ARGMOD(String_iter *i),
+    INTVAL skip)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3)
@@ -151,9 +167,6 @@
 #define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
@@ -166,17 +179,27 @@
 #define ASSERT_ARGS_utf8_decode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(ptr))
-#define ASSERT_ARGS_utf8_decode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
 #define ASSERT_ARGS_utf8_encode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(ptr))
-#define ASSERT_ARGS_utf8_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_utf8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(str) \
+#define ASSERT_ARGS_utf8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(str) \
-#define ASSERT_ARGS_utf8_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
 #define ASSERT_ARGS_utf8_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
 #define ASSERT_ARGS_utf8_skip_forward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -387,7 +410,65 @@
-=item C<static UINTVAL utf8_decode_and_advance(PARROT_INTERP, String_iter *i)>
+=item C<static UINTVAL utf8_iter_get(PARROT_INTERP, const STRING *str, const
+String_iter *i, INTVAL offset)>
+Get the character at C<i> plus C<offset>.
+static UINTVAL
+    ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
+    ASSERT_ARGS(utf8_iter_get)
+    const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+    if (offset > 0) {
+        u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, offset);
+    }
+    else if (offset < 0) {
+        u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr, -offset);
+    }
+    return utf8_decode(interp, u8ptr);
+=item C<static void utf8_iter_skip(PARROT_INTERP, const STRING *str, String_iter
+*i, INTVAL skip)>
+Moves the string iterator C<i> by C<skip> characters.
+static void
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip)
+    ASSERT_ARGS(utf8_iter_skip)
+    const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+    if (skip > 0) {
+        u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, skip);
+    }
+    else if (skip < 0) {
+        u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr, -skip);
+    }
+    i->charpos += skip;
+    i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+=item C<static UINTVAL utf8_iter_get_and_advance(PARROT_INTERP, const STRING
+*str, String_iter *i)>
 The UTF-8 implementation of the string iterator's C<get_and_advance>
@@ -397,10 +478,11 @@
 static UINTVAL
-utf8_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
+    ARGIN(const STRING *str), ARGMOD(String_iter *i))
-    ASSERT_ARGS(utf8_decode_and_advance)
-    const utf8_t *u8ptr = (utf8_t *)((char *)i->str->strstart + i->bytepos);
+    ASSERT_ARGS(utf8_iter_get_and_advance)
+    const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
     UINTVAL c = *u8ptr;
     if (UTF8_IS_START(c)) {
@@ -408,13 +490,12 @@
         c &= UTF8_START_MASK(len);
         i->bytepos += len;
-        for (--len; len; --len) {
-            ++u8ptr;
+        for (len--; len; len--) {
+            u8ptr++;
             if (!UTF8_IS_CONTINUATION(*u8ptr))
                 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                     "Malformed UTF-8 string\n");
             c = UTF8_ACCUMULATE(c, *u8ptr);
@@ -427,17 +508,17 @@
             "Malformed UTF-8 string\n");
     else {
-        ++i->bytepos;
+        i->bytepos++;
-    ++i->charpos;
+    i->charpos++;
     return c;
-=item C<static void utf8_encode_and_advance(PARROT_INTERP, String_iter *i,
+=item C<static void utf8_iter_set_and_advance(PARROT_INTERP, STRING *str,
+String_iter *i, UINTVAL c)>
 The UTF-8 implementation of the string iterator's C<set_and_advance>
@@ -447,23 +528,23 @@
 static void
-utf8_encode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL c)
+    ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c)
-    ASSERT_ARGS(utf8_encode_and_advance)
-    const STRING * const s = i->str;
-    unsigned char * const pos = (unsigned char *)s->strstart + i->bytepos;
+    ASSERT_ARGS(utf8_iter_set_and_advance)
+    unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos;
     unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
     i->bytepos += (new_pos - pos);
     /* XXX possible buffer overrun exception? */
-    PARROT_ASSERT(i->bytepos <= Buffer_buflen(s));
-    ++i->charpos;
+    PARROT_ASSERT(i->bytepos <= Buffer_buflen(str));
+    i->charpos++;
-=item C<static void utf8_set_position(PARROT_INTERP, String_iter *i, UINTVAL
+=item C<static void utf8_iter_set_position(PARROT_INTERP, const STRING *str,
+String_iter *i, UINTVAL pos)>
 The UTF-8 implementation of the string iterator's C<set_position>
@@ -473,25 +554,46 @@
 static void
-utf8_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL pos)
+    ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos)
-    ASSERT_ARGS(utf8_set_position)
-    const utf8_t *u8ptr = (const utf8_t *)i->str->strstart;
+    ASSERT_ARGS(utf8_iter_set_position)
+    const utf8_t *u8ptr = (const utf8_t *)str->strstart;
-    /* start from last known charpos, if we can */
-    if (i->charpos <= pos) {
-        const UINTVAL old_pos = pos;
-        pos       -= i->charpos;
-        u8ptr     += i->bytepos;
-        i->charpos = old_pos;
+    if (pos == 0) {
+        i->charpos = 0;
+        i->bytepos = 0;
+        return;
-    else
-        i->charpos = pos;
-    while (pos-- > 0)
-        u8ptr += UTF8SKIP(u8ptr);
+    /*
+     * we know the byte offsets of three positions: start, current and end
+     * now find the shortest way to reach pos
+     */
+    if (pos < i->charpos) {
+        if (pos <= (i->charpos >> 1)) {
+            /* go forward from start */
+            u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, pos);
+        }
+        else {
+            /* go backward from current */
+            u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + i->bytepos, i->charpos - pos);
+        }
+    }
+    else {
+        const UINTVAL  len = str->strlen;
+        if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
+            /* go forward from current */
+            u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr + i->bytepos, pos - i->charpos);
+        }
+        else {
+            /* go backward from end */
+            u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + str->bufused, len - pos);
+        }
+    }
-    i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
+    i->charpos = pos;
+    i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
@@ -513,8 +615,8 @@
     STRING *result;
-    String_iter src_iter;
-    UINTVAL offs, dest_len, dest_pos, src_len;
+    const ENCODING *src_encoding;
+    UINTVAL dest_len, dest_pos, src_len;
     unsigned char *p;
     if (src->encoding == Parrot_utf8_encoding_ptr)
@@ -523,8 +625,8 @@
     result = Parrot_gc_new_string_header(interp, 0);
     src_len = src->strlen;
-    /* init iter before possilby changing encoding */
-    ENCODING_ITER_INIT(interp, src, &src_iter);
+    /* save source encoding before possibly changing it */
+    src_encoding = src->encoding;
     result->charset  = Parrot_unicode_charset_ptr;
     result->encoding = Parrot_utf8_encoding_ptr;
     result->strlen   = src_len;
@@ -542,15 +644,17 @@
         result->bufused = dest_len;
     else {
+        String_iter src_iter;
+        STRING_ITER_INIT(interp, &src_iter);
         dest_len = src_len;
         dest_pos = 0;
-        for (offs = 0; offs < src_len; ++offs) {
-            const UINTVAL c = src_iter.get_and_advance(interp, &src_iter);
+        while (src_iter.charpos < src_len) {
+            const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter);
             unsigned char *new_pos;
             unsigned char *pos;
             if (dest_len - dest_pos < 6) {
-                UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
+                UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5);
                 if (need < 16)
                     need = 16;
                 dest_len += need;
@@ -683,16 +787,16 @@
     String_iter    iter;
     UINTVAL        start;
-    iter_init(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
     if (offset)
-        iter.set_position(interp, &iter, offset);
+        utf8_iter_set_position(interp, src, &iter, offset);
     start                   = iter.bytepos;
     return_string->strstart = (char *)return_string->strstart + start;
     if (count)
-        iter.set_position(interp, &iter, offset + count);
+        utf8_iter_set_position(interp, src, &iter, offset + count);
     return_string->bufused  = iter.bytepos - start;
     return_string->strlen   = count;
@@ -749,9 +853,9 @@
      * this is used to initially calculate src->strlen,
      * therefore we must scan the whole string
-    iter_init(interp, src, &iter);
+    STRING_ITER_INIT(interp, &iter);
     while (iter.bytepos < src->bufused)
-        iter.get_and_advance(interp, &iter);
+        utf8_iter_get_and_advance(interp, src, &iter);
     return iter.charpos;
@@ -775,29 +879,6 @@
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-Initializes for string C<src> the string iterator C<iter>.
-static void
-iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-    ASSERT_ARGS(iter_init)
-    iter->str             = src;
-    iter->bytepos         = 0;
-    iter->charpos         = 0;
-    iter->get_and_advance = utf8_decode_and_advance;
-    iter->set_and_advance = utf8_encode_and_advance;
-    iter->set_position    = utf8_set_position;
 =item C<void Parrot_encoding_utf8_init(PARROT_INTERP)>
 Initializes the UTF-8 encoding.
@@ -823,9 +904,13 @@
-        iter_init,
-        NULL
+        NULL,
+        utf8_iter_get,
+        utf8_iter_skip,
+        utf8_iter_get_and_advance,
+        utf8_iter_set_and_advance,
+        utf8_iter_set_position
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "utf8", return_encoding);

More information about the parrot-commits mailing list