[svn:parrot] r38853 - in trunk: include/parrot src/string/charset src/string/encoding

Infinoid at svn.parrot.org Infinoid at svn.parrot.org
Sat May 16 23:28:15 UTC 2009


Author: Infinoid
Date: Sat May 16 23:28:14 2009
New Revision: 38853
URL: https://trac.parrot.org/parrot/changeset/38853

Log:
[encoding] Implement chromatic++'s idea to push the Parrot_str_find_cclass() loop down into the encoding plugin, for the ascii and iso-8859-1 cases.
This saves several million function calls worth of overhead.

Modified:
   trunk/include/parrot/encoding.h
   trunk/src/string/charset/ascii.c
   trunk/src/string/charset/iso-8859-1.c
   trunk/src/string/encoding/fixed_8.c
   trunk/src/string/encoding/ucs2.c
   trunk/src/string/encoding/utf16.c
   trunk/src/string/encoding/utf8.c

Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/include/parrot/encoding.h	Sat May 16 23:28:14 2009	(r38853)
@@ -29,6 +29,7 @@
 typedef void (*encoding_become_encoding_t)(PARROT_INTERP, STRING *src);
 typedef UINTVAL (*encoding_codepoints_t)(PARROT_INTERP, STRING *src);
 typedef UINTVAL (*encoding_bytes_t)(PARROT_INTERP, STRING *src);
+typedef UINTVAL (*encoding_find_cclass_t)(PARROT_INTERP, STRING *s, const INTVAL *typetable, INTVAL flags, UINTVAL pos, UINTVAL end);
 
 /* iterator support */
 
@@ -55,6 +56,7 @@
     encoding_codepoints_t               codepoints;
     encoding_bytes_t                    bytes;
     encoding_iter_init_t                iter_init;
+    encoding_find_cclass_t              find_cclass;
 };
 
 typedef struct _encoding ENCODING;
@@ -218,6 +220,8 @@
     ((src)->encoding)->bytes((i), (src))
 #define ENCODING_ITER_INIT(i, src, iter) \
     ((src)->encoding)->iter_init((i), (src), (iter))
+#define ENCODING_FIND_CCLASS(i, src, typetable, flags, pos, end) \
+    ((src)->encoding)->find_cclass((i), (src), (typetable), (flags), (pos), (end))
 
 #endif /* PARROT_ENCODING_H_GUARD */
 

Modified: trunk/src/string/charset/ascii.c
==============================================================================
--- trunk/src/string/charset/ascii.c	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/src/string/charset/ascii.c	Sat May 16 23:28:14 2009	(r38853)
@@ -766,6 +766,9 @@
 =item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, STRING
 *source_string, UINTVAL offset, UINTVAL count)>
 
+Find a character in the given character class.  Delegates to the find_cclass
+method of the encoding plugin.
+
 =cut
 
 */
@@ -780,13 +783,8 @@
     UINTVAL end = offset + count;
 
     end = source_string->strlen < end ? source_string->strlen : end;
-    for (; pos < end; ++pos) {
-        const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, source_string, pos);
-        if ((Parrot_ascii_typetable[codepoint] & flags) != 0) {
-            return pos;
-        }
-    }
-    return end;
+    return ENCODING_FIND_CCLASS(interp, source_string, Parrot_ascii_typetable,
+            flags, pos, end);
 }
 
 /*

Modified: trunk/src/string/charset/iso-8859-1.c
==============================================================================
--- trunk/src/string/charset/iso-8859-1.c	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/src/string/charset/iso-8859-1.c	Sat May 16 23:28:14 2009	(r38853)
@@ -591,6 +591,9 @@
 =item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, STRING
 *source_string, UINTVAL offset, UINTVAL count)>
 
+Find a character in the given character class.  Delegates to the find_cclass
+method of the encoding plugin.
+
 =cut
 
 */
@@ -602,16 +605,10 @@
     ASSERT_ARGS(find_cclass)
     UINTVAL pos = offset;
     UINTVAL end = offset + count;
-    UINTVAL codepoint;
 
     end = source_string->strlen < end ? source_string->strlen : end;
-    for (; pos < end; ++pos) {
-        codepoint = ENCODING_GET_CODEPOINT(interp, source_string, pos);
-        if ((Parrot_iso_8859_1_typetable[codepoint] & flags) != 0) {
-            return pos;
-        }
-    }
-    return end;
+    return ENCODING_FIND_CCLASS(interp, source_string,
+            Parrot_iso_8859_1_typetable, flags, pos, end);
 }
 
 /*

Modified: trunk/src/string/encoding/fixed_8.c
==============================================================================
--- trunk/src/string/encoding/fixed_8.c	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/src/string/encoding/fixed_8.c	Sat May 16 23:28:14 2009	(r38853)
@@ -34,6 +34,17 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL find_cclass(PARROT_INTERP,
+    ARGIN(STRING *s),
+    ARGIN(const INTVAL *typetable),
+    INTVAL flags,
+    UINTVAL pos,
+    UINTVAL end)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+
 static UINTVAL fixed8_get_next(PARROT_INTERP, ARGMOD(String_iter *iter))
         __attribute__nonnull__(1)
         __attribute__nonnull__(2)
@@ -163,6 +174,10 @@
 #define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(interp) \
     || PARROT_ASSERT_ARG(source_string)
+#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = \
+       PARROT_ASSERT_ARG(interp) \
+    || PARROT_ASSERT_ARG(s) \
+    || PARROT_ASSERT_ARG(typetable)
 #define ASSERT_ARGS_fixed8_get_next __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(interp) \
     || PARROT_ASSERT_ARG(iter)
@@ -278,6 +293,33 @@
     set_byte(interp, source_string, offset, codepoint);
 }
 
+
+/*
+
+=item C<static UINTVAL find_cclass(PARROT_INTERP, STRING *s, const INTVAL
+*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
+
+First index in [pos, end) whose C<typetable> entry has a C<flags> bit, else C<end>.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+find_cclass(PARROT_INTERP, ARGIN(STRING *s), ARGIN(const INTVAL *typetable),
+INTVAL flags, UINTVAL pos, UINTVAL end)
+{
+    ASSERT_ARGS(find_cclass)
+    const unsigned char * const contents = (const unsigned char *)s->strstart;
+    for (; pos < end; ++pos) { /* each byte is used directly as the table index */
+        if ((typetable[contents[pos]] & flags) != 0) {
+            return pos;
+        }
+    }
+    return end; /* no byte in [pos, end) matched */
+}
+
 /*
 
 =item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *source_string,
@@ -652,7 +694,8 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init
+        iter_init,
+        find_cclass
 
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);

Modified: trunk/src/string/encoding/ucs2.c
==============================================================================
--- trunk/src/string/encoding/ucs2.c	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/src/string/encoding/ucs2.c	Sat May 16 23:28:14 2009	(r38853)
@@ -49,6 +49,17 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL find_cclass(PARROT_INTERP,
+    ARGIN(STRING *s),
+    ARGIN(const INTVAL *typetable),
+    INTVAL flags,
+    UINTVAL pos,
+    UINTVAL end)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+
 static UINTVAL get_byte(PARROT_INTERP,
     SHIM(const STRING *src),
     SHIM(UINTVAL offset))
@@ -167,6 +178,10 @@
 #define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(interp) \
     || PARROT_ASSERT_ARG(src)
+#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = \
+       PARROT_ASSERT_ARG(interp) \
+    || PARROT_ASSERT_ARG(s) \
+    || PARROT_ASSERT_ARG(typetable)
 #define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(interp)
 #define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = \
@@ -299,6 +314,27 @@
 
 /*
 
+=item C<static UINTVAL find_cclass(PARROT_INTERP, STRING *s, const INTVAL
+*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
+
+Stub that always throws; the charset level handles this for unicode strings.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+find_cclass(PARROT_INTERP, ARGIN(STRING *s), ARGIN(const INTVAL *typetable),
+INTVAL flags, UINTVAL pos, UINTVAL end)
+{
+    ASSERT_ARGS(find_cclass)
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
+        "No find_cclass support in unicode encoding plugins");
+}
+
+/*
+
 =item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
 offset)>
 
@@ -691,7 +727,8 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init
+        iter_init,
+        find_cclass
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "ucs2", return_encoding);

Modified: trunk/src/string/encoding/utf16.c
==============================================================================
--- trunk/src/string/encoding/utf16.c	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/src/string/encoding/utf16.c	Sat May 16 23:28:14 2009	(r38853)
@@ -38,6 +38,17 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL find_cclass(PARROT_INTERP,
+    ARGIN(STRING *s),
+    ARGIN(const INTVAL *typetable),
+    INTVAL flags,
+    UINTVAL pos,
+    UINTVAL end)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+
 static UINTVAL get_byte(SHIM_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset)
@@ -164,6 +175,10 @@
 #define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(interp) \
     || PARROT_ASSERT_ARG(src)
+#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = \
+       PARROT_ASSERT_ARG(interp) \
+    || PARROT_ASSERT_ARG(s) \
+    || PARROT_ASSERT_ARG(typetable)
 #define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(src)
 #define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = \
@@ -392,6 +407,27 @@
 
 /*
 
+=item C<static UINTVAL find_cclass(PARROT_INTERP, STRING *s, const INTVAL
+*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
+
+Stub that always throws; the charset level handles this for unicode strings.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+find_cclass(PARROT_INTERP, ARGIN(STRING *s), ARGIN(const INTVAL *typetable),
+INTVAL flags, UINTVAL pos, UINTVAL end)
+{
+    ASSERT_ARGS(find_cclass)
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
+        "No find_cclass support in unicode encoding plugins");
+}
+
+/*
+
 =item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
 offset)>
 
@@ -806,7 +842,8 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init
+        iter_init,
+        find_cclass
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "utf16", return_encoding);

Modified: trunk/src/string/encoding/utf8.c
==============================================================================
--- trunk/src/string/encoding/utf8.c	Sat May 16 23:28:06 2009	(r38852)
+++ trunk/src/string/encoding/utf8.c	Sat May 16 23:28:14 2009	(r38853)
@@ -39,6 +39,17 @@
         __attribute__nonnull__(2)
         FUNC_MODIFIES(*src);
 
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL find_cclass(PARROT_INTERP,
+    ARGIN(STRING *s),
+    ARGIN(const INTVAL *typetable),
+    INTVAL flags,
+    UINTVAL pos,
+    UINTVAL end)
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(3);
+
 static UINTVAL get_byte(SHIM_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset)
@@ -181,6 +192,10 @@
 #define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(interp) \
     || PARROT_ASSERT_ARG(src)
+#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = \
+       PARROT_ASSERT_ARG(interp) \
+    || PARROT_ASSERT_ARG(s) \
+    || PARROT_ASSERT_ARG(typetable)
 #define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = \
        PARROT_ASSERT_ARG(src)
 #define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = \
@@ -683,6 +698,27 @@
 
 /*
 
+=item C<static UINTVAL find_cclass(PARROT_INTERP, STRING *s, const INTVAL
+*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
+
+Stub that always throws; the charset level handles this for unicode strings.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+find_cclass(PARROT_INTERP, ARGIN(STRING *s), ARGIN(const INTVAL *typetable),
+INTVAL flags, UINTVAL pos, UINTVAL end)
+{
+    ASSERT_ARGS(find_cclass)
+    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
+        "No find_cclass support in unicode encoding plugins");
+}
+
+/*
+
 =item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
 offset)>
 
@@ -1018,7 +1054,8 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init
+        iter_init,
+        find_cclass
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "utf8", return_encoding);


More information about the parrot-commits mailing list