[svn:parrot] r49749 - in branches/string_checks: src/io src/string src/string/encoding t/op

nwellnhof at svn.parrot.org nwellnhof at svn.parrot.org
Sun Oct 31 15:00:38 UTC 2010


Author: nwellnhof
Date: Sun Oct 31 15:00:38 2010
New Revision: 49749
URL: https://trac.parrot.org/parrot/changeset/49749

Log:
[str] UTF-8 checks

Don't allow overlong forms. Perform all checks during initial scan.

Modified:
   branches/string_checks/src/io/utf8.c
   branches/string_checks/src/string/encoding/utf8.c
   branches/string_checks/src/string/unicode.h
   branches/string_checks/t/op/stringu.t

Modified: branches/string_checks/src/io/utf8.c
==============================================================================
--- branches/string_checks/src/io/utf8.c	Sun Oct 31 15:00:04 2010	(r49748)
+++ branches/string_checks/src/io/utf8.c	Sun Oct 31 15:00:38 2010	(r49749)
@@ -48,51 +48,78 @@
         ARGMOD(STRING **buf))
 {
     ASSERT_ARGS(Parrot_io_read_utf8)
-    STRING *s, *s2;
-    String_iter iter;
+    size_t  bytepos = 0;
+    size_t  charpos = 0;
+    size_t  len     = Parrot_io_read_buffer(interp, filehandle, buf);
+    STRING *s       = *buf;
 
-    size_t len  = Parrot_io_read_buffer(interp, filehandle, buf);
-    s           = *buf;
     s->encoding = Parrot_utf8_encoding_ptr;
 
-    /* count chars, verify utf8 */
-    STRING_ITER_INIT(interp, &iter);
+    while (bytepos < s->bufused) {
+        utf8_t  *u8ptr    = (utf8_t *)(s->strstart + bytepos);
+        UINTVAL  c        = *u8ptr;
+        size_t   utf8_len = 1;
 
-    while (iter.bytepos < s->bufused) {
-        if (iter.bytepos + 4 > s->bufused) {
-            const utf8_t *u8ptr = (utf8_t *)((char *)s->strstart +
-                    iter.bytepos);
-            const UINTVAL c = *u8ptr;
-
-            if (UTF8_IS_START(c)) {
-                UINTVAL new_bufused = iter.bytepos + UTF8SKIP(u8ptr);
-                UINTVAL len2;
-                INTVAL  read;
+        if (UTF8_IS_START(c)) {
+            size_t new_bufused, count;
 
-                if (new_bufused <= s->bufused)
-                    goto ok;
+            utf8_len    = UTF8SKIP(u8ptr);
+            new_bufused = bytepos + utf8_len;
 
+            if (new_bufused > s->bufused) {
                 /* read additional bytes to complete UTF-8 char */
-                len2        = new_bufused - s->bufused;
-                s2          = Parrot_str_new_init(interp, NULL, len2,
+                size_t  read;
+                size_t  len2 = new_bufused - s->bufused;
+                STRING *s2   = Parrot_str_new_init(interp, NULL, len2,
                                     Parrot_binary_encoding_ptr, 0);
+
                 s2->bufused = len2;
                 read        = Parrot_io_read_buffer(interp, filehandle, &s2);
-                UNUSED(read);
+
+                if (read < len2)
+                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                        "Unaligned end in UTF-8 string\n");
 
                 Parrot_gc_reallocate_string_storage(interp, s, new_bufused);
                 mem_sys_memcopy(s->strstart + s->bufused, s2->strstart, len2);
 
                 s->bufused  = new_bufused;
+                u8ptr       = (utf8_t *)(s->strstart + bytepos);
                 len        += len2;
+            }
+
+            /* Check for overlong forms */
+            if (UTF8_IS_OVERLONG(c, u8ptr[1]))
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                    "Overlong form in UTF-8 string\n");
+
+            c &= UTF8_START_MASK(utf8_len);
+
+            for (count = 1; count < utf8_len; ++count) {
+                ++u8ptr;
 
-                /* check last char */
+                if (!UTF8_IS_CONTINUATION(*u8ptr))
+                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                        "Malformed UTF-8 string\n");
+
+                c = UTF8_ACCUMULATE(c, *u8ptr);
             }
+
+            if (UNICODE_IS_INVALID(c))
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                    "Invalid character in UTF-8 string\n");
+        }
+        else if (!UNICODE_IS_INVARIANT(c)) {
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                "Malformed UTF-8 string\n");
         }
-ok:
-        STRING_iter_get_and_advance(interp, s, &iter);
+
+        bytepos += utf8_len;
+        charpos += 1;
     }
-    s->strlen = iter.charpos;
+
+    s->strlen = charpos;
+
     return len;
 }
 

Modified: branches/string_checks/src/string/encoding/utf8.c
==============================================================================
--- branches/string_checks/src/string/encoding/utf8.c	Sun Oct 31 15:00:04 2010	(r49748)
+++ branches/string_checks/src/string/encoding/utf8.c	Sun Oct 31 15:00:38 2010	(r49749)
@@ -86,10 +86,6 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
-static UINTVAL utf8_scan2(PARROT_INTERP, ARGIN(const STRING *src))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2);
-
 PARROT_WARN_UNUSED_RESULT
 PARROT_CANNOT_RETURN_NULL
 static const utf8_t * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
@@ -134,9 +130,6 @@
 #define ASSERT_ARGS_utf8_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_utf8_scan2 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_utf8_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(ptr))
 #define ASSERT_ARGS_utf8_skip_forward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -262,41 +255,47 @@
     UINTVAL characters = 0;
 
     while (u8ptr < u8end) {
-        u8ptr += UTF8SKIP(u8ptr);
-        ++characters;
-    }
+        UINTVAL c = *u8ptr;
 
-    if (u8ptr > u8end)
-        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-            "Unaligned end in UTF-8 string\n");
+        if (UTF8_IS_START(c)) {
+            size_t len = UTF8SKIP(u8ptr);
+            size_t count;
 
-    return characters;
-}
+            if (u8ptr + len > u8end)
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                    "Unaligned end in UTF-8 string\n");
 
+            /* Check for overlong forms */
+            if (UTF8_IS_OVERLONG(c, u8ptr[1]))
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                    "Overlong form in UTF-8 string\n");
 
-/*
+            c &= UTF8_START_MASK(len);
 
-=item C<static UINTVAL utf8_scan2(PARROT_INTERP, const STRING *src)>
+            for (count = 1; count < len; ++count) {
+                ++u8ptr;
 
-Returns the number of codepoints in string C<src>.
+                if (!UTF8_IS_CONTINUATION(*u8ptr))
+                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                        "Malformed UTF-8 string\n");
 
-=cut
+                c = UTF8_ACCUMULATE(c, *u8ptr);
+            }
 
-*/
+            if (UNICODE_IS_INVALID(c))
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+                    "Invalid character in UTF-8 string\n");
+        }
+        else if (!UNICODE_IS_INVARIANT(c)) {
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+                "Malformed UTF-8 string\n");
+        }
 
-static UINTVAL
-utf8_scan2(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(utf8_scan2)
-    String_iter iter;
-    /*
-     * this is used to initially calculate src->strlen,
-     * therefore we must scan the whole string
-     */
-    STRING_ITER_INIT(interp, &iter);
-    while (iter.bytepos < src->bufused)
-        utf8_iter_get_and_advance(interp, src, &iter);
-    return iter.charpos;
+        ++u8ptr;
+        ++characters;
+    }
+
+    return characters;
 }
 
 
@@ -354,20 +353,8 @@
         for (count = 1; count < len; ++count) {
             ++u8ptr;
 
-            if (!UTF8_IS_CONTINUATION(*u8ptr))
-                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-                    "Malformed UTF-8 string\n");
-
             c = UTF8_ACCUMULATE(c, *u8ptr);
         }
-
-        if (UNICODE_IS_SURROGATE(c))
-            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-                "Surrogate in UTF-8 string\n");
-    }
-    else if (!UNICODE_IS_INVARIANT(c)) {
-        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-            "Malformed UTF-8 string\n");
     }
 
     return c;
@@ -396,10 +383,9 @@
     const utf8_t * const u8ptr = (utf8_t *)ptr;
     utf8_t              *u8end = (utf8_t *)ptr + len - 1;
 
-    if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) {
+    if (UNICODE_IS_INVALID(c))
         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
-                           "Invalid character for UTF-8 encoding\n");
-    }
+                "Invalid character for UTF-8 encoding\n");
 
     while (u8end > u8ptr) {
         *u8end-- =
@@ -443,8 +429,6 @@
 
 Moves C<ptr> C<n> characters back.
 
-XXX This function is unused.
-
 =cut
 
 */
@@ -485,6 +469,8 @@
     ASSERT_ARGS(utf8_iter_get)
     const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
 
+    PARROT_ASSERT(i->charpos + offset < str->strlen);
+
     if (offset > 0) {
         u8ptr = utf8_skip_forward(u8ptr, offset);
     }
@@ -514,6 +500,10 @@
     ASSERT_ARGS(utf8_iter_skip)
     const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
 
+    i->charpos += skip;
+
+    PARROT_ASSERT(i->charpos <= str->strlen);
+
     if (skip > 0) {
         u8ptr = utf8_skip_forward(u8ptr, skip);
     }
@@ -521,8 +511,9 @@
         u8ptr = utf8_skip_backward(u8ptr, -skip);
     }
 
-    i->charpos += skip;
     i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 }
 
 
@@ -544,35 +535,13 @@
 {
     ASSERT_ARGS(utf8_iter_get_and_advance)
     const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
-    UINTVAL c = *u8ptr;
-
-    if (UTF8_IS_START(c)) {
-        UINTVAL len = UTF8SKIP(u8ptr);
+    UINTVAL c = utf8_decode(interp, u8ptr);
 
-        c &= UTF8_START_MASK(len);
-        i->bytepos += len;
-        for (len--; len; len--) {
-            u8ptr++;
+    i->charpos += 1;
+    i->bytepos += UTF8SKIP(u8ptr);
 
-            if (!UTF8_IS_CONTINUATION(*u8ptr))
-                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-                    "Malformed UTF-8 string\n");
-            c = UTF8_ACCUMULATE(c, *u8ptr);
-        }
-
-        if (UNICODE_IS_SURROGATE(c))
-            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-                "Surrogate in UTF-8 string\n");
-    }
-    else if (!UNICODE_IS_INVARIANT(c)) {
-        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
-            "Malformed UTF-8 string\n");
-    }
-    else {
-        i->bytepos++;
-    }
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 
-    i->charpos++;
     return c;
 }
 
@@ -597,10 +566,10 @@
     unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos;
     unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
 
-    i->bytepos += (new_pos - pos);
-    /* XXX possible buffer overrun exception? */
-    PARROT_ASSERT(i->bytepos <= Buffer_buflen(str));
-    i->charpos++;
+    i->charpos += 1;
+    i->bytepos += new_pos - pos;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 }
 
 
@@ -629,6 +598,8 @@
         return;
     }
 
+    PARROT_ASSERT(pos <= str->strlen);
+
     /*
      * we know the byte offsets of three positions: start, current and end
      * now find the shortest way to reach pos
@@ -657,6 +628,8 @@
 
     i->charpos = pos;
     i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+
+    PARROT_ASSERT(i->bytepos <= str->bufused);
 }
 
 
@@ -664,7 +637,7 @@
     0,
     "utf8",
     NULL,
-    4, /* Max bytes per codepoint */
+    UTF8_MAXLEN, /* Max bytes per codepoint */
 
     utf8_to_encoding,
     unicode_chr,
@@ -674,9 +647,9 @@
     encoding_index,
     encoding_rindex,
     encoding_hash,
-    unicode_validate,
+    encoding_validate,
 
-    utf8_scan2,
+    utf8_scan,
     utf8_ord,
     encoding_substr,
 

Modified: branches/string_checks/src/string/unicode.h
==============================================================================
--- branches/string_checks/src/string/unicode.h	Sun Oct 31 15:00:04 2010	(r49748)
+++ branches/string_checks/src/string/unicode.h	Sun Oct 31 15:00:38 2010	(r49749)
@@ -33,6 +33,14 @@
 #define UNICODE_IS_LOW_SURROGATE(c)    ((c) >= UNICODE_LOW_SURROGATE_FIRST && \
                                         (c) <= UNICODE_LOW_SURROGATE_LAST)
 #define UNICODE_IS_INVARIANT(c)        ((c) <  0x80u)
+#define UNICODE_IS_NON_CHARACTER(c)   (((c) &  0xFFFEu) == 0xFFFEu || \
+                                       ((c) >= 0xFDD0u && (c) <= 0xFDEFu))
+#define UNICODE_IS_INVALID(c)          ((c) >= UNICODE_SURROGATE_FIRST && \
+                                       ((c) <= 0xFDEFu ? \
+                                        (c) <= UNICODE_SURROGATE_LAST || \
+                                        (c) >= 0xFDD0u : \
+                                       ((c) &  0xFFFEu) == 0xFFFEu || \
+                                        (c) >  0x10FFFFu))
 
 #define UNICODE_HIGH_SURROGATE(c) \
   ((((c) - 0x10000u) >> UNICODE_HIGH_SURROGATE_SHIFT) + UNICODE_HIGH_SURROGATE_FIRST)
@@ -43,8 +51,8 @@
     ((low) - UNICODE_LOW_SURROGATE_FIRST) + 0x10000u)
 
 #define UNISKIP(uv) ((uv) < 0x80    ? 1 : \
-                      (uv) < 0x800   ? 2 : \
-                      (uv) < 0x10000 ? 3 : 4)
+                     (uv) < 0x800   ? 2 : \
+                     (uv) < 0x10000 ? 3 : 4)
 
 #define UTF16SKIP(s) (UNICODE_IS_HIGH_SURROGATE(*(s)) ? 2 : 1)
 
@@ -64,7 +72,7 @@
 
  */
 
-#define UTF8_IS_START(c)                ((c) >= 0xC0u && (c) <= 0xFDu)
+#define UTF8_IS_START(c)                ((c) >= 0xC2u && (c) <= 0xF4u)
 #define UTF8_IS_CONTINUATION(c)         ((c) >= 0x80u && (c) <= 0xBFu)
 #define UTF8_IS_CONTINUED(c)            ((c) &  0x80u)
 
@@ -76,6 +84,9 @@
 #define UTF8_CONTINUATION_MASK           0x3Fu
 #define UTF8_ACCUMULATE(old, new)       (((old) << UTF8_ACCUMULATION_SHIFT) | ((new) & UTF8_CONTINUATION_MASK))
 
+#define UTF8_IS_OVERLONG(c1, c2)       (((c1) == 0xE0u && (c2) < 0xA0u) || \
+                                        ((c1) == 0xF0u && (c2) < 0x90u))
+
 extern const char Parrot_utf8skip[256];
 
 #define UTF8SKIP(s) Parrot_utf8skip[(int)*(s)]

Modified: branches/string_checks/t/op/stringu.t
==============================================================================
--- branches/string_checks/t/op/stringu.t	Sun Oct 31 15:00:04 2010	(r49748)
+++ branches/string_checks/t/op/stringu.t	Sun Oct 31 15:00:38 2010	(r49749)
@@ -6,7 +6,7 @@
 use warnings;
 use lib qw( . lib ../lib ../../lib );
 use Test::More;
-use Parrot::Test tests => 34;
+use Parrot::Test tests => 35;
 use Parrot::Config;
 
 =head1 NAME
@@ -250,18 +250,6 @@
 \xc2\xab
 OUTPUT
 
-pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
-    set S0, utf8:unicode:"\xf2\xab"
-    length I0, S0
-    print I0
-    print "\n"
-    print S0
-    print "\n"
-    end
-CODE
-/Malformed UTF-8 string/
-OUTPUT
-
 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
     set S0, ascii:"«"
     length I0, S0
@@ -608,6 +596,122 @@
 3
 OUT
 
+pir_output_is( <<'CODE', <<'OUT', 'illegal utf8 chars' );
+.sub 'main'
+    # malformed strings
+    'test_chars'(binary:"\x41\x80\x41")
+    'test_chars'(binary:"\x41\xBF\x41")
+    'test_chars'(binary:"\x41\xC1\xBF")
+    'test_chars'(binary:"\x41\xF5\xA1\xA2\xA3")
+    'test_chars'(binary:"\x41\xFE\x41")
+
+    # unaligned end
+    'test_chars'(binary:"\xC2")
+    'test_chars'(binary:"\xF4")
+    'test_chars'(binary:"\xE1\x80")
+    'test_chars'(binary:"\xF2\xAB")
+    'test_chars'(binary:"\xF1\x80\x80")
+
+    # overlong forms
+    'test_chars'(binary:"\xE0\x9F\xBF")         # 0x07FF
+    'test_chars'(binary:"\xF0\x8F\xBF\xBD")     # 0xFFFD
+
+    # invalid chars
+    'test_chars'(binary:"\xED\xA0\x80")         # 0xD800
+    'test_chars'(binary:"\xED\xBF\xBF")         # 0xDFFF
+    'test_chars'(binary:"\xEF\xB7\x90")         # 0xFDD0
+    'test_chars'(binary:"\xEF\xB7\xAF")         # 0xFDEF
+    'test_chars'(binary:"\xEF\xBF\xBE")         # 0xFFFE
+    'test_chars'(binary:"\xEF\xBF\xBF")         # 0xFFFF
+    'test_chars'(binary:"\xF0\x9F\xBF\xBE")     # 0x1FFFE
+    'test_chars'(binary:"\xF4\x8F\xBF\xBF")     # 0x10FFFF
+    'test_chars'(binary:"\xF4\x90\x80\x80")     # 0x110000
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc eh, ex, bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    eh = new 'ExceptionHandler'
+    set_addr eh, handler
+    push_eh eh
+    chars = bb.'get_string'('utf8')
+    say 'valid'
+    goto end
+  handler:
+    .local pmc ex
+    .get_results (ex)
+    $S0 = ex['message']
+    print $S0
+  end:
+    pop_eh
+.end
+CODE
+Malformed UTF-8 string
+Malformed UTF-8 string
+Malformed UTF-8 string
+Malformed UTF-8 string
+Malformed UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Overlong form in UTF-8 string
+Overlong form in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+OUT
+
+pir_output_is( <<'CODE', <<'OUT', 'valid utf8 chars' );
+.sub 'main'
+    'test_chars'(binary:"\xC2\x80")
+    'test_chars'(binary:"\xE0\xA0\x80")
+    'test_chars'(binary:"\xED\x9F\xBF")
+    'test_chars'(binary:"\xEE\x80\x80")
+    'test_chars'(binary:"\xEF\xB7\x8F")
+    'test_chars'(binary:"\xEF\xB7\xB0")
+    'test_chars'(binary:"\xEF\xBF\xBD")
+    'test_chars'(binary:"\xF0\x90\x80\x80")
+    'test_chars'(binary:"\xF0\x9F\xBF\xBD")
+    'test_chars'(binary:"\xF0\xA0\x80\x80")
+    'test_chars'(binary:"\xF4\x8F\xBF\xBD")
+.end
+
+.sub 'test_chars'
+    .param string chars
+    .local pmc bb
+    bb = new 'ByteBuffer'
+    bb = chars
+    chars = bb.'get_string'('utf8')
+    $I0 = ord chars
+    $P0 = new 'FixedIntegerArray', 1
+    $P0[0] = $I0
+    $S0 = sprintf '0x%X', $P0
+    say $S0
+.end
+CODE
+0x80
+0x800
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+0x10000
+0x1FFFD
+0x20000
+0x10FFFD
+OUT
+
 SKIP: {
     skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
 


More information about the parrot-commits mailing list