[svn:parrot] r49749 - in branches/string_checks: src/io src/string src/string/encoding t/op
nwellnhof at svn.parrot.org
Sun Oct 31 15:00:38 UTC 2010
Author: nwellnhof
Date: Sun Oct 31 15:00:38 2010
New Revision: 49749
URL: https://trac.parrot.org/parrot/changeset/49749
Log:
[str] UTF-8 checks
Don't allow overlong forms. Perform all checks during initial scan.
Modified:
branches/string_checks/src/io/utf8.c
branches/string_checks/src/string/encoding/utf8.c
branches/string_checks/src/string/unicode.h
branches/string_checks/t/op/stringu.t
Modified: branches/string_checks/src/io/utf8.c
==============================================================================
--- branches/string_checks/src/io/utf8.c Sun Oct 31 15:00:04 2010 (r49748)
+++ branches/string_checks/src/io/utf8.c Sun Oct 31 15:00:38 2010 (r49749)
@@ -48,51 +48,78 @@
ARGMOD(STRING **buf))
{
ASSERT_ARGS(Parrot_io_read_utf8)
- STRING *s, *s2;
- String_iter iter;
+ size_t bytepos = 0;
+ size_t charpos = 0;
+ size_t len = Parrot_io_read_buffer(interp, filehandle, buf);
+ STRING *s = *buf;
- size_t len = Parrot_io_read_buffer(interp, filehandle, buf);
- s = *buf;
s->encoding = Parrot_utf8_encoding_ptr;
- /* count chars, verify utf8 */
- STRING_ITER_INIT(interp, &iter);
+ while (bytepos < s->bufused) {
+ utf8_t *u8ptr = (utf8_t *)(s->strstart + bytepos);
+ UINTVAL c = *u8ptr;
+ size_t utf8_len = 1;
- while (iter.bytepos < s->bufused) {
- if (iter.bytepos + 4 > s->bufused) {
- const utf8_t *u8ptr = (utf8_t *)((char *)s->strstart +
- iter.bytepos);
- const UINTVAL c = *u8ptr;
-
- if (UTF8_IS_START(c)) {
- UINTVAL new_bufused = iter.bytepos + UTF8SKIP(u8ptr);
- UINTVAL len2;
- INTVAL read;
+ if (UTF8_IS_START(c)) {
+ size_t new_bufused, count;
- if (new_bufused <= s->bufused)
- goto ok;
+ utf8_len = UTF8SKIP(u8ptr);
+ new_bufused = bytepos + utf8_len;
+ if (new_bufused > s->bufused) {
/* read additional bytes to complete UTF-8 char */
- len2 = new_bufused - s->bufused;
- s2 = Parrot_str_new_init(interp, NULL, len2,
+ size_t read;
+ size_t len2 = new_bufused - s->bufused;
+ STRING *s2 = Parrot_str_new_init(interp, NULL, len2,
Parrot_binary_encoding_ptr, 0);
+
s2->bufused = len2;
read = Parrot_io_read_buffer(interp, filehandle, &s2);
- UNUSED(read);
+
+ if (read < len2)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Unaligned end in UTF-8 string\n");
Parrot_gc_reallocate_string_storage(interp, s, new_bufused);
mem_sys_memcopy(s->strstart + s->bufused, s2->strstart, len2);
s->bufused = new_bufused;
+ u8ptr = (utf8_t *)(s->strstart + bytepos);
len += len2;
+ }
+
+ /* Check for overlong forms */
+ if (UTF8_IS_OVERLONG(c, u8ptr[1]))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Overlong form in UTF-8 string\n");
+
+ c &= UTF8_START_MASK(utf8_len);
+
+ for (count = 1; count < utf8_len; ++count) {
+ ++u8ptr;
- /* check last char */
+ if (!UTF8_IS_CONTINUATION(*u8ptr))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Malformed UTF-8 string\n");
+
+ c = UTF8_ACCUMULATE(c, *u8ptr);
}
+
+ if (UNICODE_IS_INVALID(c))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character in UTF-8 string\n");
+ }
+ else if (!UNICODE_IS_INVARIANT(c)) {
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Malformed UTF-8 string\n");
}
-ok:
- STRING_iter_get_and_advance(interp, s, &iter);
+
+ bytepos += utf8_len;
+ charpos += 1;
}
- s->strlen = iter.charpos;
+
+ s->strlen = charpos;
+
return len;
}
Modified: branches/string_checks/src/string/encoding/utf8.c
==============================================================================
--- branches/string_checks/src/string/encoding/utf8.c Sun Oct 31 15:00:04 2010 (r49748)
+++ branches/string_checks/src/string/encoding/utf8.c Sun Oct 31 15:00:38 2010 (r49749)
@@ -86,10 +86,6 @@
__attribute__nonnull__(1)
__attribute__nonnull__(2);
-static UINTVAL utf8_scan2(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static const utf8_t * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
@@ -134,9 +130,6 @@
#define ASSERT_ARGS_utf8_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_utf8_scan2 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_utf8_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(ptr))
#define ASSERT_ARGS_utf8_skip_forward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -262,41 +255,47 @@
UINTVAL characters = 0;
while (u8ptr < u8end) {
- u8ptr += UTF8SKIP(u8ptr);
- ++characters;
- }
+ UINTVAL c = *u8ptr;
- if (u8ptr > u8end)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Unaligned end in UTF-8 string\n");
+ if (UTF8_IS_START(c)) {
+ size_t len = UTF8SKIP(u8ptr);
+ size_t count;
- return characters;
-}
+ if (u8ptr + len > u8end)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Unaligned end in UTF-8 string\n");
+ /* Check for overlong forms */
+ if (UTF8_IS_OVERLONG(c, u8ptr[1]))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Overlong form in UTF-8 string\n");
-/*
+ c &= UTF8_START_MASK(len);
-=item C<static UINTVAL utf8_scan2(PARROT_INTERP, const STRING *src)>
+ for (count = 1; count < len; ++count) {
+ ++u8ptr;
-Returns the number of codepoints in string C<src>.
+ if (!UTF8_IS_CONTINUATION(*u8ptr))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Malformed UTF-8 string\n");
-=cut
+ c = UTF8_ACCUMULATE(c, *u8ptr);
+ }
-*/
+ if (UNICODE_IS_INVALID(c))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
+ "Invalid character in UTF-8 string\n");
+ }
+ else if (!UNICODE_IS_INVARIANT(c)) {
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
+ "Malformed UTF-8 string\n");
+ }
-static UINTVAL
-utf8_scan2(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(utf8_scan2)
- String_iter iter;
- /*
- * this is used to initially calculate src->strlen,
- * therefore we must scan the whole string
- */
- STRING_ITER_INIT(interp, &iter);
- while (iter.bytepos < src->bufused)
- utf8_iter_get_and_advance(interp, src, &iter);
- return iter.charpos;
+ ++u8ptr;
+ ++characters;
+ }
+
+ return characters;
}
@@ -354,20 +353,8 @@
for (count = 1; count < len; ++count) {
++u8ptr;
- if (!UTF8_IS_CONTINUATION(*u8ptr))
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Malformed UTF-8 string\n");
-
c = UTF8_ACCUMULATE(c, *u8ptr);
}
-
- if (UNICODE_IS_SURROGATE(c))
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Surrogate in UTF-8 string\n");
- }
- else if (!UNICODE_IS_INVARIANT(c)) {
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Malformed UTF-8 string\n");
}
return c;
@@ -396,10 +383,9 @@
const utf8_t * const u8ptr = (utf8_t *)ptr;
utf8_t *u8end = (utf8_t *)ptr + len - 1;
- if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) {
+ if (UNICODE_IS_INVALID(c))
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
- "Invalid character for UTF-8 encoding\n");
- }
+ "Invalid character for UTF-8 encoding\n");
while (u8end > u8ptr) {
*u8end-- =
@@ -443,8 +429,6 @@
Moves C<ptr> C<n> characters back.
-XXX This function is unused.
-
=cut
*/
@@ -485,6 +469,8 @@
ASSERT_ARGS(utf8_iter_get)
const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+ PARROT_ASSERT(i->charpos + offset < str->strlen);
+
if (offset > 0) {
u8ptr = utf8_skip_forward(u8ptr, offset);
}
@@ -514,6 +500,10 @@
ASSERT_ARGS(utf8_iter_skip)
const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
+ i->charpos += skip;
+
+ PARROT_ASSERT(i->charpos <= str->strlen);
+
if (skip > 0) {
u8ptr = utf8_skip_forward(u8ptr, skip);
}
@@ -521,8 +511,9 @@
u8ptr = utf8_skip_backward(u8ptr, -skip);
}
- i->charpos += skip;
i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
}
@@ -544,35 +535,13 @@
{
ASSERT_ARGS(utf8_iter_get_and_advance)
const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos);
- UINTVAL c = *u8ptr;
-
- if (UTF8_IS_START(c)) {
- UINTVAL len = UTF8SKIP(u8ptr);
+ UINTVAL c = utf8_decode(interp, u8ptr);
- c &= UTF8_START_MASK(len);
- i->bytepos += len;
- for (len--; len; len--) {
- u8ptr++;
+ i->charpos += 1;
+ i->bytepos += UTF8SKIP(u8ptr);
- if (!UTF8_IS_CONTINUATION(*u8ptr))
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Malformed UTF-8 string\n");
- c = UTF8_ACCUMULATE(c, *u8ptr);
- }
-
- if (UNICODE_IS_SURROGATE(c))
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Surrogate in UTF-8 string\n");
- }
- else if (!UNICODE_IS_INVARIANT(c)) {
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
- "Malformed UTF-8 string\n");
- }
- else {
- i->bytepos++;
- }
+ PARROT_ASSERT(i->bytepos <= str->bufused);
- i->charpos++;
return c;
}
@@ -597,10 +566,10 @@
unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos;
unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c);
- i->bytepos += (new_pos - pos);
- /* XXX possible buffer overrun exception? */
- PARROT_ASSERT(i->bytepos <= Buffer_buflen(str));
- i->charpos++;
+ i->charpos += 1;
+ i->bytepos += new_pos - pos;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
}
@@ -629,6 +598,8 @@
return;
}
+ PARROT_ASSERT(pos <= str->strlen);
+
/*
* we know the byte offsets of three positions: start, current and end
* now find the shortest way to reach pos
@@ -657,6 +628,8 @@
i->charpos = pos;
i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
+
+ PARROT_ASSERT(i->bytepos <= str->bufused);
}
@@ -664,7 +637,7 @@
0,
"utf8",
NULL,
- 4, /* Max bytes per codepoint */
+ UTF8_MAXLEN, /* Max bytes per codepoint */
utf8_to_encoding,
unicode_chr,
@@ -674,9 +647,9 @@
encoding_index,
encoding_rindex,
encoding_hash,
- unicode_validate,
+ encoding_validate,
- utf8_scan2,
+ utf8_scan,
utf8_ord,
encoding_substr,
Modified: branches/string_checks/src/string/unicode.h
==============================================================================
--- branches/string_checks/src/string/unicode.h Sun Oct 31 15:00:04 2010 (r49748)
+++ branches/string_checks/src/string/unicode.h Sun Oct 31 15:00:38 2010 (r49749)
@@ -33,6 +33,14 @@
#define UNICODE_IS_LOW_SURROGATE(c) ((c) >= UNICODE_LOW_SURROGATE_FIRST && \
(c) <= UNICODE_LOW_SURROGATE_LAST)
#define UNICODE_IS_INVARIANT(c) ((c) < 0x80u)
+#define UNICODE_IS_NON_CHARACTER(c) (((c) & 0xFFFEu) == 0xFFFEu || \
+ ((c) >= 0xFDD0u && (c) <= 0xFDEFu))
+#define UNICODE_IS_INVALID(c) ((c) >= UNICODE_SURROGATE_FIRST && \
+ ((c) <= 0xFDEFu ? \
+ (c) <= UNICODE_SURROGATE_LAST || \
+ (c) >= 0xFDD0u : \
+ ((c) & 0xFFFEu) == 0xFFFEu || \
+ (c) > 0x10FFFFu))
#define UNICODE_HIGH_SURROGATE(c) \
((((c) - 0x10000u) >> UNICODE_HIGH_SURROGATE_SHIFT) + UNICODE_HIGH_SURROGATE_FIRST)
@@ -43,8 +51,8 @@
((low) - UNICODE_LOW_SURROGATE_FIRST) + 0x10000u)
#define UNISKIP(uv) ((uv) < 0x80 ? 1 : \
- (uv) < 0x800 ? 2 : \
- (uv) < 0x10000 ? 3 : 4)
+ (uv) < 0x800 ? 2 : \
+ (uv) < 0x10000 ? 3 : 4)
#define UTF16SKIP(s) (UNICODE_IS_HIGH_SURROGATE(*(s)) ? 2 : 1)
@@ -64,7 +72,7 @@
*/
-#define UTF8_IS_START(c) ((c) >= 0xC0u && (c) <= 0xFDu)
+#define UTF8_IS_START(c) ((c) >= 0xC2u && (c) <= 0xF4u)
#define UTF8_IS_CONTINUATION(c) ((c) >= 0x80u && (c) <= 0xBFu)
#define UTF8_IS_CONTINUED(c) ((c) & 0x80u)
@@ -76,6 +84,9 @@
#define UTF8_CONTINUATION_MASK 0x3Fu
#define UTF8_ACCUMULATE(old, new) (((old) << UTF8_ACCUMULATION_SHIFT) | ((new) & UTF8_CONTINUATION_MASK))
+#define UTF8_IS_OVERLONG(c1, c2) (((c1) == 0xE0u && (c2) < 0xA0u) || \
+ ((c1) == 0xF0u && (c2) < 0x90u))
+
extern const char Parrot_utf8skip[256];
#define UTF8SKIP(s) Parrot_utf8skip[(int)*(s)]
Modified: branches/string_checks/t/op/stringu.t
==============================================================================
--- branches/string_checks/t/op/stringu.t Sun Oct 31 15:00:04 2010 (r49748)
+++ branches/string_checks/t/op/stringu.t Sun Oct 31 15:00:38 2010 (r49749)
@@ -6,7 +6,7 @@
use warnings;
use lib qw( . lib ../lib ../../lib );
use Test::More;
-use Parrot::Test tests => 34;
+use Parrot::Test tests => 35;
use Parrot::Config;
=head1 NAME
@@ -250,18 +250,6 @@
\xc2\xab
OUTPUT
-pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
- set S0, utf8:unicode:"\xf2\xab"
- length I0, S0
- print I0
- print "\n"
- print S0
- print "\n"
- end
-CODE
-/Malformed UTF-8 string/
-OUTPUT
-
pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
set S0, ascii:"«"
length I0, S0
@@ -608,6 +596,122 @@
3
OUT
+pir_output_is( <<'CODE', <<'OUT', 'illegal utf8 chars' );
+.sub 'main'
+ # malformed strings
+ 'test_chars'(binary:"\x41\x80\x41")
+ 'test_chars'(binary:"\x41\xBF\x41")
+ 'test_chars'(binary:"\x41\xC1\xBF")
+ 'test_chars'(binary:"\x41\xF5\xA1\xA2\xA3")
+ 'test_chars'(binary:"\x41\xFE\x41")
+
+ # unaligned end
+ 'test_chars'(binary:"\xC2")
+ 'test_chars'(binary:"\xF4")
+ 'test_chars'(binary:"\xE1\x80")
+ 'test_chars'(binary:"\xF2\xAB")
+ 'test_chars'(binary:"\xF1\x80\x80")
+
+ # overlong forms
+ 'test_chars'(binary:"\xE0\x9F\xBF") # 0x07FF
+ 'test_chars'(binary:"\xF0\x8F\xBF\xBD") # 0xFFFD
+
+ # invalid chars
+ 'test_chars'(binary:"\xED\xA0\x80") # 0xD800
+ 'test_chars'(binary:"\xED\xBF\xBF") # 0xDFFF
+ 'test_chars'(binary:"\xEF\xB7\x90") # 0xFDD0
+ 'test_chars'(binary:"\xEF\xB7\xAF") # 0xFDEF
+ 'test_chars'(binary:"\xEF\xBF\xBE") # 0xFFFE
+ 'test_chars'(binary:"\xEF\xBF\xBF") # 0xFFFF
+ 'test_chars'(binary:"\xF0\x9F\xBF\xBE") # 0x1FFFE
+ 'test_chars'(binary:"\xF4\x8F\xBF\xBF") # 0x10FFFF
+ 'test_chars'(binary:"\xF4\x90\x80\x80") # 0x110000
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc eh, ex, bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ eh = new 'ExceptionHandler'
+ set_addr eh, handler
+ push_eh eh
+ chars = bb.'get_string'('utf8')
+ say 'valid'
+ goto end
+ handler:
+ .local pmc ex
+ .get_results (ex)
+ $S0 = ex['message']
+ print $S0
+ end:
+ pop_eh
+.end
+CODE
+Malformed UTF-8 string
+Malformed UTF-8 string
+Malformed UTF-8 string
+Malformed UTF-8 string
+Malformed UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Unaligned end in UTF-8 string
+Overlong form in UTF-8 string
+Overlong form in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+Invalid character in UTF-8 string
+OUT
+
+pir_output_is( <<'CODE', <<'OUT', 'valid utf8 chars' );
+.sub 'main'
+ 'test_chars'(binary:"\xC2\x80")
+ 'test_chars'(binary:"\xE0\xA0\x80")
+ 'test_chars'(binary:"\xED\x9F\xBF")
+ 'test_chars'(binary:"\xEE\x80\x80")
+ 'test_chars'(binary:"\xEF\xB7\x8F")
+ 'test_chars'(binary:"\xEF\xB7\xB0")
+ 'test_chars'(binary:"\xEF\xBF\xBD")
+ 'test_chars'(binary:"\xF0\x90\x80\x80")
+ 'test_chars'(binary:"\xF0\x9F\xBF\xBD")
+ 'test_chars'(binary:"\xF0\xA0\x80\x80")
+ 'test_chars'(binary:"\xF4\x8F\xBF\xBD")
+.end
+
+.sub 'test_chars'
+ .param string chars
+ .local pmc bb
+ bb = new 'ByteBuffer'
+ bb = chars
+ chars = bb.'get_string'('utf8')
+ $I0 = ord chars
+ $P0 = new 'FixedIntegerArray', 1
+ $P0[0] = $I0
+ $S0 = sprintf '0x%X', $P0
+ say $S0
+.end
+CODE
+0x80
+0x800
+0xD7FF
+0xE000
+0xFDCF
+0xFDF0
+0xFFFD
+0x10000
+0x1FFFD
+0x20000
+0x10FFFD
+OUT
+
SKIP: {
skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
More information about the parrot-commits mailing list