[svn:parrot] r49201 - in trunk: src/io t/pmc
nwellnhof at svn.parrot.org
nwellnhof at svn.parrot.org
Tue Sep 21 16:38:42 UTC 2010
Author: nwellnhof
Date: Tue Sep 21 16:38:41 2010
New Revision: 49201
URL: https://trac.parrot.org/parrot/changeset/49201
Log:
[io] Fix Parrot_io_read_utf8 with 3-byte chars
Also remove unsafe use of Parrot_str_concat
Modified:
trunk/src/io/utf8.c
trunk/t/pmc/io.t
Modified: trunk/src/io/utf8.c
==============================================================================
--- trunk/src/io/utf8.c Tue Sep 21 14:31:17 2010 (r49200)
+++ trunk/src/io/utf8.c Tue Sep 21 16:38:41 2010 (r49201)
@@ -65,31 +65,32 @@
const UINTVAL c = *u8ptr;
if (UTF8_IS_START(c)) {
- UINTVAL len2 = UTF8SKIP(u8ptr);
+ UINTVAL new_bufused = iter.bytepos + UTF8SKIP(u8ptr);
+ UINTVAL len2;
INTVAL read;
- if (iter.bytepos + len2 <= s->bufused)
+ if (new_bufused <= s->bufused)
goto ok;
- /* need len - 1 more chars */
- --len2;
- s2 = Parrot_str_new_init(interp, NULL, len2,
- Parrot_utf8_encoding_ptr, 0);
- s2->bufused = len2;
-
- read = Parrot_io_read_buffer(interp, filehandle, &s2);
+ /* read additional bytes to complete UTF-8 char */
+ len2 = new_bufused - s->bufused;
+ s2 = Parrot_str_new_init(interp, NULL, len2,
+ Parrot_binary_encoding_ptr, 0);
+ s2->bufused = len2;
+ read = Parrot_io_read_buffer(interp, filehandle, &s2);
UNUSED(read);
- s->strlen = iter.charpos;
- s = Parrot_str_concat(interp, s, s2);
- *buf = s;
- len += len2 + 1;
+ Parrot_gc_reallocate_string_storage(interp, s, new_bufused);
+ mem_sys_memcopy(s->strstart + s->bufused, s2->strstart, len2);
+
+ s->bufused = new_bufused;
+ len += len2;
/* check last char */
}
}
ok:
- Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, *buf, &iter);
+ STRING_iter_get_and_advance(interp, s, &iter);
}
s->strlen = iter.charpos;
return len;
Modified: trunk/t/pmc/io.t
==============================================================================
--- trunk/t/pmc/io.t Tue Sep 21 14:31:17 2010 (r49200)
+++ trunk/t/pmc/io.t Tue Sep 21 16:38:41 2010 (r49201)
@@ -631,7 +631,7 @@
($FOO, $temp_file) = create_tempfile( UNLINK => 1 );
-print $FOO "T\xc3\xb6tsch\n";
+print $FOO "T\xc3\xb6tsch \xe2\x82\xac100\n";
close $FOO;
pir_output_is( sprintf(<<'CODE', $temp_file), <<"OUTPUT", "utf8 read enabled, read parts" );
@@ -642,6 +642,10 @@
pio.'open'(temp_file, 'r')
pio.'encoding'("utf8")
$S0 = pio.'read'(2)
+ say $S0
+ $S1 = pio.'read'(7)
+ say $S1
+ $S0 .= $S1
$S1 = pio.'read'(1024) # read the rest of the file (much shorter than 1K)
$S0 .= $S1
pio.'close'()
@@ -650,13 +654,13 @@
$S2 = encodingname $I1
say $S2
- $I1 = find_encoding 'iso-8859-1'
- trans_encoding $S1, $S0, $I1
- print $S1
+ print $S0
.end
CODE
+T\xc3\xb6
+tsch \xe2\x82\xac
utf8
-T\xf6tsch
+T\xc3\xb6tsch \xe2\x82\xac100
OUTPUT
pir_output_is( <<"CODE", <<"OUTPUT", "PIO.readall() - classmeth" );
More information about the parrot-commits mailing list