[svn:parrot] r49201 - in trunk: src/io t/pmc

nwellnhof at svn.parrot.org nwellnhof at svn.parrot.org
Tue Sep 21 16:38:42 UTC 2010


Author: nwellnhof
Date: Tue Sep 21 16:38:41 2010
New Revision: 49201
URL: https://trac.parrot.org/parrot/changeset/49201

Log:
[io] Fix Parrot_io_read_utf8 with 3-byte chars

Also remove unsafe use of Parrot_str_concat

Modified:
   trunk/src/io/utf8.c
   trunk/t/pmc/io.t

Modified: trunk/src/io/utf8.c
==============================================================================
--- trunk/src/io/utf8.c	Tue Sep 21 14:31:17 2010	(r49200)
+++ trunk/src/io/utf8.c	Tue Sep 21 16:38:41 2010	(r49201)
@@ -65,31 +65,32 @@
             const UINTVAL c = *u8ptr;
 
             if (UTF8_IS_START(c)) {
-                UINTVAL len2 = UTF8SKIP(u8ptr);
+                UINTVAL new_bufused = iter.bytepos + UTF8SKIP(u8ptr);
+                UINTVAL len2;
                 INTVAL  read;
 
-                if (iter.bytepos + len2 <= s->bufused)
+                if (new_bufused <= s->bufused)
                     goto ok;
 
-                /* need len - 1 more chars */
-                --len2;
-                s2 = Parrot_str_new_init(interp, NULL, len2,
-                        Parrot_utf8_encoding_ptr, 0);
-                s2->bufused  = len2;
-
-                read = Parrot_io_read_buffer(interp, filehandle, &s2);
+                /* read additional bytes to complete UTF-8 char */
+                len2        = new_bufused - s->bufused;
+                s2          = Parrot_str_new_init(interp, NULL, len2,
+                                    Parrot_binary_encoding_ptr, 0);
+                s2->bufused = len2;
+                read        = Parrot_io_read_buffer(interp, filehandle, &s2);
                 UNUSED(read);
 
-                s->strlen    = iter.charpos;
-                s            = Parrot_str_concat(interp, s, s2);
-                *buf         = s;
-                len         += len2 + 1;
+                Parrot_gc_reallocate_string_storage(interp, s, new_bufused);
+                mem_sys_memcopy(s->strstart + s->bufused, s2->strstart, len2);
+
+                s->bufused  = new_bufused;
+                len        += len2;
 
                 /* check last char */
             }
         }
 ok:
-        Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, *buf, &iter);
+        STRING_iter_get_and_advance(interp, s, &iter);
     }
     s->strlen = iter.charpos;
     return len;

Modified: trunk/t/pmc/io.t
==============================================================================
--- trunk/t/pmc/io.t	Tue Sep 21 14:31:17 2010	(r49200)
+++ trunk/t/pmc/io.t	Tue Sep 21 16:38:41 2010	(r49201)
@@ -631,7 +631,7 @@
 
 ($FOO, $temp_file) = create_tempfile( UNLINK => 1 );
 
-print $FOO "T\xc3\xb6tsch\n";
+print $FOO "T\xc3\xb6tsch \xe2\x82\xac100\n";
 close $FOO;
 
 pir_output_is( sprintf(<<'CODE', $temp_file), <<"OUTPUT", "utf8 read enabled, read parts" );
@@ -642,6 +642,10 @@
     pio.'open'(temp_file, 'r')
     pio.'encoding'("utf8")
     $S0 = pio.'read'(2)
+    say $S0
+    $S1 = pio.'read'(7)
+    say $S1
+    $S0 .= $S1
     $S1 = pio.'read'(1024) # read the rest of the file (much shorter than 1K)
     $S0 .= $S1
     pio.'close'()
@@ -650,13 +654,13 @@
     $S2 = encodingname $I1
     say $S2
 
-    $I1 = find_encoding 'iso-8859-1'
-    trans_encoding $S1, $S0, $I1
-    print $S1
+    print $S0
 .end
 CODE
+T\xc3\xb6
+tsch \xe2\x82\xac
 utf8
-T\xf6tsch
+T\xc3\xb6tsch \xe2\x82\xac100
 OUTPUT
 
 pir_output_is( <<"CODE", <<"OUTPUT", "PIO.readall() - classmeth" );


More information about the parrot-commits mailing list