[svn:parrot] r47827 - in branches/gsoc_nfg: src/string/encoding t/op

darbelo at svn.parrot.org darbelo at svn.parrot.org
Fri Jun 25 06:48:33 UTC 2010


Author: darbelo
Date: Fri Jun 25 06:48:33 2010
New Revision: 47827
URL: https://trac.parrot.org/parrot/changeset/47827

Log:
Work out the last iterator issues causing the creation of dynamic graphemes for valid Unicode compositions.
Add passing tests for this and a failing one for a newly uncovered bug with lazy table creation in transcoding.

Modified:
   branches/gsoc_nfg/src/string/encoding/nfg.c
   branches/gsoc_nfg/t/op/string_nfg.t

Modified: branches/gsoc_nfg/src/string/encoding/nfg.c
==============================================================================
--- branches/gsoc_nfg/src/string/encoding/nfg.c	Fri Jun 25 05:35:00 2010	(r47826)
+++ branches/gsoc_nfg/src/string/encoding/nfg.c	Fri Jun 25 06:48:33 2010	(r47827)
@@ -21,6 +21,14 @@
 #include "parrot/parrot.h"
 #include "../unicode.h"
 #include "../grapheme.h"
+#if PARROT_HAS_ICU
+#  include <unicode/ucnv.h>
+#  include <unicode/utypes.h>
+#  include <unicode/uchar.h> 
+#  include <unicode/ustring.h>
+#  include <unicode/unorm.h>
+#endif  
+
 
 #if !PARROT_HAS_ICU
 PARROT_DOES_NOT_RETURN
@@ -523,18 +531,13 @@
         i->bytepos = pos * sizeof (UChar32);
         return;
     }
-    // TODO: This can create dynamic graphemes for valid Unicode compositions.
     else {
         grapheme_table *table = (grapheme_table *) i->str->extra;
         int32_t  prev = s[pos - 1];
         grapheme g;
 
-        if (table == NULL) {
-            table = create_grapheme_table(interp, 1);
-            i->str->extra = table;
-        }
-
         if (prev < 0) {
+            PARROT_ASSERT(table);
             g.len = table->graphemes[-1 - prev].len + 1;
             g.hash = table->graphemes[-1 - prev].hash;
             g.hash += g.hash << 5;
@@ -544,6 +547,30 @@
                    g.len * sizeof (UChar));
         }
         else {
+            UErrorCode err = U_ZERO_ERROR;
+            int dst_len = 1;
+            int src_len = 1;
+            UChar src[2];
+            UChar dst[2];
+            src[0] = s[pos - 1];
+            src[1] = s[pos];
+
+            /* Delegate composition to ICU. */
+            dst_len = unorm_normalize(src, src_len, UNORM_DEFAULT, 0,
+                                      dst, dst_len, &err);
+
+            if (U_SUCCESS(err)) {
+                /* Composition succeeded, we have a valid Unicode codepoint. */
+                s[pos - 1] = dst[0];
+                return;
+            }
+
+            /* Composition failed, we need a dynamic codepoint. */
+            if (table == NULL) {
+                table = create_grapheme_table(interp, 1);
+                i->str->extra = table;
+            }
+
             g.len  = 2;
             g.hash = 0xffff;
             g.codepoints = mem_gc_allocate_n_typed(interp, g.len, UChar32);
@@ -556,8 +583,14 @@
             g.hash += g.hash << 5;
             g.hash += c;
         }
+
+        /* 
+         * If we reach this point, then a dynamic codepoint was created earlier.
+         * Insert it into the table, growing it if needed.
+         */
         if (grapheme_table_capacity(interp, (grapheme_table *)i->str->extra) < 1)
             i->str->extra = grow_grapheme_table(interp, (grapheme_table *) i->str->extra, 2);
+
         s[pos - 1] = add_grapheme(interp, (grapheme_table *) i->str->extra, &g);
         mem_gc_free(interp, g.codepoints);
     }

Modified: branches/gsoc_nfg/t/op/string_nfg.t
==============================================================================
--- branches/gsoc_nfg/t/op/string_nfg.t	Fri Jun 25 05:35:00 2010	(r47826)
+++ branches/gsoc_nfg/t/op/string_nfg.t	Fri Jun 25 06:48:33 2010	(r47827)
@@ -20,7 +20,7 @@
 
 .include 'stringinfo.pasm'
 
-.const int TESTS = 8
+.const int TESTS = 12
 
 .sub _main :main
     .include 'test_more.pir'
@@ -48,6 +48,26 @@
     $I1 = stringinfo $S1, .STRINGINFO_BUFUSED
     is($I0, $I1, "Bufused is the same.")
 
+    # Do the same thing again, without dynamic codepoints.
+    $S2 = utf16:unicode:"O\u0308"
+    $I0 = find_encoding 'nfg'
+    $S2 = trans_encoding $S2, $I0
+
+    $S3 = nfg:unicode:"O\u0308"
+
+    $I0 = stringinfo $S2, .STRINGINFO_STRLEN
+    $I1 = stringinfo $S3, .STRINGINFO_STRLEN
+    is($I0, $I1, "Lenght is the same.")
+
+    $I0 = stringinfo $S2, .STRINGINFO_BUFUSED
+    $I1 = stringinfo $S3, .STRINGINFO_BUFUSED
+    is($I0, $I1, "Bufused is the same.")
+
+    $I0 = stringinfo $S2, .STRINGINFO_EXTRA
+    $I1 = stringinfo $S3, .STRINGINFO_EXTRA
+    is($I0, $I1, "EXTRA is the same.")
+    is($I1, 0, "EXTRA is NULL.")
+
 .end
 
 .sub transcode_without_graphemes


More information about the parrot-commits mailing list