[svn:parrot] r47827 - in branches/gsoc_nfg: src/string/encoding t/op
darbelo at svn.parrot.org
darbelo at svn.parrot.org
Fri Jun 25 06:48:33 UTC 2010
Author: darbelo
Date: Fri Jun 25 06:48:33 2010
New Revision: 47827
URL: https://trac.parrot.org/parrot/changeset/47827
Log:
Work out the last iterator issues causing the creation of dynamic graphemes for valid Unicode compositions.
Add passing tests for this and a failing one for a newly uncovered bug with lazy table creation in transcoding.
Modified:
branches/gsoc_nfg/src/string/encoding/nfg.c
branches/gsoc_nfg/t/op/string_nfg.t
Modified: branches/gsoc_nfg/src/string/encoding/nfg.c
==============================================================================
--- branches/gsoc_nfg/src/string/encoding/nfg.c Fri Jun 25 05:35:00 2010 (r47826)
+++ branches/gsoc_nfg/src/string/encoding/nfg.c Fri Jun 25 06:48:33 2010 (r47827)
@@ -21,6 +21,14 @@
#include "parrot/parrot.h"
#include "../unicode.h"
#include "../grapheme.h"
+#if PARROT_HAS_ICU
+# include <unicode/ucnv.h>
+# include <unicode/utypes.h>
+# include <unicode/uchar.h>
+# include <unicode/ustring.h>
+# include <unicode/unorm.h>
+#endif
+
#if !PARROT_HAS_ICU
PARROT_DOES_NOT_RETURN
@@ -523,18 +531,13 @@
i->bytepos = pos * sizeof (UChar32);
return;
}
- // TODO: This can create dynamic graphemes for valid Unicode compositions.
else {
grapheme_table *table = (grapheme_table *) i->str->extra;
int32_t prev = s[pos - 1];
grapheme g;
- if (table == NULL) {
- table = create_grapheme_table(interp, 1);
- i->str->extra = table;
- }
-
if (prev < 0) {
+ PARROT_ASSERT(table);
g.len = table->graphemes[-1 - prev].len + 1;
g.hash = table->graphemes[-1 - prev].hash;
g.hash += g.hash << 5;
@@ -544,6 +547,30 @@
g.len * sizeof (UChar));
}
else {
+ UErrorCode err = U_ZERO_ERROR;
+ int dst_len = 1;
+ int src_len = 1;
+ UChar src[2];
+ UChar dst[2];
+ src[0] = s[pos - 1];
+ src[1] = s[pos];
+
+ /* Delegate composition to ICU. */
+ dst_len = unorm_normalize(src, src_len, UNORM_DEFAULT, 0,
+ dst, dst_len, &err);
+
+ if (U_SUCCESS(err)) {
+ /* Composition succeeded, we have a valid Unicode codepoint. */
+ s[pos - 1] = dst[0];
+ return;
+ }
+
+ /* Composition failed, we need a dynamic codepoint. */
+ if (table == NULL) {
+ table = create_grapheme_table(interp, 1);
+ i->str->extra = table;
+ }
+
g.len = 2;
g.hash = 0xffff;
g.codepoints = mem_gc_allocate_n_typed(interp, g.len, UChar32);
@@ -556,8 +583,14 @@
g.hash += g.hash << 5;
g.hash += c;
}
+
+ /*
+ * If we reach this point, then a dynamic codepoint was created earlier.
+ * Insert it into the table, growing it if needed.
+ */
if (grapheme_table_capacity(interp, (grapheme_table *)i->str->extra) < 1)
i->str->extra = grow_grapheme_table(interp, (grapheme_table *) i->str->extra, 2);
+
s[pos - 1] = add_grapheme(interp, (grapheme_table *) i->str->extra, &g);
mem_gc_free(interp, g.codepoints);
}
Modified: branches/gsoc_nfg/t/op/string_nfg.t
==============================================================================
--- branches/gsoc_nfg/t/op/string_nfg.t Fri Jun 25 05:35:00 2010 (r47826)
+++ branches/gsoc_nfg/t/op/string_nfg.t Fri Jun 25 06:48:33 2010 (r47827)
@@ -20,7 +20,7 @@
.include 'stringinfo.pasm'
-.const int TESTS = 8
+.const int TESTS = 12
.sub _main :main
.include 'test_more.pir'
@@ -48,6 +48,26 @@
$I1 = stringinfo $S1, .STRINGINFO_BUFUSED
is($I0, $I1, "Bufused is the same.")
+ # Do the same thing again, without dynamic codepoints.
+ $S2 = utf16:unicode:"O\u0308"
+ $I0 = find_encoding 'nfg'
+ $S2 = trans_encoding $S2, $I0
+
+ $S3 = nfg:unicode:"O\u0308"
+
+ $I0 = stringinfo $S2, .STRINGINFO_STRLEN
+ $I1 = stringinfo $S3, .STRINGINFO_STRLEN
+ is($I0, $I1, "Lenght is the same.")
+
+ $I0 = stringinfo $S2, .STRINGINFO_BUFUSED
+ $I1 = stringinfo $S3, .STRINGINFO_BUFUSED
+ is($I0, $I1, "Bufused is the same.")
+
+ $I0 = stringinfo $S2, .STRINGINFO_EXTRA
+ $I1 = stringinfo $S3, .STRINGINFO_EXTRA
+ is($I0, $I1, "EXTRA is the same.")
+ is($I1, 0, "EXTRA is NULL.")
+
.end
.sub transcode_without_graphemes
More information about the parrot-commits
mailing list