[svn:parrot] r39529 - in trunk: config/gen/makefiles src/string t/op

bacek at svn.parrot.org bacek at svn.parrot.org
Fri Jun 12 23:23:29 UTC 2009


Author: bacek
Date: Fri Jun 12 23:23:26 2009
New Revision: 39529
URL: https://trac.parrot.org/parrot/changeset/39529

Log:
Merge tt24_unicode_numifications branch into trunk.

Modified:
   trunk/config/gen/makefiles/root.in
   trunk/src/string/api.c
   trunk/t/op/number.t
   trunk/t/op/sprintf.t
   trunk/t/op/string.t
   trunk/t/op/stringu.t

Modified: trunk/config/gen/makefiles/root.in
==============================================================================
--- trunk/config/gen/makefiles/root.in	Fri Jun 12 18:06:27 2009	(r39528)
+++ trunk/config/gen/makefiles/root.in	Fri Jun 12 23:23:26 2009	(r39529)
@@ -618,6 +618,7 @@
     $(SRC_DIR)/scheduler.str \
     $(SRC_DIR)/spf_render.str \
     $(SRC_DIR)/spf_vtable.str \
+    $(SRC_DIR)/string/api.str \
     $(SRC_DIR)/sub.str \
     $(SRC_DIR)/stacks.str \
     \

Modified: trunk/src/string/api.c
==============================================================================
--- trunk/src/string/api.c	Fri Jun 12 18:06:27 2009	(r39528)
+++ trunk/src/string/api.c	Fri Jun 12 23:23:26 2009	(r39529)
@@ -27,6 +27,7 @@
 #include "parrot/compiler.h"
 #include "parrot/string_funcs.h"
 #include "private_cstring.h"
+#include "api.str"
 
 #define nonnull_encoding_name(s) (s) ? (s)->encoding->name : "null string"
 #define saneify_string(s) \
@@ -2041,6 +2042,21 @@
     return output;
 }
 
+/*
+State of FSM during number value parsing.
+
+Integer uses only parse_start, parse_before_dot and parse_end.
+
+*/
+typedef enum number_parse_state {
+    parse_start,
+    parse_before_dot,
+    parse_after_dot,
+    parse_after_e,
+    parse_after_e_sign,
+    parse_end
+} number_parse_state;
+
 
 /*
 
@@ -2074,45 +2090,60 @@
     if (s == NULL)
         return 0;
     {
-        const char         *start     = s->strstart;
-        const char * const  end       = start + s->bufused;
         const INTVAL        max_safe  = PARROT_INTVAL_MAX / 10;
         const INTVAL        last_dig  = PARROT_INTVAL_MAX % 10;
         int                 sign      = 1;
-        INTVAL              in_number = 0;
         INTVAL              i         = 0;
+        String_iter         iter;
+        UINTVAL             offs;
+        number_parse_state  state = parse_start;
 
-        PARROT_ASSERT(s);
+        ENCODING_ITER_INIT(interp, s, &iter);
 
-        while (start < end) {
-            const unsigned char c = *start;
+        for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
+            const UINTVAL c = iter.get_and_advance(interp, &iter);
+
+            switch (state) {
+                case parse_start:
+                    if (isdigit(c)) {
+                        const INTVAL nextval = c - '0';
+                        if (i < max_safe || (i == max_safe && nextval <= last_dig))
+                            i = i * 10 + nextval;
+                        else
+                            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ERR_OVERFLOW,
+                                "Integer value of String '%S' too big", s);
+                        state = parse_before_dot;
+                    }
+                    else if (c == '-') {
+                        sign      = -1;
+                        state = parse_before_dot;
+                    }
+                    else if (c == '+')
+                        state = parse_before_dot;
+                    else if (isspace((unsigned char)c))
+                        ; /* Do nothing */
+                    else
+                        state = parse_end;
 
-            if (isdigit((unsigned char)c)) {
-                const INTVAL nextval = c - '0';
-                in_number = 1;
-                if (i < max_safe || (i == max_safe && nextval <= last_dig))
-                    i = i * 10 + nextval;
-                else
-                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ERR_OVERFLOW,
-                        "Integer value of String '%S' too big", s);
-            }
-            else if (!in_number) {
-                /* we've not yet seen any digits */
-                if (c == '-') {
-                    sign      = -1;
-                    in_number = 1;
-                }
-                else if (c == '+')
-                    in_number = 1;
-                else if (isspace((unsigned char)c))
-                    ;
-                else
+                    break;
+
+                case parse_before_dot:
+                    if (isdigit(c)) {
+                        const INTVAL nextval = c - '0';
+                        if (i < max_safe || (i == max_safe && nextval <= last_dig))
+                            i = i * 10 + nextval;
+                        else
+                            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ERR_OVERFLOW,
+                                "Integer value of String '%S' too big", s);
+                    }
+                    else
+                        state = parse_end;
+                    break;
+
+                default:
+                    /* Pacify compiler */
                     break;
             }
-            else {
-                break;
-            }
-            ++start;
         }
 
         i *= sign;
@@ -2121,7 +2152,6 @@
     }
 }
 
-
 /*
 
 =item C<FLOATVAL Parrot_str_to_num(PARROT_INTERP, const STRING *s)>
@@ -2138,39 +2168,158 @@
 Parrot_str_to_num(PARROT_INTERP, ARGIN(const STRING *s))
 {
     ASSERT_ARGS(Parrot_str_to_num)
-    FLOATVAL    f;
-    char       *cstr;
-    const char *p;
+    FLOATVAL      f         = 0.0;
+    FLOATVAL      mantissa  = 0.0;
+    FLOATVAL      sign      = 1.0; /* -1 for '-' */
+    FLOATVAL      divider   = 0.1;
+    INTVAL        e         = 0;
+    INTVAL        e_sign    = 1; /* -1 for '-' */
+    /* How many digits it's safe to parse */
+    const INTVAL  max_safe  = PARROT_INTVAL_MAX / 10;
+    INTVAL        m         = 0;    /* Integer mantissa */
+    int           m_is_safe = 1;    /* We can use integer mantissa */
+    INTVAL        d         = 0;    /* Integer descriminator */
+    int           d_is_safe = 1;    /* We can use integer mantissa */
+    int           d_length  = 0;
+    int           check_nan = 0;    /* Check for NaN and Inf after main loop */
+    String_iter iter;
+    UINTVAL     offs;
+    number_parse_state state = parse_start;
 
-    /*
-     * XXX C99 atof interprets 0x prefix
-     * XXX would strtod() be better for detecting malformed input?
-     */
-    cstr = Parrot_str_to_cstring(interp, s);
-    p    = cstr;
+    if (!s)
+        return 0.0;
 
-    while (isspace((unsigned char)*p))
-        p++;
+    ENCODING_ITER_INIT(interp, s, &iter);
 
-    if (STREQ(p, PARROT_CSTRING_INF_POSITIVE))
-        f = PARROT_FLOATVAL_INF_POSITIVE;
-    else if (STREQ(p, PARROT_CSTRING_INF_NEGATIVE))
-        f = PARROT_FLOATVAL_INF_NEGATIVE;
-    else if (STREQ(p, PARROT_CSTRING_NAN_QUIET))
-        f = PARROT_FLOATVAL_NAN_QUIET;
-    else
-        f = atof(p);
+    /* Handcrafter FSM to read float value */
+    for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
+        const UINTVAL c = iter.get_and_advance(interp, &iter);
+        switch (state) {
+            case parse_start:
+                if (isdigit(c)) {
+                    f = c - '0';
+                    m = c - '0';
+                    state = parse_before_dot;
+                }
+                else if (c == '-') {
+                    sign = -1.0;
+                    state = parse_before_dot;
+                }
+                else if (c == '+')
+                    state = parse_before_dot;
+                else if (c == '.')
+                    state = parse_after_dot;
+                else if (isspace(c))
+                    ; /* Do nothing */
+                else {
+                    check_nan = 1;
+                    state     = parse_end;
+                }
+                break;
 
-    /* Not all atof()s return -0 from "-0" */
-    if (*p == '-' && FLOAT_IS_ZERO(f))
-#if defined(_MSC_VER)
-        /* Visual C++ compiles -0.0 to 0.0, so we need to trick
-            the compiler. */
-        f = 0.0 * -1;
-#else
-        f = -0.0;
-#endif
-    Parrot_str_free_cstring(cstr);
+            case parse_before_dot:
+                if (isdigit(c)) {
+                    f = f*10.0 + (c-'0');
+                    m = m*10 + (c-'0');
+                    /* Integer overflow for mantissa */
+                    if (m >= max_safe)
+                        m_is_safe = 0;
+                }
+                else if (c == '.') {
+                    state = parse_after_dot;
+                    /*
+                     * Throw gathered result. Recalulate from integer mantissa
+                     * to preserve precision.
+                     */
+                    if (m_is_safe)
+                        f = m;
+                    mantissa = f;
+                }
+                else if (c == 'e' || c == 'E') {
+                    state = parse_after_e;
+                    /* See comment above */
+                    if (m_is_safe)
+                        f = m;
+                    mantissa = f;
+                }
+                else {
+                    check_nan = 1;
+                    state     = parse_end;
+                }
+                break;
+
+            case parse_after_dot:
+                if (isdigit(c)) {
+                    f += (c-'0') * divider;
+                    divider /= 10.0;
+                    d = d*10 + (c-'0');
+                    if (d >= max_safe)
+                        d_is_safe = 0;
+                    d_length++;
+                }
+                else if (c == 'e' || c == 'E')
+                    state = parse_after_e;
+                else
+                    state = parse_end;
+                break;
+
+            case parse_after_e:
+                if (isdigit(c)) {
+                    e = e*10 + (c-'0');
+                    state = parse_after_e_sign;
+                }
+                else if (c == '-') {
+                    e_sign = -1;
+                    state = parse_after_e_sign;
+                }
+                else if (c == '+')
+                    state = parse_after_e_sign;
+                else
+                    state = parse_end;
+                break;
+
+            case parse_after_e_sign:
+                if (isdigit(c))
+                    e = e*10 + (c-'0');
+                else
+                    state = parse_end;
+                break;
+
+            case parse_end:
+            default:
+                /* Pacify compiler */
+                break;
+        }
+    }
+
+    /* Support for non-canonical NaN and Inf */
+    /* charpos <=2 because for "-i" iter will be advanced to next char already */
+    if (check_nan && (iter.charpos <= 2)) {
+        STRING *t = Parrot_str_upcase(interp, s);
+        if (Parrot_str_equal(interp, t, CONST_STRING(interp, "NAN")))
+            return PARROT_FLOATVAL_NAN_QUIET;
+        else if (Parrot_str_equal(interp, t, CONST_STRING(interp, "INF"))
+                || Parrot_str_equal(interp, t, CONST_STRING(interp, "INFINITY")))
+            return PARROT_FLOATVAL_INF_POSITIVE;
+        else if (Parrot_str_equal(interp, t, CONST_STRING(interp, "-INF"))
+                || Parrot_str_equal(interp, t, CONST_STRING(interp, "-INFINITY")))
+            return PARROT_FLOATVAL_INF_NEGATIVE;
+        else
+            return 0.0;
+    }
+
+    if (d && d_is_safe) {
+        f = mantissa + (1.0 * d / powl(10, d_length));
+    }
+
+    f = f * sign;
+
+    if (e) {
+        if (e_sign == 1)
+            f *= powl(10, e);
+        else
+            f /= powl(10, e);
+    }
 
     return f;
 }

Modified: trunk/t/op/number.t
==============================================================================
--- trunk/t/op/number.t	Fri Jun 12 18:06:27 2009	(r39528)
+++ trunk/t/op/number.t	Fri Jun 12 23:23:26 2009	(r39529)
@@ -1081,14 +1081,7 @@
 0.5
 OUTPUT
 
-# long double succeeds
-$output = $PConfig{numvalsize} == 8
-  ? '1.4142135623731
-1.41421356237309
-' : '1.4142135623731
-1.4142135623731
-';
-pasm_output_is( <<'CODE', $output, "sqrt_n_n" );
+pasm_output_is( <<'CODE', <<OUTPUT, "sqrt_n_n" );
         set N1, 2
         sqrt N2, N1
         say N2
@@ -1096,6 +1089,9 @@
         say N2
         end
 CODE
+1.4142135623731
+1.4142135623731
+OUTPUT
 
 pasm_error_output_like( <<'CODE', <<OUTPUT, "div_n_n by zero" );
         set N0, 0

Modified: trunk/t/op/sprintf.t
==============================================================================
--- trunk/t/op/sprintf.t	Fri Jun 12 18:06:27 2009	(r39528)
+++ trunk/t/op/sprintf.t	Fri Jun 12 23:23:26 2009	(r39529)
@@ -198,6 +198,9 @@
     description .= ' actual: >'
     description .= actual
     description .= '<'
+    description .= ' expected: >'
+    description .= expected
+    description .= '<'
     goto is_nok
 
     # remove /'s

Modified: trunk/t/op/string.t
==============================================================================
--- trunk/t/op/string.t	Fri Jun 12 18:06:27 2009	(r39528)
+++ trunk/t/op/string.t	Fri Jun 12 23:23:26 2009	(r39529)
@@ -7,7 +7,7 @@
 use lib qw( . lib ../lib ../../lib );
 
 use Test::More;
-use Parrot::Test tests => 163;
+use Parrot::Test tests => 165;
 use Parrot::Config;
 
 =head1 NAME
@@ -2926,6 +2926,38 @@
 Foo/Bar
 OUT
 
+pir_output_is( <<'CODE', <<'OUT', 'Corner cases of numification' );
+.sub main :main
+    say 2147483647.0
+    say -2147483648.0
+.end
+CODE
+2147483647
+-2147483648
+OUT
+pir_output_is( <<'CODE', <<'OUT', 'Non canonical nan and inf' );
+.sub main :main
+    $N0 = 'nan'
+    say $N0
+    $N0 = 'iNf'
+    say $N0
+    $N0 = 'INFINITY'
+    say $N0
+    $N0 = '-INF'
+    say $N0
+    $N0 = '-Infinity'
+    say $N0
+.end
+CODE
+NaN
+Inf
+Inf
+-Inf
+-Inf
+OUT
+
+
+
 # Local Variables:
 #   mode: cperl
 #   cperl-indent-level: 4

Modified: trunk/t/op/stringu.t
==============================================================================
--- trunk/t/op/stringu.t	Fri Jun 12 18:06:27 2009	(r39528)
+++ trunk/t/op/stringu.t	Fri Jun 12 23:23:26 2009	(r39529)
@@ -6,7 +6,7 @@
 use warnings;
 use lib qw( . lib ../lib ../../lib );
 use Test::More;
-use Parrot::Test tests => 28;
+use Parrot::Test tests => 31;
 use Parrot::Config;
 
 =head1 NAME
@@ -502,6 +502,51 @@
 OUTPUT
 
 
+pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings' );
+.sub main :main
+     $S0 = "140"
+     $I0 = $S0
+     say $I0
+     $I0 = find_encoding 'ucs2'
+     $S0 = trans_encoding $S0, $I0
+     $I0 = $S0
+     say $I0
+.end
+CODE
+140
+140
+OUT
+
+pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings' );
+.sub main :main
+     $S0 = "140"
+     $N0 = $S0
+     say $N0
+     $I0 = find_encoding 'ucs2'
+     $S0 = trans_encoding $S0, $I0
+     $N0 = $S0
+     say $N0
+.end
+CODE
+140
+140
+OUT
+
+pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings' );
+.sub main :main
+    $S0 = unicode:"140 r\x{e9}sum\x{e9}s"
+    $N0 = $S0
+    say $N0
+    $I0 = find_encoding 'ucs2'
+    $S0 = trans_encoding $S0, $I0
+    $N0 = $S0
+    say $N0
+.end
+CODE
+140
+140
+OUT
+
 
 
 # Local Variables:


More information about the parrot-commits mailing list