[svn:parrot] r46822 - in trunk: . compilers/imcc src/packfile

NotFound at svn.parrot.org NotFound at svn.parrot.org
Thu May 20 19:45:08 UTC 2010


Author: NotFound
Date: Thu May 20 19:45:07 2010
New Revision: 46822
URL: https://trac.parrot.org/parrot/changeset/46822

Log:
Store the string encoding in PBC and change the semantics of wide-char encodings in PIR string literals

Modified:
   trunk/PBC_COMPAT
   trunk/compilers/imcc/pbc.c
   trunk/src/packfile/pf_items.c

Modified: trunk/PBC_COMPAT
==============================================================================
--- trunk/PBC_COMPAT	Thu May 20 18:12:32 2010	(r46821)
+++ trunk/PBC_COMPAT	Thu May 20 19:45:07 2010	(r46822)
@@ -27,6 +27,7 @@
 
 # please insert tab separated entries at the top of the list
 
+6.17	2010.05.20	NotFound	store encoding of string constants
 6.16	2010.05.18	plobsing	move freeze/thaw adjacent to visit
 6.15	2010.05.06	bacek	add StringBuilder PMC
 6.14	2010.05.03	coke	remove popaction, pushmark, pushaction ops.

Modified: trunk/compilers/imcc/pbc.c
==============================================================================
--- trunk/compilers/imcc/pbc.c	Thu May 20 18:12:32 2010	(r46821)
+++ trunk/compilers/imcc/pbc.c	Thu May 20 19:45:07 2010	(r46822)
@@ -888,7 +888,7 @@
 IMCC_string_from_reg(PARROT_INTERP, ARGIN(const SymReg *r))
 {
     ASSERT_ARGS(IMCC_string_from_reg)
-    const char *buf = r->name;
+    char *buf = r->name;
 
     if (r->type & VT_ENCODED) {
         /*
@@ -896,19 +896,63 @@
          * get first part as charset, rest as string
          */
         STRING     *s;
+        const CHARSET *s_charset;
+        const ENCODING *s_encoding = NULL;
+        const ENCODING *src_encoding;
         const char *charset;
-        char * const p = strchr(r->name, '"');
+        #define MAX_NAME 31
+        char charset_name[MAX_NAME + 1];
+        char encoding_name[MAX_NAME + 1];
+        char * p = strchr(r->name, '"');
+        char * p2 = strchr(r->name, ':');
         PARROT_ASSERT(p && p[-1] == ':');
-
-        p[-1]   = 0;
-        charset = r->name;
+        if (p2 < p -1) {
+            strncpy(encoding_name, buf, p2 - buf);
+            encoding_name[p2-buf] = '\0';
+            strncpy(charset_name, p2 +1, p - p2 - 2);
+            charset_name[p- p2 - 2] = '\0';
+            /*fprintf(stderr, "%s:%s\n", charset_name, encoding_name);*/
+            s_charset = Parrot_find_charset(interp, charset_name);
+            s_encoding = Parrot_find_encoding(interp, encoding_name);
+        }
+        else {
+            strncpy(charset_name, buf, p - buf - 1);
+            charset_name[p - buf - 1] = '\0';
+            /*fprintf(stderr, "%s\n", charset_name);*/
+            s_charset = Parrot_find_charset(interp, charset_name);
+        }
+        if (strcmp(charset_name, "unicode") == 0)
+            src_encoding = Parrot_utf8_encoding_ptr;
+        else
+            src_encoding = Parrot_fixed_8_encoding_ptr;
+        if (s_encoding == NULL)
+            s_encoding = src_encoding;
 
         /* past delim */
         buf     = p + 1;
-        s       = Parrot_str_unescape(interp, buf, '"', charset);
-
-        /* restore colon, as we may reuse this string */
-        p[-1] = ':';
+        if (strcmp(charset_name, "unicode") == 0 && strcmp(encoding_name, "utf8") == 0) {
+            /* Special case needed for backward compatibility with utf8 literals
+	     * using \xHH\xHH byte sequences */
+            s = Parrot_str_unescape(interp, buf, '"', "utf8:unicode");
+        }
+        else {
+            p       = buf;
+            p2      = strchr(buf, '"');
+            while (p2 != NULL) {
+               p  = p2;
+               p2 = strchr(p + 1, '"');
+            }
+            {
+                STRING * aux = Parrot_str_new_init(interp, buf, p - buf,
+                        src_encoding, s_charset, 0);
+                s = Parrot_str_unescape_string(interp, aux,
+                        s_charset, s_encoding, PObj_constant_FLAG);
+                if (!CHARSET_VALIDATE(interp, s))
+                       Parrot_ex_throw_from_c_args(interp, NULL,
+                               EXCEPTION_INVALID_STRING_REPRESENTATION,
+                               "Malformed string");
+            }
+        }
         return s;
     }
     else if (*buf == '"') {

Modified: trunk/src/packfile/pf_items.c
==============================================================================
--- trunk/src/packfile/pf_items.c	Thu May 20 18:12:32 2010	(r46821)
+++ trunk/src/packfile/pf_items.c	Thu May 20 19:45:07 2010	(r46822)
@@ -1216,7 +1216,10 @@
     ASSERT_ARGS(PF_fetch_string)
     STRING   *s;
     UINTVAL   flags;
+    UINTVAL   encoding_nr;
     UINTVAL   charset_nr;
+    ENCODING *encoding;
+    CHARSET  *charset;
     size_t    size;
     const int wordsize          = pf ? pf->header->wordsize : sizeof (opcode_t);
     opcode_t  flag_charset_word = PF_fetch_opcode(pf, cursor);
@@ -1224,20 +1227,31 @@
     if (flag_charset_word == -1)
         return STRINGNULL;
 
-    /* decode flags and charset */
+    /* decode flags, charset and encoding */
     flags         = (flag_charset_word & 0x1 ? PObj_constant_FLAG : 0) |
                     (flag_charset_word & 0x2 ? PObj_private7_FLAG : 0) ;
-    charset_nr    = flag_charset_word >> 8;
+    encoding_nr   = (flag_charset_word >> 16);
+    charset_nr    = (flag_charset_word >> 8) & 0xFF;
 
 
     size = (size_t)PF_fetch_opcode(pf, cursor);
 
     TRACE_PRINTF(("PF_fetch_string(): flags=0x%04x, ", flags));
+    TRACE_PRINTF(("encoding_nr=%ld, ", encoding_nr));
     TRACE_PRINTF(("charset_nr=%ld, ", charset_nr));
     TRACE_PRINTF(("size=%ld.\n", size));
 
-    s = string_make_from_charset(interp, (const char *)*cursor,
-                        size, charset_nr, flags);
+    encoding = Parrot_get_encoding(interp, encoding_nr);
+    charset  = Parrot_get_charset(interp, charset_nr);
+    if (!encoding)
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
+                    "Invalid encoding number '%d' specified", encoding_nr);
+    if (!charset)
+            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
+                    "Invalid charset number '%d' specified", charset_nr);
+
+    s = Parrot_str_new_init(interp, (const char *)*cursor, size,
+            encoding, charset, flags);
 
     /* print only printable characters */
     TRACE_PRINTF_VAL(("PF_fetch_string(): string is '%s' at 0x%x\n",
@@ -1298,8 +1312,9 @@
      * see also PF_fetch_string
      */
 
-    /* encode charset_nr and flags into the same word for a 33% savings on constant overhead */
-    *cursor++ = (Parrot_charset_number_of_str(NULL, s) << 8)         |
+    /* encode charset_nr, encoding_nr and flags into the same word */
+    *cursor++ = (Parrot_encoding_number_of_str(NULL, s) << 16)       |
+                (Parrot_charset_number_of_str(NULL, s) << 8)         |
                 (PObj_get_FLAGS(s) & PObj_constant_FLAG ? 0x1 : 0x0) |
                 (PObj_get_FLAGS(s) & PObj_private7_FLAG ? 0x2 : 0x0) ;
     *cursor++ = s->bufused;


More information about the parrot-commits mailing list