[svn:parrot] r37860 - in trunk: compilers/pge/PGE src/pmc t/compilers/pge/perl6regex t/pmc
pmichaud at svn.parrot.org
pmichaud at svn.parrot.org
Thu Apr 2 00:51:26 UTC 2009
Author: pmichaud
Date: Thu Apr 2 00:51:24 2009
New Revision: 37860
URL: https://trac.parrot.org/parrot/changeset/37860
Log:
Merge refactors from p6strings branch into trunk.
Modified:
trunk/compilers/pge/PGE/Perl6Regex.pir
trunk/src/pmc/codestring.pmc
trunk/t/compilers/pge/perl6regex/rx_metachars
trunk/t/pmc/codestring.t
Modified: trunk/compilers/pge/PGE/Perl6Regex.pir
==============================================================================
--- trunk/compilers/pge/PGE/Perl6Regex.pir Wed Apr 1 22:05:58 2009 (r37859)
+++ trunk/compilers/pge/PGE/Perl6Regex.pir Thu Apr 2 00:51:24 2009 (r37860)
@@ -145,6 +145,120 @@
.return (match)
.end
+=item C<p6escapes>
+
+Parse and calculate various Perl 6 string escapes, such as \n, \r,
+\x, \o, and \c. For the latter escapes, also handle the bracketed
+forms and other special forms.
+
+Note that this function is used directly by PCT::Grammar and Rakudo,
+and someday may be refactored to a different location.
+
+=cut
+
+.sub 'p6escapes'
+ .param pmc mob
+ .param pmc adverbs :slurpy :named
+ .local string target, backchar, literal
+ .local int pos, lastpos
+ $P0 = get_hll_global ['PGE'], '$!MATCH'
+ (mob, pos, target) = $P0.'new'(mob, adverbs :flat :named)
+ lastpos = length target
+ if pos >= lastpos goto fail
+ $S0 = substr target, pos, 1
+ inc pos
+ if $S0 != "\\" goto fail
+ if pos >= lastpos goto fail
+ backchar = substr target, pos, 1
+ inc pos
+ backchar = downcase backchar
+ $I0 = index "\\0abefnrtxco", backchar
+ if $I0 < 0 goto fail
+ if $I0 >= 9 goto scan_xco
+ literal = substr "\\\0\a\b\e\f\n\r\t", $I0, 1
+ goto succeed
+ scan_xco:
+ ## Handle \x, \c, and \o escapes. Start by converting
+ ## backchar into the appropriate radix, then loop through
+ ## the characters that follow to compute the decimal value
+ ## of codepoints, and concatenate the codepoints into a
+ ## literal.
+ .local int base, decnum, isbracketed
+ base = index ' o c x', backchar
+ literal = ''
+ $S0 = substr target, pos, 1
+ isbracketed = iseq $S0, '['
+ pos += isbracketed
+ ## Handle the case of \cC (control escape).
+ if base != 10 goto scan_xco_char
+ if isbracketed goto scan_xco_char
+ $I0 = is_cclass .CCLASS_NUMERIC, $S0, 0
+ if $I0 goto scan_xco_char
+ ## xor the 64 bit
+ $I0 = ord $S0
+ bxor $I0, 64
+ literal = chr $I0
+ inc pos
+ goto succeed
+ scan_xco_char:
+ decnum = 0
+ if base != 10 goto scan_xco_char_digits
+ unless isbracketed goto scan_xco_char_digits
+ $I0 = is_cclass .CCLASS_NUMERIC, target, pos
+ if $I0 goto scan_xco_char_digits
+ ## look up character by name
+ .local int namepos
+ namepos = index target, ']', pos
+ if namepos < 0 goto err_missing_bracket
+ $I0 = index target, ',', pos
+ if $I0 < 0 goto have_namepos
+ if namepos < $I0 goto have_namepos
+ namepos = $I0
+ have_namepos:
+ $I0 = namepos - pos
+ $S0 = substr target, pos, $I0
+ $P0 = new 'CodeString'
+ decnum = $P0.'charname_to_ord'($S0)
+ if decnum < 0 goto err_unicode_name
+ pos = namepos
+ goto scan_xco_char_end
+ scan_xco_char_digits:
+ $S0 = substr target, pos, 1
+ $I0 = index "0123456789abcdef0123456789ABCDEF", $S0
+ if $I0 < 0 goto scan_xco_char_end
+ $I0 %= 16
+ if $I0 >= base goto scan_xco_char_end
+ decnum *= base
+ decnum += $I0
+ inc pos
+ goto scan_xco_char_digits
+ scan_xco_char_end:
+ $S1 = chr decnum
+ concat literal, $S1
+ $S0 = substr target, pos, 1
+ unless isbracketed goto scan_xco_end
+ if $S0 == ']' goto scan_xco_end
+ if $S0 == '' goto err_missing_bracket
+ if $S0 != ',' goto err_digit
+ inc pos
+ goto scan_xco_char
+ scan_xco_end:
+ pos += isbracketed
+ succeed:
+ mob.'!make'(literal)
+ mob.'to'(pos)
+ fail:
+ .return (mob)
+
+ err_unicode_name:
+ 'parse_error'(mob, pos, "Unrecognized character name")
+ err_missing_bracket:
+ 'parse_error'(mob, pos, "Missing close bracket for \\x[...], \\o[...], or \\c[...]")
+ err_digit:
+ 'parse_error'(mob, pos, "Invalid digit in \\x[...], \\o[...], or \\c[...]")
+.end
+
+
=item C<onload()>
Initializes the Perl6Regex parser and other data structures
@@ -362,39 +476,50 @@
.param pmc adverbs :slurpy :named
.local string target
- .local int pos, lastpos
+ .local int pos, lastpos, isnegated
$P0 = getattribute mob, '$.target'
target = $P0
$P0 = getattribute mob, '$.pos'
pos = $P0
lastpos = length target
+ isnegated = 0
- .local string initchar
- initchar = substr target, pos, 1
- $I0 = is_cclass .CCLASS_WORD, initchar, 0
- if $I0 goto term_metachar
- quoted_metachar:
- inc pos
- mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Literal')
- mob.'!make'(initchar)
- mob.'to'(pos)
- .return (mob)
-
- term_metachar:
- .local int isnegated
- isnegated = is_cclass .CCLASS_UPPERCASE, initchar, 0
- ## $S0 = downcase FIXME: RT# 48108
- $I0 = ord initchar
+ .local string backchar, charlist
+ ## get whatever follows the backslash
+ backchar = substr target, pos, 1
+ charlist = backchar
+ inc pos
+
+ ## if it's not a word character, it's a quoted metachar
+ $I0 = is_cclass .CCLASS_WORD, backchar, 0
+ unless $I0 goto term_literal
+
+ ## if it's a word character, it may be negated
+ isnegated = is_cclass .CCLASS_UPPERCASE, backchar, 0
+ ## $S0 = downcase charlist FIXME: RT #48108
+ $I0 = ord backchar
$S0 = chr $I0
- $S0 = downcase $S0
- if $S0 == 'x' goto scan_xdo
- if $S0 == 'o' goto scan_xdo
+ backchar = downcase $S0
+
+ ## if it's \x, \c, or \o, parse as string escape
+ $I0 = index 'xco', backchar
+ if $I0 < 0 goto meta_esclist
+ meta_xco:
+ $I0 = pos - 2
+ $P0 = 'p6escapes'(mob, 'pos' => $I0)
+ unless $P0 goto err_xcoparse
+ pos = $P0.'to'()
+ charlist = $P0.'ast'()
+ unless isnegated goto term_literal
+ $I0 = length charlist
+ if $I0 > 1 goto err_negated_brackets
+ goto term_charlist
+
+ meta_esclist:
$P0 = get_global '%esclist'
- $I0 = exists $P0[$S0]
- if $I0 == 0 goto err_reserved_metachar
- inc pos
- .local string charlist
- charlist = $P0[$S0]
+ $I0 = exists $P0[backchar]
+ unless $I0 goto err_reserved_metachar
+ charlist = $P0[backchar]
if isnegated goto term_charlist
$I0 = length charlist
if $I0 > 1 goto term_charlist
@@ -412,48 +537,13 @@
mob.'to'(pos)
.return (mob)
- scan_xdo:
- inc pos
- .local int base, decnum, isbracketed
- charlist = ''
- base = index ' o d x', $S0
- decnum = 0
- $S0 = substr target, pos, 1
- isbracketed = iseq $S0, '['
- pos += isbracketed
- scan_xdo_char_loop:
- $S0 = substr target, pos, 1
- $I0 = index '0123456789abcdef', $S0
- if $I0 < 0 goto scan_xdo_char_end
- if $I0 >= base goto scan_xdo_char_end
- decnum *= base
- decnum += $I0
- inc pos
- goto scan_xdo_char_loop
- scan_xdo_char_end:
- $S1 = chr decnum
- concat charlist, $S1
- unless isbracketed goto scan_xdo_end
- if $S0 == ']' goto scan_xdo_end
- if $S0 == '' goto err_missing_bracket
- if $S0 != ',' goto err_bracketed
- if isnegated goto err_negated_brackets
- inc pos
- decnum = 0
- goto scan_xdo_char_loop
- scan_xdo_end:
- pos += isbracketed
- if isnegated goto term_charlist
- goto term_literal
-
- err_reserved_metachar:
- parse_error(mob, pos, 'Alphanumeric metacharacters are reserved')
- err_missing_bracket:
- parse_error(mob, pos, 'Missing close bracket for \\x[...] or \\o[...]')
- err_bracketed:
- parse_error(mob, pos, 'Invalid digit in \\x[...] or \\o[...]')
+ err_xcoparse:
+ parse_error(mob, pos, 'Unable to parse \x, \c, or \o value')
err_negated_brackets:
+ pos = mob.'from'()
parse_error(mob, pos, 'Cannot use comma in \\X[...] or \\O[...]')
+ err_reserved_metachar:
+ parse_error(mob, pos, 'Alphanumeric metacharacters are reserved')
.end
Modified: trunk/src/pmc/codestring.pmc
==============================================================================
--- trunk/src/pmc/codestring.pmc Wed Apr 1 22:05:58 2009 (r37859)
+++ trunk/src/pmc/codestring.pmc Thu Apr 2 00:51:24 2009 (r37860)
@@ -32,6 +32,10 @@
#include "parrot/parrot.h"
+#if PARROT_HAS_ICU
+# include <unicode/uchar.h>
+#endif
+
pmclass CodeString extends String provides string {
/*
@@ -200,6 +204,33 @@
/*
+=item C<charname_to_ord(string name)>
+
+Converts the Unicode character name given by C<name> to its
+codepoint value. Returns -1 if an error occurs in conversion.
+
+*/
+
+
+ METHOD charname_to_ord(STRING *name) {
+#if PARROT_HAS_ICU
+ UChar32 codepoint;
+ UErrorCode err = U_ZERO_ERROR;
+ char *cstr = Parrot_str_to_cstring(INTERP, name);
+ codepoint = u_charFromName(U_UNICODE_CHAR_NAME, cstr, &err);
+ Parrot_str_free_cstring(cstr);
+ if (U_SUCCESS(err)) {
+ RETURN(INTVAL codepoint);
+ }
+ RETURN(INTVAL -1);
+#else
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+#endif
+ }
+
+/*
+
=item C<key( string name1 [, string name2, ...] )>
Construct a PIR key using the strings passed as arguments.
Modified: trunk/t/compilers/pge/perl6regex/rx_metachars
==============================================================================
--- trunk/t/compilers/pge/perl6regex/rx_metachars Wed Apr 1 22:05:58 2009 (r37859)
+++ trunk/t/compilers/pge/perl6regex/rx_metachars Thu Apr 2 00:51:24 2009 (r37860)
@@ -204,6 +204,27 @@
c \X[0021] d abc!def n not hex (\X[])
c \X[0021]+ d abc!!def n not hex (\X[])
a \X[0021]+ f abcdef y not hex (\X[])
+c \c33 d abc!def y hex (\x)
+c \c33+ d abc!!def y hex (\x)
+a \c33+ f abcdef n hex (\x)
+b \c33 c abc!def n hex (\x)
+c \c[33] d abc!def y hex (\x[])
+c \c[33]+ d abc!!def y hex (\x[])
+c \c[33,33] d abc!!def y hex (\x[])
+a \c[33]+ f abcdef n hex (\x[])
+b \c[33] c abc!def n hex (\x[])
+\C33 a y not hex (\X)
+a \C33 c abc y not hex (\X)
+\C33 '' n not hex (\X)
+c \C33 d abc!def n not hex (\X)
+c \C33+ d abc!!def n not hex (\X)
+a \C33+ f abcdef y not hex (\X)
+\C[33] a y not hex (\X[])
+a \C[33] c abc y not hex (\X[])
+\C[33] '' n not hex (\X[])
+c \C[33] d abc!def n not hex (\X[])
+c \C[33]+ d abc!!def n not hex (\X[])
+a \C[33]+ f abcdef y not hex (\X[])
c \o041 d abc!def y octal (\o)
c \o41+ d abc!!def y octal (\o)
a \o41+ f abcdef n octal (\o)
Modified: trunk/t/pmc/codestring.t
==============================================================================
--- trunk/t/pmc/codestring.t Wed Apr 1 22:05:58 2009 (r37859)
+++ trunk/t/pmc/codestring.t Thu Apr 2 00:51:24 2009 (r37860)
@@ -19,7 +19,7 @@
.sub main :main
.include 'test_more.pir'
- plan(20)
+ plan(24)
create_codestring()
calls_to_unique()
@@ -31,6 +31,7 @@
output_global_unique_num()
namespace_keys()
first_char_repl_regression()
+ ord_from_name()
.end
.sub create_codestring
@@ -167,6 +168,27 @@
is(code, "new\n", "regression on first char repl bug looks fine")
.end
+.sub 'ord_from_name'
+ .local pmc code
+ load_bytecode 'config.pbc'
+ $P0 = _config()
+ $I0 = $P0['has_icu']
+ if $I0 goto has_icu
+ skip(3, 'ICU unavailable')
+ .return ()
+
+ has_icu:
+ code = new ['CodeString']
+ $I0 = code.'charname_to_ord'('LATIN CAPITAL LETTER C')
+ is($I0, 0x0043, "LATIN CAPITAL LETTER C")
+ $I0 = code.'charname_to_ord'('MUSIC FLAT SIGN')
+ is($I0, 0x266d, "MUSIC FLAT SIGN")
+ $I0 = code.'charname_to_ord'('RECYCLING SYMBOL FOR TYPE-1 PLASTICS')
+ is($I0, 0x2673, "RECYCLING SYMBOL FOR TYPE-1 PLASTICS")
+ $I0 = code.'charname_to_ord'('<no such symbol>')
+ is($I0, -1, '<no such symbol>')
+.end
+
# Local Variables:
# mode: pir
# fill-column: 100
More information about the parrot-commits
mailing list