[svn:parrot] r48833 - in trunk: . compilers/imcc config/gen/makefiles config/inter examples/config/file include/parrot lib/Parrot/Configure/Step src src/dynpmc src/io src/ops src/packfile src/pmc src/string src/string/charset src/string/encoding t/op t/pmc tools/dev
nwellnhof at svn.parrot.org
nwellnhof at svn.parrot.org
Tue Sep 7 22:58:41 UTC 2010
Author: nwellnhof
Date: Tue Sep 7 22:58:38 2010
New Revision: 48833
URL: https://trac.parrot.org/parrot/changeset/48833
Log:
Merge branch charset_massacre
Added:
trunk/src/string/encoding/ascii.c
trunk/src/string/encoding/ascii.h
trunk/src/string/encoding/binary.c
trunk/src/string/encoding/binary.h
- copied, changed from r48832, trunk/src/string/charset/binary.h
trunk/src/string/encoding/latin1.c
trunk/src/string/encoding/latin1.h
trunk/src/string/encoding/shared.c
- copied, changed from r48832, trunk/src/string/charset/unicode.c
trunk/src/string/encoding/shared.h
trunk/src/string/encoding/tables.c
- copied, changed from r48832, trunk/src/string/charset/tables.c
trunk/src/string/encoding/tables.h
- copied, changed from r48832, trunk/src/string/charset/tables.h
Deleted:
trunk/config/inter/charset.pm
trunk/config/inter/encoding.pm
trunk/include/parrot/charset.h
trunk/src/string/charset.c
trunk/src/string/charset/ascii.c
trunk/src/string/charset/ascii.h
trunk/src/string/charset/binary.c
trunk/src/string/charset/binary.h
trunk/src/string/charset/iso-8859-1.c
trunk/src/string/charset/iso-8859-1.h
trunk/src/string/charset/tables.c
trunk/src/string/charset/tables.h
trunk/src/string/charset/unicode.c
trunk/src/string/charset/unicode.h
trunk/src/string/encoding/fixed_8.c
trunk/src/string/encoding/fixed_8.h
Modified:
trunk/Configure.pl
trunk/MANIFEST
trunk/compilers/imcc/pbc.c
trunk/config/gen/makefiles/root.in
trunk/examples/config/file/configcompiler
trunk/examples/config/file/configwithfatalstep
trunk/include/parrot/encoding.h
trunk/include/parrot/parrot.h
trunk/include/parrot/pobj.h
trunk/include/parrot/string.h
trunk/include/parrot/string_funcs.h
trunk/lib/Parrot/Configure/Step/List.pm
trunk/src/dynext.c
trunk/src/dynpmc/Defines.in
trunk/src/global_setup.c
trunk/src/hash.c
trunk/src/io/buffer.c
trunk/src/io/utf8.c
trunk/src/library.c
trunk/src/ops/core_ops.c
trunk/src/ops/string.ops
trunk/src/packdump.c
trunk/src/packfile.c
trunk/src/packfile/pf_items.c
trunk/src/packout.c
trunk/src/pmc/bytebuffer.pmc
trunk/src/pmc/packfile.pmc
trunk/src/pmc/packfilefixupentry.pmc
trunk/src/pmc/string.pmc
trunk/src/pmc/stringbuilder.pmc
trunk/src/pmc/stringiterator.pmc
trunk/src/spf_vtable.c
trunk/src/string/api.c
trunk/src/string/encoding.c
trunk/src/string/encoding/ucs2.c
trunk/src/string/encoding/ucs2.h
trunk/src/string/encoding/ucs4.c
trunk/src/string/encoding/ucs4.h
trunk/src/string/encoding/utf16.c
trunk/src/string/encoding/utf16.h
trunk/src/string/encoding/utf8.c
trunk/src/string/encoding/utf8.h
trunk/src/string/primitives.c
trunk/t/op/string_cs.t
trunk/t/pmc/bytebuffer.t
trunk/t/pmc/filehandle.t
trunk/t/pmc/io.t
trunk/tools/dev/gen_charset_tables.pl
Modified: trunk/Configure.pl
==============================================================================
--- trunk/Configure.pl Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/Configure.pl Tue Sep 7 22:58:38 2010 (r48833)
@@ -604,8 +604,6 @@
init::optimize
inter::shlibs
inter::libparrot
- inter::charset
- inter::encoding
inter::types
auto::ops
auto::alignptrs
Modified: trunk/MANIFEST
==============================================================================
--- trunk/MANIFEST Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/MANIFEST Tue Sep 7 22:58:38 2010 (r48833)
@@ -348,8 +348,6 @@
config/init/install.pm []
config/init/manifest.pm []
config/init/optimize.pm []
-config/inter/charset.pm []
-config/inter/encoding.pm []
config/inter/lex.pm []
config/inter/libparrot.pm []
config/inter/make.pm []
@@ -947,7 +945,6 @@
include/parrot/caches.h [main]include
include/parrot/call.h [main]include
include/parrot/cclass.h [main]include
-include/parrot/charset.h [main]include
include/parrot/compiler.h [main]include
include/parrot/context.h [main]include
include/parrot/core_types.h [main]include
@@ -1444,20 +1441,17 @@
src/spf_render.c []
src/spf_vtable.c []
src/string/api.c []
-src/string/charset.c []
-src/string/charset/ascii.c []
-src/string/charset/ascii.h []
-src/string/charset/binary.c []
-src/string/charset/binary.h []
-src/string/charset/iso-8859-1.c []
-src/string/charset/iso-8859-1.h []
-src/string/charset/tables.c []
-src/string/charset/tables.h []
-src/string/charset/unicode.c []
-src/string/charset/unicode.h []
src/string/encoding.c []
-src/string/encoding/fixed_8.c []
-src/string/encoding/fixed_8.h []
+src/string/encoding/ascii.c []
+src/string/encoding/ascii.h []
+src/string/encoding/binary.c []
+src/string/encoding/binary.h []
+src/string/encoding/latin1.c []
+src/string/encoding/latin1.h []
+src/string/encoding/shared.c []
+src/string/encoding/shared.h []
+src/string/encoding/tables.c []
+src/string/encoding/tables.h []
src/string/encoding/ucs2.c []
src/string/encoding/ucs2.h []
src/string/encoding/ucs4.c []
Modified: trunk/compilers/imcc/pbc.c
==============================================================================
--- trunk/compilers/imcc/pbc.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/compilers/imcc/pbc.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -973,9 +973,8 @@
* get first part as charset, rest as string
*/
STRING *s;
- const CHARSET *s_charset;
- const ENCODING *s_encoding = NULL;
- const ENCODING *src_encoding;
+ const STR_VTABLE *s_encoding;
+ const STR_VTABLE *src_encoding;
#define MAX_NAME 31
char charset_name[MAX_NAME + 1];
char encoding_name[MAX_NAME + 1];
@@ -983,38 +982,38 @@
char * p2 = strchr(r->name, ':');
PARROT_ASSERT(p && p[-1] == ':');
if (p2 < p -1) {
+ /* Handle the old 'encoding:charset' format by trying
+ * encoding as well as charset */
strncpy(encoding_name, buf, p2 - buf);
encoding_name[p2-buf] = '\0';
strncpy(charset_name, p2 +1, p - p2 - 2);
charset_name[p- p2 - 2] = '\0';
/*fprintf(stderr, "%s:%s\n", charset_name, encoding_name);*/
- s_charset = Parrot_find_charset(interp, charset_name);
- if (s_charset == NULL)
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_INVALID_STRING_REPRESENTATION,
- "Unknown charset '%s'", charset_name);
s_encoding = Parrot_find_encoding(interp, encoding_name);
- if (s_encoding == NULL)
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_INVALID_STRING_REPRESENTATION,
- "Unknown encoding '%s'", encoding_name);
+ if (s_encoding == NULL) {
+ s_encoding = Parrot_find_encoding(interp, charset_name);
+ if (s_encoding == NULL)
+ Parrot_ex_throw_from_c_args(interp, NULL,
+ EXCEPTION_INVALID_STRING_REPRESENTATION,
+ "Unknown encoding '%s:%s'",
+ encoding_name, charset_name);
+ }
}
else {
- strncpy(charset_name, buf, p - buf - 1);
- charset_name[p - buf - 1] = '\0';
- /*fprintf(stderr, "%s\n", charset_name);*/
- s_charset = Parrot_find_charset(interp, charset_name);
- if (s_charset == NULL)
+ strncpy(encoding_name, buf, p - buf - 1);
+ encoding_name[p - buf - 1] = '\0';
+ charset_name[0] = '\0';
+ /*fprintf(stderr, "%s\n", encoding_name);*/
+ s_encoding = Parrot_find_encoding(interp, encoding_name);
+ if (s_encoding == NULL)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_STRING_REPRESENTATION,
- "Unknown charset '%s'", charset_name);
+ "Unknown encoding '%s'", encoding_name);
}
- if (strcmp(charset_name, "unicode") == 0)
- src_encoding = Parrot_utf8_encoding_ptr;
+ if (s_encoding->max_bytes_per_codepoint == 1)
+ src_encoding = Parrot_ascii_encoding_ptr;
else
- src_encoding = Parrot_fixed_8_encoding_ptr;
- if (s_encoding == NULL)
- s_encoding = src_encoding;
+ src_encoding = Parrot_utf8_encoding_ptr;
/* past delim */
buf = p + 1;
@@ -1032,10 +1031,10 @@
}
{
STRING * aux = Parrot_str_new_init(interp, buf, p - buf,
- src_encoding, s_charset, 0);
+ src_encoding, 0);
s = Parrot_str_unescape_string(interp, aux,
- s_charset, s_encoding, PObj_constant_FLAG);
- if (!CHARSET_VALIDATE(interp, s))
+ s_encoding, PObj_constant_FLAG);
+ if (!STRING_validate(interp, s))
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_STRING_REPRESENTATION,
"Malformed string");
@@ -1882,7 +1881,7 @@
char *src, *chr, *start;
int base;
- if (s->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(s) != 1)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"unhandled string encoding in FixedIntegerArray initialization");
Modified: trunk/config/gen/makefiles/root.in
==============================================================================
--- trunk/config/gen/makefiles/root.in Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/config/gen/makefiles/root.in Tue Sep 7 22:58:38 2010 (r48833)
@@ -368,7 +368,6 @@
$(INC_DIR)/interpreter.h \
$(INC_DIR)/datatypes.h \
$(INC_DIR)/encoding.h \
- $(INC_DIR)/charset.h \
$(INC_DIR)/string.h \
$(INC_DIR)/string_primitives.h \
$(INC_DIR)/hash.h \
@@ -417,11 +416,20 @@
GENERAL_H_FILES = $(NONGEN_HEADERS) $(GEN_HEADERS) \
-CHARSET_O_FILES = @TEMP_charset_o@
CLASS_PMC_FILES = @TEMP_pmc_classes_pmc@
CLASS_O_FILES = @TEMP_pmc_classes_o@
CLASS_STR_FILES = @TEMP_pmc_classes_str@
-ENCODING_O_FILES = @TEMP_encoding_o@
+
+ENCODING_O_FILES = \
+ src/string/encoding/shared$(O) \
+ src/string/encoding/tables$(O) \
+ src/string/encoding/ascii$(O) \
+ src/string/encoding/latin1$(O) \
+ src/string/encoding/binary$(O) \
+ src/string/encoding/utf8$(O) \
+ src/string/encoding/utf16$(O) \
+ src/string/encoding/ucs2$(O) \
+ src/string/encoding/ucs4$(O)
IO_O_FILES = \
src/io/core$(O) \
@@ -440,7 +448,6 @@
src/string/api$(O) \
src/ops/core_ops$(O) \
#IF(i386_has_gcc_cmpxchg): src/atomic/gcc_x86$(O) \
- src/string/charset$(O) \
src/core_pmcs$(O) \
src/datatypes$(O) \
src/debug$(O) \
@@ -934,7 +941,6 @@
O_FILES = \
$(INTERP_O_FILES) \
- $(CHARSET_O_FILES) \
$(IO_O_FILES) \
$(CLASS_O_FILES) \
$(ENCODING_O_FILES) \
@@ -1511,18 +1517,40 @@
src/spf_vtable$(O) : $(PARROT_H_HEADERS) src/spf_vtable.str src/spf_vtable.c
-src/string/encoding$(O) : $(PARROT_H_HEADERS) src/string/encoding.c
+src/string/encoding$(O) : $(PARROT_H_HEADERS)
-src/string/charset$(O) : $(PARROT_H_HEADERS) src/string/charset.c \
- src/string/encoding/fixed_8.h \
- src/string/encoding/utf8.h \
- src/string/encoding/utf16.h \
- src/string/encoding/ucs2.h \
- src/string/encoding/ucs4.h \
- src/string/charset/ascii.h \
- src/string/charset/binary.h \
- src/string/charset/iso-8859-1.h \
- src/string/charset/unicode.h
+src/string/encoding/tables$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/tables.h
+src/string/encoding/shared$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/shared.h \
+ src/string/encoding/tables.h
+src/string/encoding/ascii$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/ascii.h \
+ src/string/encoding/shared.h \
+ src/string/encoding/tables.h
+src/string/encoding/latin1$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/latin1.h \
+ src/string/encoding/shared.h \
+ src/string/encoding/tables.h
+src/string/encoding/binary$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/binary.h \
+ src/string/encoding/shared.h
+src/string/encoding/utf8$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/utf8.h \
+ src/string/encoding/shared.h \
+ src/string/unicode.h
+src/string/encoding/utf16$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/utf16.h \
+ src/string/encoding/shared.h \
+ src/string/unicode.h
+src/string/encoding/ucs2$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/ucs2.h \
+ src/string/encoding/shared.h \
+ src/string/unicode.h
+src/string/encoding/ucs4$(O) : $(PARROT_H_HEADERS) \
+ src/string/encoding/ucs4.h \
+ src/string/encoding/shared.h \
+ src/string/unicode.h
src/pbc_merge$(O) : $(INC_DIR)/embed.h src/pbc_merge.c \
include/pmc/pmc_sub.h $(INC_DIR)/oplib/ops.h $(PARROT_H_HEADERS)
@@ -1553,10 +1581,6 @@
@TEMP_pmc_build@
-@TEMP_charset_build@
-
-@TEMP_encoding_build@
-
# $(CONFIGURE_GENERATED_FILES) : Configure.pl
# $(PERL) Configure.pl
@@ -2276,7 +2300,6 @@
src/pmc \
src/runcore \
src/string \
- src/string/charset \
src/string/encoding \
$(BUILD_DIR) \
$(BUILD_DIR)/t/perl \
Deleted: trunk/config/inter/charset.pm
==============================================================================
--- trunk/config/inter/charset.pm Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,96 +0,0 @@
-# Copyright (C) 2001-2003, Parrot Foundation.
-# $Id$
-
-=head1 NAME
-
-config/inter/charset.pm - charset files
-
-=head1 DESCRIPTION
-
-Asks the user to select which charset files to include.
-
-=cut
-
-package inter::charset;
-
-use strict;
-use warnings;
-
-use File::Basename qw/basename/;
-
-use base qw(Parrot::Configure::Step);
-
-use Parrot::Configure::Utils ':inter';
-
-
-sub _init {
- my $self = shift;
- my %data;
- $data{description} = q{Which charset files should be compiled in};
- $data{result} = q{};
- return \%data;
-}
-
-my @charsets_defaults =
- defined( $ENV{TEST_CHARSET} )
- ? $ENV{TEST_CHARSET}
- : sort map { basename($_) } glob "./src/string/charset/*.c";
-
-sub runstep {
- my ( $self, $conf ) = @_;
-
- my @charset = @charsets_defaults;
-
- my $charset_list = join ( ' ', grep { defined $_ } @charset );
-
- if ( $conf->options->get('ask') ) {
- print <<"END";
-
-
-The following charsets are available:
- @charset
-END
- $charset_list = prompt(
- 'Which charsets would you like?',
- $charset_list
- );
- }
-
- # names of class files for src/pmc/Makefile
- ( my $TEMP_charset_o = $charset_list ) =~ s/\.c/\$(O)/g;
-
- my $TEMP_charset_build = <<"E_NOTE";
-
-# the following part of the Makefile was built by 'config/inter/charset.pm'
-
-E_NOTE
-
- foreach my $charset ( split( /\s+/, $charset_list ) ) {
- $charset =~ s/\.c$//;
- $TEMP_charset_build .= <<END
-src/string/charset/$charset\$(O): src/string/charset/$charset.h src/string/charset/ascii.h src/string/charset/$charset.c src/string/charset/tables.h \$(NONGEN_HEADERS)
-
-
-END
- }
-
- # build list of libraries for link line in Makefile
- $TEMP_charset_o =~ s{^| }{ src/string/charset/}g;
-
- $conf->data->set(
- charset => $charset_list,
- TEMP_charset_o => $TEMP_charset_o,
- TEMP_charset_build => $TEMP_charset_build,
- );
-
- return 1;
-}
-
-1;
-
-# Local Variables:
-# mode: cperl
-# cperl-indent-level: 4
-# fill-column: 100
-# End:
-# vim: expandtab shiftwidth=4:
Deleted: trunk/config/inter/encoding.pm
==============================================================================
--- trunk/config/inter/encoding.pm Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,96 +0,0 @@
-# Copyright (C) 2001-2003, Parrot Foundation.
-# $Id$
-
-=head1 NAME
-
-config/inter/encoding.pm - encoding files
-
-=head1 DESCRIPTION
-
-Asks the user to select which encoding files to include.
-
-=cut
-
-package inter::encoding;
-
-use strict;
-use warnings;
-
-use base qw(Parrot::Configure::Step);
-
-use File::Basename qw/basename/;
-
-use Parrot::Configure::Utils ':inter';
-
-
-sub _init {
- my $self = shift;
- my %data;
- $data{description} = q{Which encoding files should be compiled in};
- $data{result} = q{};
- return \%data;
-}
-
-my @encodings_defaults =
- defined( $ENV{TEST_ENCODING} )
- ? $ENV{TEST_ENCODING}
- : sort map { basename($_) } glob "./src/string/encoding/*.c";
-
-sub runstep {
- my ( $self, $conf ) = @_;
-
- my @encodings = @encodings_defaults;
-
- my $encoding_list = join( ' ', grep { defined $_ } @encodings );
-
- if ( $conf->options->get('ask') ) {
- print <<"END";
-
-
-The following encodings are available:
- @encodings
-END
- $encoding_list = prompt(
- 'Which encodings would you like?',
- $encoding_list
- );
- }
-
- # names of class files for src/pmc/Makefile
- ( my $TEMP_encoding_o = $encoding_list ) =~ s/\.c/\$(O)/g;
-
- my $TEMP_encoding_build = <<"E_NOTE";
-
-# the following part of the Makefile was built by 'config/inter/encoding.pm'
-
-E_NOTE
-
- foreach my $encoding ( split( /\s+/, $encoding_list ) ) {
- $encoding =~ s/\.c$//;
- $TEMP_encoding_build .= <<END
-src/string/encoding/$encoding\$(O): src/string/encoding/$encoding.h src/string/encoding/$encoding.c src/string/unicode.h \$(NONGEN_HEADERS)
-
-
-END
- }
-
- # build list of libraries for link line in Makefile
- $TEMP_encoding_o =~ s{^| }{ src/string/encoding/}g;
-
- $conf->data->set(
- encoding => $encoding_list,
- TEMP_encoding_o => $TEMP_encoding_o,
- TEMP_encoding_build => $TEMP_encoding_build,
- );
-
- return 1;
-}
-
-1;
-
-# Local Variables:
-# mode: cperl
-# cperl-indent-level: 4
-# fill-column: 100
-# End:
-# vim: expandtab shiftwidth=4:
Modified: trunk/examples/config/file/configcompiler
==============================================================================
--- trunk/examples/config/file/configcompiler Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/examples/config/file/configcompiler Tue Sep 7 22:58:38 2010 (r48833)
@@ -34,8 +34,6 @@
init::optimize
inter::shlibs
inter::libparrot
-inter::charset
-inter::encoding
inter::types
auto::ops
auto::pmc
Modified: trunk/examples/config/file/configwithfatalstep
==============================================================================
--- trunk/examples/config/file/configwithfatalstep Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/examples/config/file/configwithfatalstep Tue Sep 7 22:58:38 2010 (r48833)
@@ -26,8 +26,6 @@
init::optimize
inter::shlibs
inter::libparrot
-inter::charset
-inter::encoding
inter::types
auto::ops
auto::pmc
Deleted: trunk/include/parrot/charset.h
==============================================================================
--- trunk/include/parrot/charset.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,277 +0,0 @@
-/* charset.h
- * Copyright (C) 2004-2010, Parrot Foundation.
- * SVN Info
- * $Id$
- * Overview:
- * This is the header for the 8-bit fixed-width encoding
- */
-
-#ifndef PARROT_CHARSET_H_GUARD
-#define PARROT_CHARSET_H_GUARD
-
-
-#include "parrot/encoding.h"
-#include "parrot/cclass.h"
-
-struct _charset;
-typedef struct _charset CHARSET;
-
-
-#if !defined PARROT_NO_EXTERN_CHARSET_PTRS
-PARROT_DATA CHARSET *Parrot_iso_8859_1_charset_ptr;
-PARROT_DATA CHARSET *Parrot_binary_charset_ptr;
-PARROT_DATA CHARSET *Parrot_default_charset_ptr;
-PARROT_DATA CHARSET *Parrot_unicode_charset_ptr;
-PARROT_DATA CHARSET *Parrot_ascii_charset_ptr;
-#endif
-
-#define PARROT_DEFAULT_CHARSET Parrot_ascii_charset_ptr
-#define PARROT_BINARY_CHARSET Parrot_binary_charset_ptr
-#define PARROT_UNICODE_CHARSET Parrot_unicode_charset_ptr
-
-typedef STRING * (*charset_get_graphemes_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
-typedef STRING * (*charset_to_charset_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_from_unicode_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_compose_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_decompose_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_upcase_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_downcase_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_titlecase_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_upcase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_downcase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef STRING * (*charset_titlecase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef INTVAL (*charset_compare_t)(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs));
-typedef INTVAL (*charset_index_t)(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search_string), UINTVAL offset);
-typedef INTVAL (*charset_rindex_t)(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search_string), UINTVAL offset);
-typedef UINTVAL (*charset_validate_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef INTVAL (*charset_is_cclass_t)(PARROT_INTERP, INTVAL, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_cclass_t)(PARROT_INTERP, INTVAL, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
-typedef INTVAL (*charset_find_not_cclass_t)(PARROT_INTERP, INTVAL, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
-typedef INTVAL (*charset_is_wordchar_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_wordchar_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_not_wordchar_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_is_whitespace_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_whitespace_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_not_whitespace_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_is_digit_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_digit_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_not_digit_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_is_punctuation_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_punctuation_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_not_punctuation_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_is_newline_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_newline_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_not_newline_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef INTVAL (*charset_find_word_boundary_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef STRING * (*charset_string_from_codepoint_t)(PARROT_INTERP, UINTVAL codepoint);
-typedef size_t (*charset_compute_hash_t)(PARROT_INTERP, ARGIN(const STRING *src), size_t seed);
-
-typedef STRING * (*charset_converter_t)(PARROT_INTERP, ARGIN(const STRING *src));
-
-/* HEADERIZER BEGIN: src/string/charset.c */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const char * Parrot_charset_c_name(SHIM_INTERP, INTVAL number_of_charset);
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-STRING * Parrot_charset_name(SHIM_INTERP, INTVAL number_of_charset);
-
-PARROT_EXPORT
-PARROT_WARN_UNUSED_RESULT
-INTVAL Parrot_charset_number(PARROT_INTERP,
- ARGIN(const STRING *charsetname))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_WARN_UNUSED_RESULT
-INTVAL Parrot_charset_number_of_str(SHIM_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(2);
-
-PARROT_EXPORT
-void Parrot_charsets_encodings_deinit(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-PARROT_EXPORT
-void Parrot_charsets_encodings_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_WARN_UNUSED_RESULT
-PARROT_CAN_RETURN_NULL
-const CHARSET * Parrot_default_charset(SHIM_INTERP);
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const CHARSET * Parrot_find_charset(SHIM_INTERP,
- ARGIN(const char *charsetname))
- __attribute__nonnull__(2);
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_WARN_UNUSED_RESULT
-PARROT_CAN_RETURN_NULL
-charset_converter_t Parrot_find_charset_converter(SHIM_INTERP,
- ARGIN(const CHARSET *lhs),
- ARGIN(const CHARSET *rhs))
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const CHARSET * Parrot_get_charset(SHIM_INTERP, INTVAL number_of_charset);
-
-PARROT_EXPORT
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const CHARSET * Parrot_load_charset(PARROT_INTERP,
- ARGIN(const char *charsetname))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_EXPORT
-INTVAL Parrot_make_default_charset(SHIM_INTERP,
- SHIM(const char *charsetname),
- ARGIN(const CHARSET *charset))
- __attribute__nonnull__(3);
-
-PARROT_EXPORT
-PARROT_CANNOT_RETURN_NULL
-PARROT_MALLOC
-CHARSET * Parrot_new_charset(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-PARROT_EXPORT
-INTVAL Parrot_register_charset(PARROT_INTERP,
- ARGIN(const char *charsetname),
- ARGIN(CHARSET *charset))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_EXPORT
-void Parrot_register_charset_converter(PARROT_INTERP,
- ARGIN(const CHARSET *lhs),
- ARGIN(const CHARSET *rhs),
- ARGIN(charset_converter_t func))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- __attribute__nonnull__(4);
-
-#define ASSERT_ARGS_Parrot_charset_c_name __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_Parrot_charset_name __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_Parrot_charset_number __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(charsetname))
-#define ASSERT_ARGS_Parrot_charset_number_of_str __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_Parrot_charsets_encodings_deinit \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_Parrot_charsets_encodings_init \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_Parrot_default_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_Parrot_find_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(charsetname))
-#define ASSERT_ARGS_Parrot_find_charset_converter __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs))
-#define ASSERT_ARGS_Parrot_get_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_Parrot_load_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(charsetname))
-#define ASSERT_ARGS_Parrot_make_default_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(charset))
-#define ASSERT_ARGS_Parrot_new_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_Parrot_register_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(charsetname) \
- , PARROT_ASSERT_ARG(charset))
-#define ASSERT_ARGS_Parrot_register_charset_converter \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs) \
- , PARROT_ASSERT_ARG(func))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/charset.c */
-
-struct _charset {
- const char *name;
- charset_get_graphemes_t get_graphemes;
- charset_to_charset_t to_charset;
- charset_compose_t compose;
- charset_decompose_t decompose;
- charset_upcase_t upcase;
- charset_downcase_t downcase;
- charset_titlecase_t titlecase;
- charset_upcase_first_t upcase_first;
- charset_downcase_first_t downcase_first;
- charset_titlecase_first_t titlecase_first;
- charset_compare_t compare;
- charset_index_t index;
- charset_rindex_t rindex;
- charset_validate_t validate;
- charset_is_cclass_t is_cclass;
- charset_find_cclass_t find_cclass;
- charset_find_not_cclass_t find_not_cclass;
- charset_string_from_codepoint_t string_from_codepoint;
- charset_compute_hash_t compute_hash;
- const ENCODING *preferred_encoding;
-};
-
-#define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((source)->charset)->get_graphemes((interp), (source), (offset), (count))
-#define CHARSET_TO_UNICODE(interp, source, dest) ((source)->charset)->to_unicode((interp), (source), (dest))
-#define CHARSET_COMPOSE(interp, source) ((source)->charset)->compose((interp), (source))
-#define CHARSET_DECOMPOSE(interp, source) ((source)->charset)->decompose((interp), (source))
-#define CHARSET_UPCASE(interp, source) ((source)->charset)->upcase((interp), (source))
-#define CHARSET_DOWNCASE(interp, source) ((source)->charset)->downcase((interp), (source))
-#define CHARSET_TITLECASE(interp, source) ((source)->charset)->titlecase((interp), (source))
-#define CHARSET_UPCASE_FIRST(interp, source) ((source)->charset)->upcase_first((interp), (source))
-#define CHARSET_DOWNCASE_FIRST(interp, source) ((source)->charset)->downcase_first((interp), (source))
-#define CHARSET_TITLECASE_FIRST(interp, source) ((source)->charset)->titlecase_first((interp), (source))
-#define CHARSET_COMPARE(interp, lhs, rhs) ((const CHARSET *)(lhs)->charset)->compare((interp), (lhs), (rhs))
-#define CHARSET_INDEX(interp, source, search, offset) ((source)->charset)->index((interp), (source), (search), (offset))
-#define CHARSET_RINDEX(interp, source, search, offset) ((source)->charset)->rindex((interp), (source), (search), (offset))
-#define CHARSET_VALIDATE(interp, source) ((source)->charset)->validate((interp), (source))
-#define CHARSET_IS_CCLASS(interp, flags, source, offset) ((source)->charset)->is_cclass((interp), (flags), (source), (offset))
-#define CHARSET_FIND_CCLASS(interp, flags, source, offset, count) ((source)->charset)->find_cclass((interp), (flags), (source), (offset), (count))
-#define CHARSET_FIND_NOT_CCLASS(interp, flags, source, offset, count) ((source)->charset)->find_not_cclass((interp), (flags), (source), (offset), (count))
-#define CHARSET_COMPUTE_HASH(interp, source, seed) ((source)->charset)->compute_hash((interp), (source), (seed))
-#define CHARSET_GET_PREFERRED_ENCODING(interp, source) ((source)->charset)->preferred_encoding
-
-#define CHARSET_TO_ENCODING(interp, source) ((source)->encoding)->to_encoding((interp), (source))
-#define CHARSET_COPY_TO_ENCODING(interp, source) ((source)->encoding)->copy_to_encoding((interp), (source))
-#define CHARSET_GET_CODEPOINT(interp, source, offset) ((source)->encoding)->get_codepoint((interp), (source), (offset))
-#define CHARSET_GET_BYTE(interp, source, offset) ((source)->encoding)->get_byte((interp), (source), (offset))
-#define CHARSET_SET_BYTE(interp, source, offset, value) ((source)->encoding)->set_byte((interp), (source), (offset), (value))
-#define CHARSET_GET_CODEPOINTS(interp, source, offset, count) ((source)->encoding)->get_codepoints((interp), (source), (offset), (count))
-#define CHARSET_GET_BYTES(interp, source, offset, count) ((source)->encoding)->get_bytes((interp), (source), (offset), (count))
-#define CHARSET_CODEPOINTS(interp, source) ((source)->encoding)->codepoints((interp), (source))
-#define CHARSET_BYTES(interp, source) ((source)->encoding)->bytes((interp), (source))
-
-
-#endif /* PARROT_CHARSET_H_GUARD */
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/include/parrot/encoding.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -15,68 +15,15 @@
#include "parrot/parrot.h"
-typedef STRING * (*encoding_to_encoding_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef UINTVAL (*encoding_get_codepoint_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef UINTVAL (*encoding_get_byte_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
-typedef void (*encoding_set_byte_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
-typedef STRING * (*encoding_get_codepoints_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
-typedef STRING * (*encoding_get_bytes_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
-typedef UINTVAL (*encoding_codepoints_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef UINTVAL (*encoding_bytes_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef UINTVAL (*encoding_find_cclass_t)(PARROT_INTERP, ARGIN(const STRING *s), ARGIN(const INTVAL *typetable), INTVAL flags, UINTVAL offset, UINTVAL count);
-typedef size_t (*encoding_hash_t)(PARROT_INTERP, ARGIN(const STRING *s), size_t hashval);
-
-/* iterator support */
-
-struct string_iterator_t; /* s. parrot/string.h */
-
-typedef UINTVAL (*encoding_iter_get_t)(
- PARROT_INTERP, const STRING *str, const String_iter *i, INTVAL offset);
-typedef void (*encoding_iter_skip_t)(
- PARROT_INTERP, const STRING *str, String_iter *i, INTVAL skip);
-typedef UINTVAL (*encoding_iter_get_and_advance_t)(
- PARROT_INTERP, const STRING *str, String_iter *i);
-typedef void (*encoding_iter_set_and_advance_t)(
- PARROT_INTERP, STRING *str, String_iter *i, UINTVAL c);
-typedef void (*encoding_iter_set_position_t)(
- PARROT_INTERP, const STRING *str, String_iter *i, UINTVAL pos);
-
-struct _encoding {
- ARGIN(const char *name);
- UINTVAL max_bytes_per_codepoint;
- encoding_to_encoding_t to_encoding;
- encoding_get_codepoint_t get_codepoint;
- encoding_get_byte_t get_byte;
- encoding_set_byte_t set_byte;
- encoding_get_codepoints_t get_codepoints;
- encoding_get_bytes_t get_bytes;
- encoding_codepoints_t codepoints;
- encoding_bytes_t bytes;
- encoding_find_cclass_t find_cclass;
- encoding_hash_t hash;
- encoding_iter_get_t iter_get;
- encoding_iter_skip_t iter_skip;
- encoding_iter_get_and_advance_t iter_get_and_advance;
- encoding_iter_set_and_advance_t iter_set_and_advance;
- encoding_iter_set_position_t iter_set_position;
-};
-
-typedef struct _encoding ENCODING;
-
-#if !defined PARROT_NO_EXTERN_ENCODING_PTRS
-PARROT_DATA ENCODING *Parrot_fixed_8_encoding_ptr;
-PARROT_DATA ENCODING *Parrot_utf8_encoding_ptr;
-PARROT_DATA ENCODING *Parrot_utf16_encoding_ptr;
-PARROT_DATA ENCODING *Parrot_ucs2_encoding_ptr;
-PARROT_DATA ENCODING *Parrot_ucs4_encoding_ptr;
-PARROT_DATA ENCODING *Parrot_default_encoding_ptr;
-#endif
-
-#define PARROT_DEFAULT_ENCODING Parrot_fixed_8_encoding_ptr
-#define PARROT_FIXED_8_ENCODING Parrot_fixed_8_encoding_ptr
-#define PARROT_DEFAULT_FOR_UNICODE_ENCODING NULL
+PARROT_DATA STR_VTABLE *Parrot_ascii_encoding_ptr;
+PARROT_DATA STR_VTABLE *Parrot_latin1_encoding_ptr;
+PARROT_DATA STR_VTABLE *Parrot_binary_encoding_ptr;
+PARROT_DATA STR_VTABLE *Parrot_utf8_encoding_ptr;
+PARROT_DATA STR_VTABLE *Parrot_utf16_encoding_ptr;
+PARROT_DATA STR_VTABLE *Parrot_ucs2_encoding_ptr;
+PARROT_DATA STR_VTABLE *Parrot_ucs4_encoding_ptr;
-typedef INTVAL (*encoding_converter_t)(PARROT_INTERP, ENCODING *lhs, ENCODING *rhs);
+PARROT_DATA STR_VTABLE *Parrot_default_encoding_ptr;
/* HEADERIZER BEGIN: src/string/encoding.c */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
@@ -85,7 +32,7 @@
PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
-const ENCODING * Parrot_default_encoding(SHIM_INTERP);
+const STR_VTABLE * Parrot_default_encoding(SHIM_INTERP);
PARROT_EXPORT
PARROT_PURE_FUNCTION
@@ -114,32 +61,28 @@
__attribute__nonnull__(2);
PARROT_EXPORT
+void Parrot_encodings_init(PARROT_INTERP)
+ __attribute__nonnull__(1);
+
+PARROT_EXPORT
PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
PARROT_CAN_RETURN_NULL
-const ENCODING * Parrot_find_encoding(SHIM_INTERP,
+const STR_VTABLE * Parrot_find_encoding(SHIM_INTERP,
ARGIN(const char *encodingname))
__attribute__nonnull__(2);
PARROT_EXPORT
-PARROT_DOES_NOT_RETURN
-encoding_converter_t Parrot_find_encoding_converter(PARROT_INTERP,
- ARGIN(ENCODING *lhs),
- ARGIN(ENCODING *rhs))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_EXPORT
PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
PARROT_CAN_RETURN_NULL
-const ENCODING* Parrot_get_encoding(SHIM_INTERP, INTVAL number_of_encoding);
+const STR_VTABLE* Parrot_get_encoding(SHIM_INTERP,
+ INTVAL number_of_encoding);
PARROT_EXPORT
PARROT_DOES_NOT_RETURN
PARROT_CANNOT_RETURN_NULL
-const ENCODING * Parrot_load_encoding(PARROT_INTERP,
+const STR_VTABLE * Parrot_load_encoding(PARROT_INTERP,
ARGIN(const char *encodingname))
__attribute__nonnull__(1)
__attribute__nonnull__(2);
@@ -147,24 +90,21 @@
PARROT_EXPORT
INTVAL Parrot_make_default_encoding(SHIM_INTERP,
SHIM(const char *encodingname),
- ARGIN(ENCODING *encoding))
+ ARGIN(STR_VTABLE *encoding))
__attribute__nonnull__(3);
PARROT_EXPORT
PARROT_MALLOC
PARROT_CANNOT_RETURN_NULL
-ENCODING * Parrot_new_encoding(PARROT_INTERP)
+STR_VTABLE * Parrot_new_encoding(PARROT_INTERP)
__attribute__nonnull__(1);
PARROT_EXPORT
-INTVAL Parrot_register_encoding(PARROT_INTERP,
- ARGIN(const char *encodingname),
- ARGIN(ENCODING *encoding))
+INTVAL Parrot_register_encoding(PARROT_INTERP, ARGIN(STR_VTABLE *encoding))
__attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
+ __attribute__nonnull__(2);
-void parrot_deinit_encodings(PARROT_INTERP)
+void Parrot_deinit_encodings(PARROT_INTERP)
__attribute__nonnull__(1);
void Parrot_str_internal_register_encoding_names(PARROT_INTERP)
@@ -178,13 +118,10 @@
, PARROT_ASSERT_ARG(encodingname))
#define ASSERT_ARGS_Parrot_encoding_number_of_str __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_Parrot_encodings_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_find_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(encodingname))
-#define ASSERT_ARGS_Parrot_find_encoding_converter \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs))
#define ASSERT_ARGS_Parrot_get_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
#define ASSERT_ARGS_Parrot_load_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
@@ -195,9 +132,8 @@
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_register_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(encodingname) \
, PARROT_ASSERT_ARG(encoding))
-#define ASSERT_ARGS_parrot_deinit_encodings __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+#define ASSERT_ARGS_Parrot_deinit_encodings __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_str_internal_register_encoding_names \
__attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -205,27 +141,6 @@
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding.c */
-#define ENCODING_MAX_BYTES_PER_CODEPOINT(i, src) \
- ((src)->encoding)->max_bytes_per_codepoint
-#define ENCODING_GET_CODEPOINT(i, src, offset) \
- ((src)->encoding)->get_codepoint((i), (src), (offset))
-#define ENCODING_GET_BYTE(i, src, offset) \
- ((src)->encoding)->get_byte((i), (src), (offset))
-#define ENCODING_SET_BYTE(i, src, offset, value) \
- ((src)->encoding)->set_byte((i), (src), (offset), (value))
-#define ENCODING_GET_CODEPOINTS(i, src, offset, count) \
- ((src)->encoding)->get_codepoints((i), (src), (offset), (count))
-#define ENCODING_GET_BYTES(i, src, offset, count) \
- ((src)->encoding)->get_bytes((i), (src), (offset), (count))
-#define ENCODING_CODEPOINTS(i, src) \
- ((src)->encoding)->codepoints((i), (src))
-#define ENCODING_BYTES(i, src) \
- ((src)->encoding)->bytes((i), (src))
-#define ENCODING_FIND_CCLASS(i, src, typetable, flags, pos, end) \
- ((src)->encoding)->find_cclass((i), (src), (typetable), (flags), (pos), (end))
-#define ENCODING_HASH(i, src, seed) \
- ((src)->encoding)->hash((i), (src), (seed))
-
#endif /* PARROT_ENCODING_H_GUARD */
/*
Modified: trunk/include/parrot/parrot.h
==============================================================================
--- trunk/include/parrot/parrot.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/include/parrot/parrot.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -255,7 +255,6 @@
#include "parrot/interpreter.h"
#include "parrot/datatypes.h"
#include "parrot/encoding.h"
-#include "parrot/charset.h"
#include "parrot/string.h"
#include "parrot/string_primitives.h"
#include "parrot/hash.h"
Modified: trunk/include/parrot/pobj.h
==============================================================================
--- trunk/include/parrot/pobj.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/include/parrot/pobj.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -31,6 +31,11 @@
size_t _buflen; /* Length of buffer data. */
} Buffer;
+typedef enum Forward_flag {
+ Buffer_moved_FLAG = 1 << 0,
+ Buffer_shared_FLAG = 1 << 1
+} Forward_flags;
+
/* Use these macros to access the two buffer header slots. */
#define Buffer_bufstart(buffer) (buffer)->_bufstart
@@ -96,8 +101,7 @@
UINTVAL hashval; /* Cached hash value. */
/* parrot_string_representation_t representation;*/
- const struct _encoding *encoding; /* Pointer to encoding structure. */
- const struct _charset *charset; /* Pointer to charset structure. */
+ const struct _str_vtable *encoding; /* Pointer to string vtable. */
};
/* Here is the Parrot PMC object, "inheriting" from PObj. */
Modified: trunk/include/parrot/string.h
==============================================================================
--- trunk/include/parrot/string.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/include/parrot/string.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -18,38 +18,51 @@
#ifdef PARROT_IN_CORE
+#include "parrot/compiler.h"
#include "parrot/pobj.h"
+#include "parrot/cclass.h"
#include "parrot/parrot.h"
-typedef struct parrot_string_t STRING;
-
-typedef enum Forward_flag {
- Buffer_moved_FLAG = 1 << 0,
- Buffer_shared_FLAG = 1 << 1
-} Forward_flags;
-
-/* String iterator */
-typedef struct string_iterator_t {
- UINTVAL bytepos;
- UINTVAL charpos;
-} String_iter;
-
-#define STRING_ITER_INIT(i, iter) \
- (iter)->charpos = (iter)->bytepos = 0
-#define STRING_ITER_GET(i, str, iter, offset) \
- ((str)->encoding)->iter_get((i), (str), (iter), (offset))
-#define STRING_ITER_SKIP(i, str, iter, skip) \
- ((str)->encoding)->iter_skip((i), (str), (iter), (skip))
-#define STRING_ITER_GET_AND_ADVANCE(i, str, iter) \
- ((str)->encoding)->iter_get_and_advance((i), (str), (iter))
-#define STRING_ITER_SET_AND_ADVANCE(i, str, iter, c) \
- ((str)->encoding)->iter_set_and_advance((i), (str), (iter), (c))
-#define STRING_ITER_SET_POSITION(i, str, iter, pos) \
- ((str)->encoding)->iter_set_position((i), (str), (iter), (pos))
-
#define STREQ(x, y) (strcmp((x), (y))==0)
#define STRNEQ(x, y) (strcmp((x), (y))!=0)
+#define STRING_length(src) ((src) ? (src)->strlen : 0U)
+#define STRING_byte_length(src) ((src) ? (src)->bufused : 0U)
+#define STRING_max_bytes_per_codepoint(src) ((src)->encoding)->max_bytes_per_codepoint
+
+#define STRING_equal(interp, lhs, rhs) ((lhs)->encoding)->equal((interp), (lhs), (rhs))
+#define STRING_compare(interp, lhs, rhs) ((lhs)->encoding)->compare((interp), (lhs), (rhs))
+#define STRING_index(interp, src, search, offset) ((src)->encoding)->index((interp), (src), (search), (offset))
+#define STRING_rindex(interp, src, search, offset) ((src)->encoding)->rindex((interp), (src), (search), (offset))
+#define STRING_hash(i, src, seed) ((src)->encoding)->hash((i), (src), (seed))
+#define STRING_validate(interp, src) ((src)->encoding)->validate((interp), (src))
+
+#define STRING_scan(i, src) ((src)->encoding)->scan((i), (src))
+#define STRING_ord(i, src, offset) ((src)->encoding)->ord((i), (src), (offset))
+#define STRING_substr(i, src, offset, count) ((src)->encoding)->substr((i), (src), (offset), (count))
+
+#define STRING_is_cclass(interp, flags, src, offset) ((src)->encoding)->is_cclass((interp), (flags), (src), (offset))
+#define STRING_find_cclass(interp, flags, src, offset, count) ((src)->encoding)->find_cclass((interp), (flags), (src), (offset), (count))
+#define STRING_find_not_cclass(interp, flags, src, offset, count) ((src)->encoding)->find_not_cclass((interp), (flags), (src), (offset), (count))
+
+#define STRING_get_graphemes(interp, src, offset, count) ((src)->encoding)->get_graphemes((interp), (src), (offset), (count))
+#define STRING_compose(interp, src) ((src)->encoding)->compose((interp), (src))
+#define STRING_decompose(interp, src) ((src)->encoding)->decompose((interp), (src))
+
+#define STRING_upcase(interp, src) ((src)->encoding)->upcase((interp), (src))
+#define STRING_downcase(interp, src) ((src)->encoding)->downcase((interp), (src))
+#define STRING_titlecase(interp, src) ((src)->encoding)->titlecase((interp), (src))
+#define STRING_upcase_first(interp, src) ((src)->encoding)->upcase_first((interp), (src))
+#define STRING_downcase_first(interp, src) ((src)->encoding)->downcase_first((interp), (src))
+#define STRING_titlecase_first(interp, src) ((src)->encoding)->titlecase_first((interp), (src))
+
+#define STRING_ITER_INIT(i, iter) (iter)->charpos = (iter)->bytepos = 0
+#define STRING_iter_get(i, str, iter, offset) ((str)->encoding)->iter_get((i), (str), (iter), (offset))
+#define STRING_iter_skip(i, str, iter, skip) ((str)->encoding)->iter_skip((i), (str), (iter), (skip))
+#define STRING_iter_get_and_advance(i, str, iter) ((str)->encoding)->iter_get_and_advance((i), (str), (iter))
+#define STRING_iter_set_and_advance(i, str, iter, c) ((str)->encoding)->iter_set_and_advance((i), (str), (iter), (c))
+#define STRING_iter_set_position(i, str, iter, pos) ((str)->encoding)->iter_set_position((i), (str), (iter), (pos))
+
/* stringinfo parameters */
/* &gen_from_def(stringinfo.pasm) */
@@ -63,6 +76,98 @@
/* &end_gen */
+typedef struct parrot_string_t STRING;
+
+/* String iterator */
+typedef struct string_iterator_t {
+ UINTVAL bytepos;
+ UINTVAL charpos;
+} String_iter;
+
+/* constructors */
+typedef STRING * (*str_vtable_to_encoding_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_chr_t)(PARROT_INTERP, UINTVAL codepoint);
+
+typedef INTVAL (*str_vtable_equal_t)(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs));
+typedef INTVAL (*str_vtable_compare_t)(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs));
+typedef INTVAL (*str_vtable_index_t)(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search_string), UINTVAL offset);
+typedef INTVAL (*str_vtable_rindex_t)(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search_string), UINTVAL offset);
+typedef size_t (*str_vtable_hash_t)(PARROT_INTERP, ARGIN(const STRING *s), size_t hashval);
+typedef UINTVAL (*str_vtable_validate_t)(PARROT_INTERP, ARGIN(const STRING *src));
+
+typedef UINTVAL (*str_vtable_scan_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef UINTVAL (*str_vtable_ord_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset);
+typedef STRING * (*str_vtable_substr_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
+
+/* character classes */
+typedef INTVAL (*str_vtable_is_cclass_t)(PARROT_INTERP, INTVAL, ARGIN(const STRING *src), UINTVAL offset);
+typedef INTVAL (*str_vtable_find_cclass_t)(PARROT_INTERP, INTVAL, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
+typedef INTVAL (*str_vtable_find_not_cclass_t)(PARROT_INTERP, INTVAL, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
+
+/* graphemes */
+typedef STRING * (*str_vtable_get_graphemes_t)(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count);
+typedef STRING * (*str_vtable_compose_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_decompose_t)(PARROT_INTERP, ARGIN(const STRING *src));
+
+/* case conversion, TODO: move to single function with a flag */
+typedef STRING * (*str_vtable_upcase_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_downcase_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_titlecase_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_upcase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_downcase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
+typedef STRING * (*str_vtable_titlecase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
+
+/* iterator functions */
+typedef UINTVAL (*str_vtable_iter_get_t)(PARROT_INTERP, const STRING *str, const String_iter *i, INTVAL offset);
+typedef void (*str_vtable_iter_skip_t)(PARROT_INTERP, const STRING *str, String_iter *i, INTVAL skip);
+typedef UINTVAL (*str_vtable_iter_get_and_advance_t)(PARROT_INTERP, const STRING *str, String_iter *i);
+typedef void (*str_vtable_iter_set_and_advance_t)(PARROT_INTERP, STRING *str, String_iter *i, UINTVAL c);
+typedef void (*str_vtable_iter_set_position_t)(PARROT_INTERP, const STRING *str, String_iter *i, UINTVAL pos);
+
+struct _str_vtable {
+ int num;
+ const char *name;
+ STRING *name_str;
+ UINTVAL max_bytes_per_codepoint;
+
+ str_vtable_to_encoding_t to_encoding;
+ str_vtable_chr_t chr;
+
+ str_vtable_equal_t equal;
+ str_vtable_compare_t compare;
+ str_vtable_index_t index;
+ str_vtable_rindex_t rindex;
+ str_vtable_hash_t hash;
+ str_vtable_validate_t validate;
+
+ str_vtable_scan_t scan;
+ str_vtable_ord_t ord;
+ str_vtable_substr_t substr;
+
+ str_vtable_is_cclass_t is_cclass;
+ str_vtable_find_cclass_t find_cclass;
+ str_vtable_find_not_cclass_t find_not_cclass;
+
+ str_vtable_get_graphemes_t get_graphemes;
+ str_vtable_compose_t compose;
+ str_vtable_decompose_t decompose;
+
+ str_vtable_upcase_t upcase;
+ str_vtable_downcase_t downcase;
+ str_vtable_titlecase_t titlecase;
+ str_vtable_upcase_first_t upcase_first;
+ str_vtable_downcase_first_t downcase_first;
+ str_vtable_titlecase_first_t titlecase_first;
+
+ str_vtable_iter_get_t iter_get;
+ str_vtable_iter_skip_t iter_skip;
+ str_vtable_iter_get_and_advance_t iter_get_and_advance;
+ str_vtable_iter_set_and_advance_t iter_set_and_advance;
+ str_vtable_iter_set_position_t iter_set_position;
+};
+
+typedef struct _str_vtable STR_VTABLE;
+
#endif /* PARROT_IN_CORE */
#endif /* PARROT_STRING_H_GUARD */
Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/include/parrot/string_funcs.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -297,12 +297,10 @@
STRING * Parrot_str_new_init(PARROT_INTERP,
ARGIN_NULLOK(const char *buffer),
UINTVAL len,
- ARGIN(const ENCODING *encoding),
- ARGIN(const CHARSET *charset),
+ ARGIN(const STR_VTABLE *encoding),
UINTVAL flags)
__attribute__nonnull__(1)
- __attribute__nonnull__(4)
- __attribute__nonnull__(5);
+ __attribute__nonnull__(4);
PARROT_EXPORT
PARROT_CANNOT_RETURN_NULL
@@ -326,15 +324,12 @@
PARROT_EXPORT
PARROT_IGNORABLE_RESULT
PARROT_CAN_RETURN_NULL
-const CHARSET * Parrot_str_rep_compatible(PARROT_INTERP,
+const STR_VTABLE * Parrot_str_rep_compatible(PARROT_INTERP,
ARGIN(const STRING *a),
- ARGIN(const STRING *b),
- ARGOUT(const ENCODING **e))
+ ARGIN(const STRING *b))
__attribute__nonnull__(1)
__attribute__nonnull__(2)
- __attribute__nonnull__(3)
- __attribute__nonnull__(4)
- FUNC_MODIFIES(*e);
+ __attribute__nonnull__(3);
PARROT_EXPORT
PARROT_CANNOT_RETURN_NULL
@@ -415,13 +410,11 @@
PARROT_CANNOT_RETURN_NULL
STRING * Parrot_str_unescape_string(PARROT_INTERP,
ARGIN(const STRING *src),
- ARGIN(const CHARSET *charset),
- ARGIN(const ENCODING *encoding),
+ ARGIN(const STR_VTABLE *encoding),
UINTVAL flags)
__attribute__nonnull__(1)
__attribute__nonnull__(2)
- __attribute__nonnull__(3)
- __attribute__nonnull__(4);
+ __attribute__nonnull__(3);
PARROT_EXPORT
void Parrot_str_unpin(PARROT_INTERP, ARGMOD(STRING *s))
@@ -454,17 +447,7 @@
STRING * string_make(PARROT_INTERP,
ARGIN_NULLOK(const char *buffer),
UINTVAL len,
- ARGIN_NULLOK(const char *charset_name),
- UINTVAL flags)
- __attribute__nonnull__(1);
-
-PARROT_EXPORT
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-STRING * string_make_from_charset(PARROT_INTERP,
- ARGIN_NULLOK(const char *buffer),
- UINTVAL len,
- INTVAL charset_nr,
+ ARGIN_NULLOK(const char *encoding_name),
UINTVAL flags)
__attribute__nonnull__(1);
@@ -608,8 +591,7 @@
, PARROT_ASSERT_ARG(buffer))
#define ASSERT_ARGS_Parrot_str_new_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(encoding) \
- , PARROT_ASSERT_ARG(charset))
+ , PARROT_ASSERT_ARG(encoding))
#define ASSERT_ARGS_Parrot_str_new_noinit __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_str_not_equal __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -619,8 +601,7 @@
#define ASSERT_ARGS_Parrot_str_rep_compatible __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(a) \
- , PARROT_ASSERT_ARG(b) \
- , PARROT_ASSERT_ARG(e))
+ , PARROT_ASSERT_ARG(b))
#define ASSERT_ARGS_Parrot_str_repeat __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(s))
@@ -649,7 +630,6 @@
#define ASSERT_ARGS_Parrot_str_unescape_string __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(charset) \
, PARROT_ASSERT_ARG(encoding))
#define ASSERT_ARGS_Parrot_str_unpin __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
@@ -663,8 +643,6 @@
, PARROT_ASSERT_ARG(s))
#define ASSERT_ARGS_string_make __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_string_make_from_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_string_max_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(s))
#define ASSERT_ARGS_string_ord __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
Modified: trunk/lib/Parrot/Configure/Step/List.pm
==============================================================================
--- trunk/lib/Parrot/Configure/Step/List.pm Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/lib/Parrot/Configure/Step/List.pm Tue Sep 7 22:58:38 2010 (r48833)
@@ -28,8 +28,6 @@
init::optimize
inter::shlibs
inter::libparrot
- inter::charset
- inter::encoding
inter::types
auto::ops
auto::pmc
Modified: trunk/src/dynext.c
==============================================================================
--- trunk/src/dynext.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/dynext.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -546,7 +546,7 @@
char * const raw_str = Parrot_str_to_cstring(s, orig);
STRING * const ret =
Parrot_str_new_init(d, raw_str, strlen(raw_str),
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_constant_FLAG);
Parrot_str_free_cstring(raw_str);
return ret;
Modified: trunk/src/dynpmc/Defines.in
==============================================================================
--- trunk/src/dynpmc/Defines.in Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/dynpmc/Defines.in Tue Sep 7 22:58:38 2010 (r48833)
@@ -24,7 +24,6 @@
include/parrot/caches.h \
include/parrot/call.h \
include/parrot/cclass.h \
- include/parrot/charset.h \
include/parrot/compiler.h \
include/parrot/config.h \
include/parrot/context.h \
Modified: trunk/src/global_setup.c
==============================================================================
--- trunk/src/global_setup.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/global_setup.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -98,7 +98,7 @@
STRING * const config_string =
Parrot_str_new_init(interp,
(const char *)parrot_config_stored, parrot_config_size_stored,
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_external_FLAG|PObj_constant_FLAG);
config_hash = Parrot_thaw(interp, config_string);
Modified: trunk/src/hash.c
==============================================================================
--- trunk/src/hash.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/hash.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -165,7 +165,7 @@
if (s1->hashval != s2->hashval)
return 1;
- return CHARSET_COMPARE(interp, s1, s2);
+ return STRING_compare(interp, s1, s2);
}
@@ -188,11 +188,8 @@
STRING const *s1 = (STRING const *)search_key;
STRING const *s2 = (STRING const *)bucket_key;
- if (s1 && s2 && (
- s1->charset != s2->charset ||
- s1->encoding != s2->encoding)) {
+ if (s1 && s2 && s1->encoding != s2->encoding)
return 1;
- }
return hash_compare_string(interp, search_key, bucket_key);
}
@@ -1174,7 +1171,7 @@
if (s == s2
|| (hashval == s2->hashval
- && CHARSET_COMPARE(interp, s, s2) == 0))
+ && STRING_compare(interp, s, s2) == 0))
return bucket;
bucket = bucket->next;
@@ -1284,7 +1281,7 @@
if (s == s2
|| (hashval == s2->hashval
- && CHARSET_COMPARE(interp, s, s2) == 0))
+ && STRING_compare(interp, s, s2) == 0))
break;
bucket = bucket->next;
Modified: trunk/src/io/buffer.c
==============================================================================
--- trunk/src/io/buffer.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/io/buffer.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -232,7 +232,7 @@
char *buf = (char *) Parrot_io_get_buffer_start(interp, filehandle);
size_t size = Parrot_io_get_buffer_size(interp, filehandle);
STRING *s = Parrot_str_new_init(interp, buf, size,
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_external_FLAG);
size_t got = PIO_READ(interp, filehandle, &s);
@@ -338,7 +338,7 @@
if (len >= Parrot_io_get_buffer_size(interp, filehandle)) {
STRING *sf = Parrot_str_new_init(interp, (char *)out_buf, len,
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_external_FLAG);
got = PIO_READ(interp, filehandle, &sf);
s->strlen = s->bufused = current + got;
Modified: trunk/src/io/utf8.c
==============================================================================
--- trunk/src/io/utf8.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/io/utf8.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -53,7 +53,6 @@
size_t len = Parrot_io_read_buffer(interp, filehandle, buf);
s = *buf;
- s->charset = Parrot_unicode_charset_ptr;
s->encoding = Parrot_utf8_encoding_ptr;
/* count chars, verify utf8 */
@@ -74,8 +73,8 @@
/* need len - 1 more chars */
--len2;
- s2 = Parrot_str_new_init(interp, NULL, len2, Parrot_utf8_encoding_ptr,
- Parrot_unicode_charset_ptr, 0);
+ s2 = Parrot_str_new_init(interp, NULL, len2,
+ Parrot_utf8_encoding_ptr, 0);
s2->bufused = len2;
read = Parrot_io_read_buffer(interp, filehandle, &s2);
Modified: trunk/src/library.c
==============================================================================
--- trunk/src/library.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/library.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -345,7 +345,7 @@
const char * const file_name = (const char *)file->strstart;
if (file->strlen <= 1)
return 0;
- PARROT_ASSERT(file->encoding == Parrot_fixed_8_encoding_ptr ||
+ PARROT_ASSERT(STRING_max_bytes_per_codepoint(file) == 1 ||
file->encoding == Parrot_utf8_encoding_ptr);
/* XXX ../foo, ./bar */
@@ -887,23 +887,21 @@
/* This is a quick fix for TT #65
* TODO: redo it with the string reimplementation
*/
- const char * charset = Parrot_charset_c_name(interp,
- Parrot_charset_number_of_str(interp, in));
- STRING * const slash1 = string_make(interp, "/", 1, charset,
- PObj_external_FLAG|PObj_constant_FLAG);
- STRING * const slash2 = string_make(interp, "\\", 1, charset,
- PObj_external_FLAG|PObj_constant_FLAG);
- STRING * const dot = string_make(interp, ".", 1, charset,
- PObj_external_FLAG|PObj_constant_FLAG);
+ STRING * const slash1 = Parrot_str_new_init(interp, "/", 1,
+ in->encoding, PObj_external_FLAG|PObj_constant_FLAG);
+ STRING * const slash2 = Parrot_str_new_init(interp, "\\", 1,
+ in->encoding, PObj_external_FLAG|PObj_constant_FLAG);
+ STRING * const dot = Parrot_str_new_init(interp, ".", 1,
+ in->encoding, PObj_external_FLAG|PObj_constant_FLAG);
const INTVAL len = Parrot_str_byte_length(interp, in);
STRING *stem;
INTVAL pos_sl, pos_dot;
- pos_sl = CHARSET_RINDEX(interp, in, slash1, len);
+ pos_sl = STRING_rindex(interp, in, slash1, len);
if (pos_sl == -1)
- pos_sl = CHARSET_RINDEX(interp, in, slash2, len);
- pos_dot = CHARSET_RINDEX(interp, in, dot, len);
+ pos_sl = STRING_rindex(interp, in, slash2, len);
+ pos_dot = STRING_rindex(interp, in, dot, len);
/* ignore dot in directory name */
if (pos_dot != -1 && pos_dot < pos_sl)
Modified: trunk/src/ops/core_ops.c
==============================================================================
--- trunk/src/ops/core_ops.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/ops/core_ops.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -23103,35 +23103,35 @@
opcode_t *
Parrot_charset_i_s(opcode_t *cur_opcode, PARROT_INTERP) {
const Parrot_Context * const CUR_CTX = Parrot_pcc_get_context_struct(interp, interp->ctx);
- IREG(1) = Parrot_charset_number_of_str(interp, SREG(2));
+ IREG(1) = Parrot_encoding_number_of_str(interp, SREG(2));
return (opcode_t *)cur_opcode + 3;}
opcode_t *
Parrot_charset_i_sc(opcode_t *cur_opcode, PARROT_INTERP) {
const Parrot_Context * const CUR_CTX = Parrot_pcc_get_context_struct(interp, interp->ctx);
- IREG(1) = Parrot_charset_number_of_str(interp, CONST(2).u.string);
+ IREG(1) = Parrot_encoding_number_of_str(interp, CONST(2).u.string);
return (opcode_t *)cur_opcode + 3;}
opcode_t *
Parrot_charsetname_s_i(opcode_t *cur_opcode, PARROT_INTERP) {
const Parrot_Context * const CUR_CTX = Parrot_pcc_get_context_struct(interp, interp->ctx);
- SREG(1) = Parrot_charset_name(interp, IREG(2));
+ SREG(1) = Parrot_encoding_name(interp, IREG(2));
return (opcode_t *)cur_opcode + 3;}
opcode_t *
Parrot_charsetname_s_ic(opcode_t *cur_opcode, PARROT_INTERP) {
const Parrot_Context * const CUR_CTX = Parrot_pcc_get_context_struct(interp, interp->ctx);
- SREG(1) = Parrot_charset_name(interp, cur_opcode[2]);
+ SREG(1) = Parrot_encoding_name(interp, cur_opcode[2]);
return (opcode_t *)cur_opcode + 3;}
opcode_t *
Parrot_find_charset_i_s(opcode_t *cur_opcode, PARROT_INTERP) {
const Parrot_Context * const CUR_CTX = Parrot_pcc_get_context_struct(interp, interp->ctx);
- const INTVAL n = Parrot_charset_number(interp, SREG(2));
+ const INTVAL n = Parrot_encoding_number(interp, SREG(2));
if (n < 0) {
opcode_t *handler = Parrot_ex_throw_from_op_args(interp, NULL,
EXCEPTION_INVALID_CHARTYPE,
@@ -23144,7 +23144,7 @@
opcode_t *
Parrot_find_charset_i_sc(opcode_t *cur_opcode, PARROT_INTERP) {
const Parrot_Context * const CUR_CTX = Parrot_pcc_get_context_struct(interp, interp->ctx);
- const INTVAL n = Parrot_charset_number(interp, CONST(2).u.string);
+ const INTVAL n = Parrot_encoding_number(interp, CONST(2).u.string);
if (n < 0) {
opcode_t *handler = Parrot_ex_throw_from_op_args(interp, NULL,
EXCEPTION_INVALID_CHARTYPE,
Modified: trunk/src/ops/string.ops
==============================================================================
--- trunk/src/ops/string.ops Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/ops/string.ops Tue Sep 7 22:58:38 2010 (r48833)
@@ -474,15 +474,15 @@
=cut
op charset(out INT, in STR) :base_core {
- $1 = Parrot_charset_number_of_str(interp, $2);
+ $1 = Parrot_encoding_number_of_str(interp, $2);
}
op charsetname(out STR, in INT) :base_core {
- $1 = Parrot_charset_name(interp, $2);
+ $1 = Parrot_encoding_name(interp, $2);
}
op find_charset(out INT, in STR) :base_core {
- const INTVAL n = Parrot_charset_number(interp, $2);
+ const INTVAL n = Parrot_encoding_number(interp, $2);
if (n < 0) {
opcode_t *handler = Parrot_ex_throw_from_op_args(interp, NULL,
EXCEPTION_INVALID_CHARTYPE,
Modified: trunk/src/packdump.c
==============================================================================
--- trunk/src/packdump.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/packdump.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -169,8 +169,8 @@
case PFC_STRING:
Parrot_io_printf(interp, " [ 'PFC_STRING', {\n");
pobj_flag_dump(interp, (long)PObj_get_FLAGS(self->u.string));
- Parrot_io_printf(interp, " CHARSET => %ld,\n",
- self->u.string->charset);
+ Parrot_io_printf(interp, " ENCODING => %ld,\n",
+ self->u.string->encoding);
i = self->u.string->bufused;
Parrot_io_printf(interp, " SIZE => %ld,\n",
(long)i);
Modified: trunk/src/packfile.c
==============================================================================
--- trunk/src/packfile.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/packfile.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -3202,7 +3202,7 @@
/* Set up new entry and insert it. */
PackFile_DebugFilenameMapping *mapping = debug->mappings + insert_pos;
STRING *namestr = Parrot_str_new_init(interp, filename, strlen(filename),
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET, 0);
+ Parrot_default_encoding_ptr, 0);
size_t count = ct->const_count;
size_t i;
@@ -3228,7 +3228,7 @@
fnconst = &ct->constants[ct->const_count - 1];
fnconst->type = PFC_STRING;
fnconst->u.string = Parrot_str_new_init(interp, filename, strlen(filename),
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_constant_FLAG);
}
Modified: trunk/src/packfile/pf_items.c
==============================================================================
--- trunk/src/packfile/pf_items.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/packfile/pf_items.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -1217,7 +1217,7 @@
const int wordsize = pf ? pf->header->wordsize : sizeof (opcode_t);
size_t size = PF_fetch_opcode(pf, cursor);
STRING *s = Parrot_str_new_init(interp, (const char *)*cursor, size,
- Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr,
+ Parrot_binary_encoding_ptr,
PObj_external_FLAG);
*((const unsigned char **)(cursor)) += ROUND_UP_B(size, wordsize);
return s;
@@ -1244,8 +1244,7 @@
ASSERT_ARGS(PF_store_buf)
const int wordsize = sizeof (opcode_t);
- PARROT_ASSERT(s->encoding == Parrot_fixed_8_encoding_ptr);
- PARROT_ASSERT(s->charset == Parrot_binary_charset_ptr);
+ PARROT_ASSERT(s->encoding == Parrot_binary_encoding_ptr);
*cursor++ = s->bufused;
@@ -1315,9 +1314,7 @@
STRING *s;
UINTVAL flags;
UINTVAL encoding_nr;
- UINTVAL charset_nr;
- const ENCODING *encoding;
- const CHARSET *charset;
+ const STR_VTABLE *encoding;
size_t size;
const int wordsize = pf ? pf->header->wordsize : sizeof (opcode_t);
opcode_t flag_charset_word = PF_fetch_opcode(pf, cursor);
@@ -1326,30 +1323,24 @@
return STRINGNULL;
/* decode flags and encoding */
- flags = (flag_charset_word & 0x1 ? PObj_constant_FLAG : 0) |
- (flag_charset_word & 0x2 ? PObj_private7_FLAG : 0) ;
- encoding_nr = (flag_charset_word >> 16);
- charset_nr = (flag_charset_word >> 8) & 0xFF;
+ flags = (flag_charset_word & 0x1 ? PObj_constant_FLAG : 0) |
+ (flag_charset_word & 0x2 ? PObj_private7_FLAG : 0) ;
+ encoding_nr = (flag_charset_word >> 8) & 0xFF;
size = (size_t)PF_fetch_opcode(pf, cursor);
TRACE_PRINTF(("PF_fetch_string(): flags=0x%04x, ", flags));
TRACE_PRINTF(("encoding_nr=%ld, ", encoding_nr));
- TRACE_PRINTF(("charset_nr=%ld, ", charset_nr));
TRACE_PRINTF(("size=%ld.\n", size));
encoding = Parrot_get_encoding(interp, encoding_nr);
- charset = Parrot_get_charset(interp, charset_nr);
if (!encoding)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
"Invalid encoding number '%d' specified", encoding_nr);
- if (!charset)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Invalid charset number '%d' specified", charset_nr);
s = Parrot_str_new_init(interp, (const char *)*cursor, size,
- encoding, charset, flags);
+ encoding, flags);
/* print only printable characters */
TRACE_PRINTF_VAL(("PF_fetch_string(): string is '%s' at 0x%x\n",
@@ -1411,8 +1402,7 @@
*/
/* encode encoding_nr and flags into the same word */
- *cursor++ = (Parrot_encoding_number_of_str(NULL, s) << 16) |
- (Parrot_charset_number_of_str(NULL, s) << 8) |
+ *cursor++ = (Parrot_encoding_number_of_str(NULL, s) << 8) |
(PObj_get_FLAGS(s) & PObj_constant_FLAG ? 0x1 : 0x0) |
(PObj_get_FLAGS(s) & PObj_private7_FLAG ? 0x2 : 0x0) ;
*cursor++ = s->bufused;
Modified: trunk/src/packout.c
==============================================================================
--- trunk/src/packout.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/packout.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -290,10 +290,7 @@
if (constant->type == PFC_STRING) {
STRING * const sc = constant->u.string;
if (Parrot_str_equal(interp, key_str, sc)
- && Parrot_charset_number_of_str(interp, key_str)
- == Parrot_charset_number_of_str(interp, sc)
- && Parrot_encoding_number_of_str(interp, key_str)
- == Parrot_encoding_number_of_str(interp, sc)) {
+ && key_str->encoding == sc->encoding) {
return i;
}
}
Modified: trunk/src/pmc/bytebuffer.pmc
==============================================================================
--- trunk/src/pmc/bytebuffer.pmc Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/pmc/bytebuffer.pmc Tue Sep 7 22:58:38 2010 (r48833)
@@ -23,8 +23,7 @@
static STRING * build_string(PARROT_INTERP,
ARGIN(const unsigned char *content),
INTVAL size,
- ARGIN_NULLOK(const CHARSET *charset),
- ARGIN_NULLOK(const ENCODING *encoding))
+ ARGIN_NULLOK(const STR_VTABLE *encoding))
__attribute__nonnull__(1)
__attribute__nonnull__(2);
@@ -315,17 +314,15 @@
*/
- METHOD get_string(STRING *charsetname, STRING *encodingname) {
+ METHOD get_string(STRING *encodingname) {
STRING *result;
unsigned char *content;
INTVAL size;
- const CHARSET *charset = Parrot_get_charset(INTERP,
- Parrot_charset_number(INTERP, charsetname));
- const ENCODING *encoding = Parrot_get_encoding(INTERP,
+ const STR_VTABLE *encoding = Parrot_get_encoding(INTERP,
Parrot_encoding_number(INTERP, encodingname));
GET_ATTR_content(INTERP, SELF, content);
GET_ATTR_size(INTERP, SELF, size);
- result = build_string(INTERP, content, size, charset, encoding);
+ result = build_string(INTERP, content, size, encoding);
RETURN(STRING *result);
}
@@ -344,11 +341,11 @@
STRING *result;
unsigned char *content;
INTVAL size;
- const CHARSET* charset = STRING_IS_NULL(as) ? PARROT_DEFAULT_CHARSET : as->charset;
- const ENCODING *encoding = STRING_IS_NULL(as) ? PARROT_DEFAULT_ENCODING : as->encoding;
+ const STR_VTABLE *encoding = STRING_IS_NULL(as) ?
+ Parrot_default_encoding_ptr : as->encoding;
GET_ATTR_content(INTERP, SELF, content);
GET_ATTR_size(INTERP, SELF, size);
- result = build_string(INTERP, content, size, charset, encoding);
+ result = build_string(INTERP, content, size, encoding);
RETURN(STRING *result);
}
@@ -368,7 +365,7 @@
decrease the number of reallocations.
=item C<static STRING * build_string(PARROT_INTERP, const unsigned char
-*content, INTVAL size, const CHARSET *charset, const ENCODING *encoding)>
+*content, INTVAL size, const STR_VTABLE *encoding)>
Build a string from the buffer content with the encoding specified.
@@ -393,19 +390,15 @@
static STRING *
build_string(PARROT_INTERP, ARGIN(const unsigned char *content),
INTVAL size,
- ARGIN_NULLOK(const CHARSET *charset),
- ARGIN_NULLOK(const ENCODING *encoding))
+ ARGIN_NULLOK(const STR_VTABLE *encoding))
{
ASSERT_ARGS(build_string)
STRING *result;
- if (charset == NULL)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
- "Invalid charset");
if (encoding == NULL)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"Invalid encoding");
- result = Parrot_str_new_init(interp, (const char *)content, size, encoding, charset, 0);
- if (!CHARSET_VALIDATE(interp, result))
+ result = Parrot_str_new_init(interp, (const char *)content, size, encoding, 0);
+ if (!STRING_validate(interp, result))
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_STRING_REPRESENTATION,
"Invalid buffer content");
Modified: trunk/src/pmc/packfile.pmc
==============================================================================
--- trunk/src/pmc/packfile.pmc Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/pmc/packfile.pmc Tue Sep 7 22:58:38 2010 (r48833)
@@ -160,7 +160,7 @@
PackFile_pack(INTERP, pf, ptr);
str = Parrot_str_new_init(INTERP, (const char*)ptr, length,
- PARROT_FIXED_8_ENCODING, PARROT_BINARY_CHARSET, 0);
+ Parrot_binary_encoding_ptr, 0);
Parrot_gc_free_memory_chunk(INTERP, ptr);
PackFile_destroy(INTERP, pf);
Modified: trunk/src/pmc/packfilefixupentry.pmc
==============================================================================
--- trunk/src/pmc/packfilefixupentry.pmc Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/pmc/packfilefixupentry.pmc Tue Sep 7 22:58:38 2010 (r48833)
@@ -98,8 +98,7 @@
attrs->type = entry->type;
attrs->name = Parrot_str_new_init(INTERP, entry->name,
- strlen(entry->name), PARROT_FIXED_8_ENCODING,
- PARROT_BINARY_CHARSET, 0);
+ strlen(entry->name), Parrot_binary_encoding_ptr, 0);
attrs->offset = entry->offset;
}
Modified: trunk/src/pmc/string.pmc
==============================================================================
--- trunk/src/pmc/string.pmc Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/pmc/string.pmc Tue Sep 7 22:58:38 2010 (r48833)
@@ -193,7 +193,7 @@
if (PObj_constant_TEST(SELF) && !PObj_constant_TEST(value)) {
char * const copy = Parrot_str_to_cstring(INTERP, value);
value = Parrot_str_new_init(INTERP, copy, strlen(copy),
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_constant_FLAG);
Parrot_str_free_cstring(copy);
}
@@ -692,7 +692,7 @@
if (!len)
RETURN(STRING src);
- if (src->charset != Parrot_ascii_charset_ptr)
+ if (src->encoding != Parrot_ascii_encoding_ptr)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_INVALID_ENCODING,
"Can't translate non-ascii");
@@ -730,7 +730,7 @@
if (!len)
RETURN(INTVAL 0);
- if (src->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(src) != 1)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_INVALID_ENCODING,
"Can't is_integer non fixed_8");
@@ -794,7 +794,7 @@
if (!Parrot_str_length(INTERP, substring))
RETURN(INTVAL res);
- res = CHARSET_RINDEX(INTERP, src, substring, (UINTVAL)start);
+ res = STRING_rindex(INTERP, src, substring, (UINTVAL)start);
RETURN(INTVAL res);
}
@@ -808,15 +808,13 @@
*/
- METHOD unescape(STRING *charsetname, STRING *encodingname)
+ METHOD unescape(STRING *encodingname)
{
- const CHARSET *charset = Parrot_get_charset(INTERP,
- Parrot_charset_number(INTERP, charsetname));
- const ENCODING *encoding = Parrot_get_encoding(INTERP,
+ const STR_VTABLE *encoding = Parrot_get_encoding(INTERP,
Parrot_encoding_number(INTERP, encodingname));
STRING * const src = VTABLE_get_string(INTERP, SELF);
STRING * const dest = Parrot_str_unescape_string(INTERP, src,
- charset, encoding, 0);
+ encoding, 0);
RETURN(STRING *dest);
}
Modified: trunk/src/pmc/stringbuilder.pmc
==============================================================================
--- trunk/src/pmc/stringbuilder.pmc Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/pmc/stringbuilder.pmc Tue Sep 7 22:58:38 2010 (r48833)
@@ -64,7 +64,6 @@
STRING * const buffer = mem_gc_allocate_zeroed_typed(INTERP, STRING);
buffer->encoding = Parrot_default_encoding_ptr;
- buffer->charset = Parrot_default_charset_ptr;
buffer->_buflen = initial_size;
buffer->_bufstart = buffer->strstart
= mem_gc_allocate_n_typed(INTERP,
@@ -158,10 +157,9 @@
*/
VTABLE void push_string(STRING *s) {
- STRING *buffer;
- size_t total_size;
- const CHARSET *cs;
- const ENCODING *enc = NULL;
+ STRING *buffer;
+ size_t total_size;
+ const STR_VTABLE *enc;
/* Early return on NULL strings */
if (STRING_IS_NULL(s))
@@ -173,14 +171,12 @@
/* Always copy the encoding of the first string. The IO functions
assume that the concatenation of utf8 strings doesn't change
the encoding. */
- buffer->charset = s->charset;
buffer->encoding = s->encoding;
}
else {
- cs = Parrot_str_rep_compatible(interp, buffer, s, &enc);
+ enc = Parrot_str_rep_compatible(interp, buffer, s);
- if (cs) {
- buffer->charset = cs;
+ if (enc) {
buffer->encoding = enc;
}
else {
@@ -206,7 +202,6 @@
}
buffer->bufused = new_buffer->bufused;
- buffer->charset = new_buffer->charset;
buffer->encoding = new_buffer->encoding;
mem_sys_memcopy(buffer->strstart, new_buffer->strstart,
@@ -222,7 +217,6 @@
/* Calculate (possibly new) total size */
total_size = calculate_capacity(INTERP, total_size);
- /* Parrot_unicode_charset_ptr can produce NULL buffer */
buffer->_bufstart = buffer->strstart = mem_gc_realloc_n_typed(INTERP,
buffer->_bufstart, total_size, char);
buffer->_buflen = total_size;
@@ -298,7 +292,6 @@
buffer->bufused = s->bufused;
buffer->strlen = Parrot_str_length(INTERP, s);
buffer->encoding = s->encoding;
- buffer->charset = s->charset;
}
VTABLE void set_pmc(PMC *s) {
Modified: trunk/src/pmc/stringiterator.pmc
==============================================================================
--- trunk/src/pmc/stringiterator.pmc Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/pmc/stringiterator.pmc Tue Sep 7 22:58:38 2010 (r48833)
@@ -156,11 +156,11 @@
GET_ATTR_str_val(INTERP, SELF, str_val);
if (value == ITERATE_FROM_START) {
SET_ATTR_reverse(INTERP, SELF, 0);
- STRING_ITER_SET_POSITION(INTERP, str_val, iter, 0);
+ STRING_iter_set_position(INTERP, str_val, iter, 0);
}
else if (value == ITERATE_FROM_END) {
SET_ATTR_reverse(INTERP, SELF, 1);
- STRING_ITER_SET_POSITION(INTERP, str_val, iter, str_val->strlen);
+ STRING_iter_set_position(INTERP, str_val, iter, str_val->strlen);
}
else {
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_INVALID_OPERATION,
@@ -209,7 +209,7 @@
"StopIteration");
ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String));
- STRING_ITER_SKIP(INTERP, str_val, iter, 1);
+ STRING_iter_skip(INTERP, str_val, iter, 1);
substr = Parrot_str_iter_substr(INTERP, str_val, &old_iter, iter);
VTABLE_set_string_native(INTERP, ret, substr);
return ret;
@@ -234,7 +234,7 @@
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- STRING_ITER_SKIP(INTERP, str_val, iter, 1);
+ STRING_iter_skip(INTERP, str_val, iter, 1);
return Parrot_str_iter_substr(INTERP, str_val, &old_iter, iter);
}
@@ -256,7 +256,7 @@
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- return STRING_ITER_GET_AND_ADVANCE(INTERP, str_val, iter);
+ return STRING_iter_get_and_advance(INTERP, str_val, iter);
}
/*
@@ -281,7 +281,7 @@
"StopIteration");
ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String));
- STRING_ITER_SKIP(INTERP, str_val, iter, -1);
+ STRING_iter_skip(INTERP, str_val, iter, -1);
substr = Parrot_str_iter_substr(INTERP, str_val, iter, &old_iter);
VTABLE_set_string_native(INTERP, ret, substr);
return ret;
@@ -307,7 +307,7 @@
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- STRING_ITER_SKIP(INTERP, str_val, iter, -1);
+ STRING_iter_skip(INTERP, str_val, iter, -1);
return Parrot_str_iter_substr(INTERP, str_val, iter, &old_iter);
}
@@ -330,8 +330,8 @@
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- STRING_ITER_SKIP(INTERP, str_val, iter, -1);
- return STRING_ITER_GET(INTERP, str_val, iter, 0);
+ STRING_iter_skip(INTERP, str_val, iter, -1);
+ return STRING_iter_get(INTERP, str_val, iter, 0);
}
/*
@@ -354,7 +354,7 @@
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- return STRING_ITER_GET(INTERP, str_val, iter, idx);
+ return STRING_iter_get(INTERP, str_val, iter, idx);
}
/*
@@ -379,9 +379,9 @@
"StopIteration");
if (idx != 0)
- STRING_ITER_SKIP(INTERP, str_val, &iter, idx);
+ STRING_iter_skip(INTERP, str_val, &iter, idx);
next_iter = iter;
- STRING_ITER_SKIP(INTERP, str_val, &next_iter, 1);
+ STRING_iter_skip(INTERP, str_val, &next_iter, 1);
return Parrot_str_iter_substr(INTERP, str_val, &iter, &next_iter);
}
Modified: trunk/src/spf_vtable.c
==============================================================================
--- trunk/src/spf_vtable.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/spf_vtable.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -452,7 +452,6 @@
++obj->index;
s = VTABLE_get_string(interp, tmp);
- /* XXX Parrot_str_copy like below? + adjusting bufused */
return Parrot_str_substr(interp, s, 0, 1);
}
Modified: trunk/src/string/api.c
==============================================================================
--- trunk/src/string/api.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/api.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -35,7 +35,6 @@
#define nonnull_encoding_name(s) (s) ? (s)->encoding->name : "null string"
#define ASSERT_STRING_SANITY(s) \
PARROT_ASSERT((s)->encoding); \
- PARROT_ASSERT((s)->charset); \
PARROT_ASSERT(!PObj_on_free_list_TEST(s))
/* HEADERIZER HFILE: include/parrot/string_funcs.h */
@@ -46,14 +45,11 @@
PARROT_INLINE
PARROT_IGNORABLE_RESULT
PARROT_CAN_RETURN_NULL
-static const CHARSET * string_rep_compatible(SHIM_INTERP,
+static const STR_VTABLE * string_rep_compatible(SHIM_INTERP,
ARGIN(const STRING *a),
- ARGIN(const STRING *b),
- ARGOUT(const ENCODING **e))
+ ARGIN(const STRING *b))
__attribute__nonnull__(2)
- __attribute__nonnull__(3)
- __attribute__nonnull__(4)
- FUNC_MODIFIES(*e);
+ __attribute__nonnull__(3);
PARROT_DOES_NOT_RETURN
PARROT_COLD
@@ -62,8 +58,7 @@
#define ASSERT_ARGS_string_rep_compatible __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(a) \
- , PARROT_ASSERT_ARG(b) \
- , PARROT_ASSERT_ARG(e))
+ , PARROT_ASSERT_ARG(b))
#define ASSERT_ARGS_throw_illegal_escape __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
@@ -137,15 +132,15 @@
return;
}
- /* Set up the cstring cache, then load the basic encodings and charsets */
+ /* Set up the cstring cache, then load the basic encodings */
const_cstring_hash = parrot_new_cstring_hash(interp);
interp->const_cstring_hash = const_cstring_hash;
- Parrot_charsets_encodings_init(interp);
+ Parrot_encodings_init(interp);
#if PARROT_CATCH_NULL
/* initialize STRINGNULL, but not in the constant table */
STRINGNULL = Parrot_str_new_init(interp, NULL, 0,
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_constant_FLAG);
#endif
@@ -158,7 +153,7 @@
Parrot_str_new_init(interp,
parrot_cstrings[i].string,
parrot_cstrings[i].len,
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_external_FLAG|PObj_constant_FLAG);
parrot_hash_put(interp, const_cstring_hash,
PARROT_const_cast(char *, parrot_cstrings[i].string), (void *)s);
@@ -187,7 +182,7 @@
if (!interp->parent_interpreter) {
mem_internal_free(interp->const_cstring_table);
interp->const_cstring_table = NULL;
- Parrot_charsets_encodings_deinit(interp);
+ Parrot_deinit_encodings(interp);
parrot_hash_destroy(interp, interp->const_cstring_hash);
}
}
@@ -218,8 +213,7 @@
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
"Unsupported representation");
- s->charset = PARROT_DEFAULT_CHARSET;
- s->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, s);
+ s->encoding = Parrot_default_encoding_ptr;
Parrot_gc_allocate_string_storage(interp, s,
(size_t)string_max_bytes(interp, s, capacity));
@@ -230,10 +224,10 @@
/*
-=item C<static const CHARSET * string_rep_compatible(PARROT_INTERP, const STRING
-*a, const STRING *b, const ENCODING **e)>
+=item C<static const STR_VTABLE * string_rep_compatible(PARROT_INTERP, const
+STRING *a, const STRING *b)>
-Find the "lowest" possible charset and encoding for the given string. E.g.
+Find the "lowest" possible encoding for the given string. E.g.
ascii <op> utf8 => utf8
=> ascii, B<if> C<STRING *b> has ascii chars only.
@@ -247,66 +241,55 @@
PARROT_INLINE
PARROT_IGNORABLE_RESULT
PARROT_CAN_RETURN_NULL
-static const CHARSET *
+static const STR_VTABLE *
string_rep_compatible(SHIM_INTERP,
- ARGIN(const STRING *a), ARGIN(const STRING *b), ARGOUT(const ENCODING **e))
+ ARGIN(const STRING *a), ARGIN(const STRING *b))
{
ASSERT_ARGS(string_rep_compatible)
- if (a->encoding == b->encoding && a->charset == b->charset) {
- *e = a->encoding;
- return a->charset;
+ if (a->encoding == b->encoding) {
+ return a->encoding;
}
/* a table could possibly simplify the logic */
if (a->encoding == Parrot_utf8_encoding_ptr
- && b->charset == Parrot_ascii_charset_ptr) {
+ && b->encoding == Parrot_ascii_encoding_ptr) {
if (a->strlen == a->bufused) {
- *e = Parrot_fixed_8_encoding_ptr;
- return b->charset;
+ return b->encoding;
}
- *e = a->encoding;
- return a->charset;
+ return a->encoding;
}
if (b->encoding == Parrot_utf8_encoding_ptr
- && a->charset == Parrot_ascii_charset_ptr) {
+ && a->encoding == Parrot_ascii_encoding_ptr) {
if (b->strlen == b->bufused) {
- *e = Parrot_fixed_8_encoding_ptr;
- return a->charset;
+ return a->encoding;
}
- *e = b->encoding;
- return b->charset;
+ return b->encoding;
}
- if (a->encoding != b->encoding)
+ if (STRING_max_bytes_per_codepoint(a) != 1 ||
+ STRING_max_bytes_per_codepoint(b) != 1)
return NULL;
- if (a->encoding != Parrot_fixed_8_encoding_ptr)
- return NULL;
-
- *e = Parrot_fixed_8_encoding_ptr;
-
- if (a->charset == b->charset)
- return a->charset;
- if (b->charset == Parrot_ascii_charset_ptr)
- return a->charset;
- if (a->charset == Parrot_ascii_charset_ptr)
- return b->charset;
- if (a->charset == Parrot_binary_charset_ptr)
- return a->charset;
- if (b->charset == Parrot_binary_charset_ptr)
- return b->charset;
+ if (b->encoding == Parrot_ascii_encoding_ptr)
+ return a->encoding;
+ if (a->encoding == Parrot_ascii_encoding_ptr)
+ return b->encoding;
+ if (a->encoding == Parrot_binary_encoding_ptr)
+ return a->encoding;
+ if (b->encoding == Parrot_binary_encoding_ptr)
+ return b->encoding;
return NULL;
}
/*
-=item C<const CHARSET * Parrot_str_rep_compatible(PARROT_INTERP, const STRING
-*a, const STRING *b, const ENCODING **e)>
+=item C<const STR_VTABLE * Parrot_str_rep_compatible(PARROT_INTERP, const STRING
+*a, const STRING *b)>
-Find the "lowest" possible charset and encoding for the given string. E.g.
+Find the "lowest" possible encoding for the given string. E.g.
ascii <op> utf8 => utf8
=> ascii, B<if> C<STRING *b> has ascii chars only.
@@ -320,12 +303,12 @@
PARROT_EXPORT
PARROT_IGNORABLE_RESULT
PARROT_CAN_RETURN_NULL
-const CHARSET *
+const STR_VTABLE *
Parrot_str_rep_compatible(PARROT_INTERP,
- ARGIN(const STRING *a), ARGIN(const STRING *b), ARGOUT(const ENCODING **e))
+ ARGIN(const STRING *a), ARGIN(const STRING *b))
{
ASSERT_ARGS(Parrot_str_rep_compatible)
- return string_rep_compatible(interp, a, b, e);
+ return string_rep_compatible(interp, a, b);
}
/*
@@ -358,7 +341,6 @@
result->bufused = s->bufused;
result->hashval = s->hashval;
result->encoding = s->encoding;
- result->charset = s->charset;
return result;
}
@@ -427,30 +409,28 @@
ARGIN_NULLOK(const STRING *b))
{
ASSERT_ARGS(Parrot_str_concat)
- const CHARSET *cs;
- const ENCODING *enc = NULL;
- STRING *dest;
- UINTVAL total_length;
-
- /* XXX should this be a CHARSET method? */
-
- /* If B isn't real, we just bail */
- const UINTVAL b_len = b ? Parrot_str_length(interp, b) : 0;
- if (!b_len)
- return STRING_IS_NULL(a) ? STRINGNULL : Parrot_str_copy(interp, a);
-
- /* Is A real? */
- if (STRING_IS_NULL(a) || Buffer_bufstart(a) == NULL)
- return Parrot_str_copy(interp, b);
+ const STR_VTABLE *enc;
+ STRING *dest;
+ UINTVAL total_length;
+
+ if (STRING_IS_NULL(a)) {
+ if (STRING_IS_NULL(b))
+ return STRINGNULL;
+ else
+ return Parrot_str_copy(interp, b);
+ }
+ else {
+ if (STRING_IS_NULL(b))
+ return Parrot_str_copy(interp, a);
+ }
ASSERT_STRING_SANITY(a);
ASSERT_STRING_SANITY(b);
- cs = string_rep_compatible(interp, a, b, &enc);
+ enc = string_rep_compatible(interp, a, b);
- if (!cs) {
+ if (!enc) {
/* upgrade strings for concatenation */
- cs = Parrot_unicode_charset_ptr;
if (a->encoding == Parrot_ucs4_encoding_ptr
|| b->encoding == Parrot_ucs4_encoding_ptr)
enc = Parrot_ucs4_encoding_ptr;
@@ -462,22 +442,15 @@
else
enc = Parrot_utf8_encoding_ptr;
- a = Parrot_unicode_charset_ptr->to_charset(interp, a);
- b = Parrot_unicode_charset_ptr->to_charset(interp, b);
-
- if (a->encoding != enc)
- a = enc->to_encoding(interp, a);
- if (b->encoding != enc)
- b = enc->to_encoding(interp, b);
+ a = enc->to_encoding(interp, a);
+ b = enc->to_encoding(interp, b);
}
/* calc usable and total bytes */
total_length = a->bufused + b->bufused;
dest = Parrot_str_new_noinit(interp, enum_stringrep_one, total_length);
PARROT_ASSERT(enc);
- PARROT_ASSERT(cs);
dest->encoding = enc;
- dest->charset = cs;
/* Copy A first */
mem_sys_memcopy(dest->strstart, a->strstart, a->bufused);
@@ -487,7 +460,7 @@
b->strstart, b->bufused);
dest->bufused = a->bufused + b->bufused;
- dest->strlen = a->strlen + b_len;
+ dest->strlen = a->strlen + b->strlen;
return dest;
}
@@ -516,7 +489,7 @@
const UINTVAL buff_length = (len > 0) ? len : buffer ? strlen(buffer) : 0;
return Parrot_str_new_init(interp, buffer, buff_length,
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET, 0);
+ Parrot_default_encoding_ptr, 0);
}
@@ -549,8 +522,7 @@
result->strstart = (char *)Buffer_bufstart(result);
result->bufused = len;
result->strlen = len;
- result->encoding = Parrot_fixed_8_encoding_ptr;
- result->charset = Parrot_binary_charset_ptr;
+ result->encoding = Parrot_binary_encoding_ptr;
Buffer_buflen(buffer) = 0;
Buffer_bufstart(buffer) = NULL;
@@ -616,7 +588,7 @@
return s;
s = Parrot_str_new_init(interp, buffer, strlen(buffer),
- PARROT_DEFAULT_ENCODING, PARROT_DEFAULT_CHARSET,
+ Parrot_default_encoding_ptr,
PObj_external_FLAG|PObj_constant_FLAG);
parrot_hash_put(interp, cstring_cache,
@@ -629,12 +601,12 @@
/*
=item C<STRING * string_make(PARROT_INTERP, const char *buffer, UINTVAL len,
-const char *charset_name, UINTVAL flags)>
+const char *encoding_name, UINTVAL flags)>
Creates and returns a new Parrot string using C<len> bytes of string data read
from C<buffer>.
-The value of C<charset_name> specifies the string's representation.
+The value of C<encoding_name> specifies the string's representation.
The currently recognised values are:
'iso-8859-1'
@@ -645,7 +617,7 @@
The encoding is implicitly guessed; C<unicode> implies the C<utf-8> encoding,
and the other three assume C<fixed-8> encoding.
-If C<charset> is unspecified, the default charset 'ascii' will be used.
+If C<encoding_name> is unspecified, the default encoding 'ascii' will be used.
The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
together.
@@ -659,74 +631,28 @@
PARROT_CANNOT_RETURN_NULL
STRING *
string_make(PARROT_INTERP, ARGIN_NULLOK(const char *buffer),
- UINTVAL len, ARGIN_NULLOK(const char *charset_name), UINTVAL flags)
+ UINTVAL len, ARGIN_NULLOK(const char *encoding_name), UINTVAL flags)
{
ASSERT_ARGS(string_make)
- const CHARSET *charset;
+ const STR_VTABLE *encoding;
- if (charset_name) {
- charset = Parrot_find_charset(interp, charset_name);
- if (!charset)
+ if (encoding_name) {
+ encoding = Parrot_find_encoding(interp, encoding_name);
+ if (!encoding)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Can't make '%s' charset strings", charset_name);
+ "Can't make '%s' encoding strings", encoding_name);
}
else
- charset = Parrot_get_charset(interp, 0);
-
- return Parrot_str_new_init(interp, buffer, len,
- charset->preferred_encoding, charset, flags);
-}
-
-
-/*
-
-=item C<STRING * string_make_from_charset(PARROT_INTERP, const char *buffer,
-UINTVAL len, INTVAL charset_nr, UINTVAL flags)>
-
-Creates and returns a new Parrot string using C<len> bytes of string data read
-from C<buffer>.
-
-The value of C<charset_name> specifies the string's representation. It must be
-a valid charset identifier.
-
- 'iso-8859-1'
- 'ascii'
- 'binary'
- 'unicode'
-
-The encoding is implicitly guessed; C<unicode> implies the C<utf-8> encoding,
-and the other three assume C<fixed-8> encoding.
-
-The value of C<flags> is optionally one or more C<PObj_*> flags C<OR>-ed
-together.
-
-=cut
+ encoding = Parrot_default_encoding_ptr;
-*/
-
-PARROT_EXPORT
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-STRING *
-string_make_from_charset(PARROT_INTERP, ARGIN_NULLOK(const char *buffer),
- UINTVAL len, INTVAL charset_nr, UINTVAL flags)
-{
- ASSERT_ARGS(string_make_from_charset)
- const CHARSET *charset = Parrot_get_charset(interp, charset_nr);
-
- if (!charset)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Invalid charset number '%d' specified", charset_nr);
-
- return Parrot_str_new_init(interp, buffer, len,
- charset->preferred_encoding, charset, flags);
+ return Parrot_str_new_init(interp, buffer, len, encoding, flags);
}
/*
=item C<STRING * Parrot_str_new_init(PARROT_INTERP, const char *buffer, UINTVAL
-len, const ENCODING *encoding, const CHARSET *charset, UINTVAL flags)>
+len, const STR_VTABLE *encoding, UINTVAL flags)>
Given a buffer, its length, an encoding, and STRING flags,
creates and returns a new string. Don't call this directly.
@@ -740,13 +666,12 @@
PARROT_CANNOT_RETURN_NULL
STRING *
Parrot_str_new_init(PARROT_INTERP, ARGIN_NULLOK(const char *buffer), UINTVAL len,
- ARGIN(const ENCODING *encoding), ARGIN(const CHARSET *charset), UINTVAL flags)
+ ARGIN(const STR_VTABLE *encoding), UINTVAL flags)
{
ASSERT_ARGS(Parrot_str_new_init)
DECL_CONST_CAST;
STRING * const s = Parrot_gc_new_string_header(interp, flags);
s->encoding = encoding;
- s->charset = charset;
if (flags & PObj_external_FLAG) {
/*
@@ -761,10 +686,10 @@
Buffer_bufstart(s) = s->strstart = PARROT_const_cast(char *, buffer);
Buffer_buflen(s) = s->bufused = len;
- if (encoding == Parrot_fixed_8_encoding_ptr)
+ if (encoding->max_bytes_per_codepoint == 1)
s->strlen = len;
else
- s->strlen = CHARSET_CODEPOINTS(interp, s);
+ s->strlen = STRING_scan(interp, s);
return s;
}
@@ -774,10 +699,10 @@
if (buffer) {
mem_sys_memcopy(s->strstart, buffer, len);
s->bufused = len;
- if (encoding == Parrot_fixed_8_encoding_ptr)
+ if (encoding->max_bytes_per_codepoint == 1)
s->strlen = len;
else
- s->strlen = CHARSET_CODEPOINTS(interp, s);
+ s->strlen = STRING_scan(interp, s);
}
else
s->strlen = s->bufused = 0;
@@ -835,7 +760,7 @@
{
ASSERT_ARGS(Parrot_str_indexed)
ASSERT_STRING_SANITY(s);
- return (INTVAL)CHARSET_GET_CODEPOINT(interp, s, idx);
+ return (INTVAL)STRING_ord(interp, s, idx);
}
@@ -883,7 +808,7 @@
STRING *src = PARROT_const_cast(STRING *, s);
STRING *search = PARROT_const_cast(STRING *, s2);
- return CHARSET_INDEX(interp, src, search, (UINTVAL)start);
+ return STRING_index(interp, src, search, (UINTVAL)start);
}
}
@@ -951,17 +876,13 @@
string_chr(PARROT_INTERP, UINTVAL character)
{
ASSERT_ARGS(string_chr)
- if (character > 0xff)
- return Parrot_unicode_charset_ptr->string_from_codepoint(interp,
- character);
+ if (character > 0xff)
+ return Parrot_utf8_encoding_ptr->chr(interp, character);
else if (character > 0x7f)
- return Parrot_iso_8859_1_charset_ptr->string_from_codepoint(interp,
- character);
-
+ return Parrot_latin1_encoding_ptr->chr(interp, character);
else
- return Parrot_ascii_charset_ptr->string_from_codepoint(interp,
- character);
+ return Parrot_ascii_encoding_ptr->chr(interp, character);
}
@@ -1012,7 +933,7 @@
{
ASSERT_ARGS(string_max_bytes)
PARROT_ASSERT(s->encoding);
- return ENCODING_MAX_BYTES_PER_CODEPOINT(interp, s) * nchars;
+ return STRING_max_bytes_per_codepoint(s) * nchars;
}
@@ -1034,7 +955,7 @@
ASSERT_ARGS(Parrot_str_repeat)
STRING * const dest = Parrot_str_new_init(interp, NULL,
s->bufused * num,
- s->encoding, s->charset, 0);
+ s->encoding, 0);
if (num > 0) {
/* copy s into dest num times */
UINTVAL length = s->bufused;
@@ -1102,7 +1023,7 @@
if (true_length > (src->strlen - true_offset))
true_length = (UINTVAL)(src->strlen - true_offset);
- return CHARSET_GET_CODEPOINTS(interp, src, true_offset, true_length);
+ return STRING_substr(interp, src, true_offset, true_length);
}
/*
@@ -1176,12 +1097,12 @@
}
STRING_ITER_INIT(interp, &search_iter);
- c0 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter);
+ c0 = STRING_iter_get_and_advance(interp, search, &search_iter);
search_start = search_iter;
next_start = *start;
while (start->charpos + len <= src->strlen) {
- UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, &next_start);
+ UINTVAL c1 = STRING_iter_get_and_advance(interp, src, &next_start);
if (c1 == c0) {
UINTVAL c2;
@@ -1190,8 +1111,8 @@
do {
if (search_iter.charpos >= len)
return start->charpos;
- c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, end);
- c2 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter);
+ c1 = STRING_iter_get_and_advance(interp, src, end);
+ c2 = STRING_iter_get_and_advance(interp, search, &search_iter);
} while (c1 == c2);
search_iter = search_start;
@@ -1237,12 +1158,11 @@
INTVAL offset, INTVAL length, ARGIN(const STRING *rep))
{
ASSERT_ARGS(Parrot_str_replace)
- String_iter iter;
- const CHARSET *cs;
- const ENCODING *enc;
- STRING *dest = NULL;
- UINTVAL true_offset = (UINTVAL)offset;
- UINTVAL true_length = (UINTVAL)length;
+ String_iter iter;
+ const STR_VTABLE *enc;
+ STRING *dest = NULL;
+ UINTVAL true_offset = (UINTVAL)offset;
+ UINTVAL true_length = (UINTVAL)length;
UINTVAL start_byte, end_byte, start_char, end_char;
INTVAL buf_size;
@@ -1269,24 +1189,23 @@
true_length = (UINTVAL)(src->strlen - true_offset);
/* may have different reps..... */
- cs = string_rep_compatible(interp, src, rep, &enc);
+ enc = string_rep_compatible(interp, src, rep);
- if (!cs) {
+ if (!enc) {
src = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
rep = Parrot_utf16_encoding_ptr->to_encoding(interp, rep);
- /* Remember selected charset and encoding */
+ /* Remember selected encoding */
enc = src->encoding;
- cs = src->charset;
}
/* get byte position of the part that will be replaced */
STRING_ITER_INIT(interp, &iter);
- STRING_ITER_SET_POSITION(interp, src, &iter, true_offset);
+ STRING_iter_set_position(interp, src, &iter, true_offset);
start_byte = iter.bytepos;
start_char = iter.charpos;
- STRING_ITER_SKIP(interp, src, &iter, true_length);
+ STRING_iter_skip(interp, src, &iter, true_length);
end_byte = iter.bytepos;
end_char = iter.charpos;
@@ -1299,9 +1218,8 @@
/* Now do the replacement */
dest = Parrot_gc_new_string_header(interp, 0);
- /* Set encoding and charset to compatible */
+ /* Set encoding to compatible */
dest->encoding = enc;
- dest->charset = cs;
/* Clear COW flag. We own buffer */
PObj_get_FLAGS(dest) = PObj_is_string_FLAG
@@ -1373,7 +1291,7 @@
return chopped;
}
- if (chopped->encoding == Parrot_fixed_8_encoding_ptr) {
+ if (STRING_max_bytes_per_codepoint(chopped) == 1) {
chopped->bufused = new_length;
}
else if (chopped->encoding == Parrot_ucs2_encoding_ptr) {
@@ -1384,7 +1302,7 @@
String_iter iter;
STRING_ITER_INIT(interp, &iter);
- STRING_ITER_SET_POSITION(interp, s, &iter, new_length);
+ STRING_iter_set_position(interp, s, &iter, new_length);
chopped->bufused = iter.bytepos;
}
@@ -1423,7 +1341,7 @@
ASSERT_STRING_SANITY(s1);
ASSERT_STRING_SANITY(s2);
- return CHARSET_COMPARE(interp, s1, s2);
+ return STRING_compare(interp, s1, s2);
}
@@ -1496,7 +1414,7 @@
* both strings are non-null
* both strings have same length
*/
- return CHARSET_COMPARE(interp, s1, s2) == 0;
+ return STRING_compare(interp, s1, s2) == 0;
}
@@ -1522,13 +1440,13 @@
STRING *res;
size_t minlen;
- /* we could also trans_charset to iso-8859-1 */
- if (s1 && s1->encoding != Parrot_fixed_8_encoding_ptr)
+ /* we could also trans_encoding to iso-8859-1 */
+ if (s1 && STRING_max_bytes_per_codepoint(s1) != 1)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_and (%s/%s) unsupported",
s1->encoding->name, nonnull_encoding_name(s2));
- if (s2 && s2->encoding != Parrot_fixed_8_encoding_ptr)
+ if (s2 && STRING_max_bytes_per_codepoint(s2) != 1)
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
"string bitwise_and (%s/%s) unsupported",
nonnull_encoding_name(s1), s2->encoding->name);
@@ -1540,7 +1458,7 @@
minlen = 0;
res = Parrot_str_new_init(interp, NULL, minlen,
- Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
+ Parrot_binary_encoding_ptr, 0);
if (STRING_IS_NULL(s1) || STRING_IS_NULL(s2)) {
res->bufused = 0;
@@ -1664,7 +1582,7 @@
size_t maxlen = 0;
if (!STRING_IS_NULL(s1)) {
- if (s1->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(s1) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_or (%s/%s) unsupported",
@@ -1674,7 +1592,7 @@
}
if (!STRING_IS_NULL(s2)) {
- if (s2->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(s2) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_or (%s/%s) unsupported",
@@ -1685,7 +1603,7 @@
}
res = Parrot_str_new_init(interp, NULL, maxlen,
- Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
+ Parrot_binary_encoding_ptr, 0);
if (!maxlen) {
res->bufused = 0;
@@ -1730,7 +1648,7 @@
size_t maxlen = 0;
if (!STRING_IS_NULL(s1)) {
- if (s1->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(s1) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_xor (%s/%s) unsupported",
@@ -1740,7 +1658,7 @@
}
if (!STRING_IS_NULL(s2)) {
- if (s2->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(s2) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
"string bitwise_xor (%s/%s) unsupported",
@@ -1751,7 +1669,7 @@
}
res = Parrot_str_new_init(interp, NULL, maxlen,
- Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
+ Parrot_binary_encoding_ptr, 0);
if (!maxlen) {
res->bufused = 0;
@@ -1807,11 +1725,11 @@
size_t len;
if (!STRING_IS_NULL(s)) {
- if (s->encoding != Parrot_fixed_8_encoding_ptr)
+ if (STRING_max_bytes_per_codepoint(s) != 1)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_ENCODING,
- "string bitwise_not (%s/%s) unsupported",
- s->encoding->name, s->encoding->name);
+ "string bitwise_not (%s) unsupported",
+ s->encoding->name);
len = s->bufused;
}
@@ -1819,7 +1737,7 @@
len = 0;
res = Parrot_str_new_init(interp, NULL, len,
- Parrot_fixed_8_encoding_ptr, Parrot_binary_charset_ptr, 0);
+ Parrot_binary_encoding_ptr, 0);
if (!len) {
res->bufused = 0;
@@ -1965,7 +1883,7 @@
STRING_ITER_INIT(interp, &iter);
while (state != parse_end && iter.charpos < s->strlen) {
- const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter);
+ const UINTVAL c = STRING_iter_get_and_advance(interp, s, &iter);
/* Check for overflow */
if (c > 255)
break;
@@ -2064,7 +1982,7 @@
/* Handcrafted FSM to read float value */
while (state != parse_end && iter.charpos < s->strlen) {
- const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter);
+ const UINTVAL c = STRING_iter_get_and_advance(interp, s, &iter);
/* Check for overflow */
if (c > 255)
break;
@@ -2431,15 +2349,8 @@
size_t hashval = interp->hash_seed;
if (!STRING_IS_NULL(s)) {
- if (s->strlen) {
- if (s->encoding->hash)
- hashval = ENCODING_HASH(interp, s, hashval);
- else if (s->charset->compute_hash)
- hashval = CHARSET_COMPUTE_HASH(interp, s, hashval);
- else {
- exit_fatal(1, "String subsystem not properly initialized");
- }
- }
+ if (s->strlen)
+ hashval = STRING_hash(interp, s, hashval);
s->hashval = hashval;
}
@@ -2512,14 +2423,14 @@
/* create ascii result */
result = Parrot_str_new_init(interp, NULL, charlen,
- Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
+ Parrot_ascii_encoding_ptr, 0);
/* more work TODO */
STRING_ITER_INIT(interp, &iter);
dp = (unsigned char *)result->strstart;
for (i = 0; len > 0; --len) {
- UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
+ UINTVAL c = STRING_iter_get_and_advance(interp, src, &iter);
if (c < 0x7f) {
/* process ASCII chars */
if (i >= charlen - 2) {
@@ -2622,12 +2533,11 @@
/*
=item C<STRING * Parrot_str_unescape_string(PARROT_INTERP, const STRING *src,
-const CHARSET *charset, const ENCODING *encoding, UINTVAL flags)>
+const STR_VTABLE *encoding, UINTVAL flags)>
EXPERIMENTAL, see TT #1628
-Unescapes the src string returnning a new string with the charset
-and encoding specified.
+Unescapes the src string returning a new string with the encoding specified.
=cut
@@ -2638,8 +2548,7 @@
PARROT_CANNOT_RETURN_NULL
STRING *
Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src),
- ARGIN(const CHARSET *charset),
- ARGIN(const ENCODING *encoding),
+ ARGIN(const STR_VTABLE *encoding),
UINTVAL flags)
{
ASSERT_ARGS(Parrot_str_unescape_string)
@@ -2653,7 +2562,6 @@
char digbuf[9];
int pending;
- result->charset = charset;
result->encoding = encoding;
reserved = string_max_bytes(interp, result, srclen);
Parrot_gc_allocate_string_storage(interp, result, reserved);
@@ -2662,14 +2570,14 @@
STRING_ITER_INIT(interp, &itersrc);
STRING_ITER_INIT(interp, &iterdest);
while (itersrc.bytepos < srclen) {
- INTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ INTVAL c = STRING_iter_get_and_advance(interp, src, &itersrc);
INTVAL next;
do {
pending = 0;
next = c;
if (c == '\\') {
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
switch (c) {
/* Common one char sequences */
case 'a': next = '\a'; break;
@@ -2682,7 +2590,7 @@
case 'e': next = '\x1B'; break;
/* Escape character */
case 'c':
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
/* This assumes ascii-alike encoding */
if (c < 'A' || c > 'Z')
throw_illegal_escape(interp);
@@ -2690,11 +2598,11 @@
break;
case 'x':
digcount = 0;
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
if (c == '{') {
/* \x{h..h} 1..8 hex digits */
while (itersrc.bytepos < srclen) {
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
if (c == '}')
break;
if (!isxdigit(c))
@@ -2718,7 +2626,7 @@
pending = 0;
break;
}
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
}
}
if (digcount == 0)
@@ -2729,7 +2637,7 @@
case 'u':
/* \uhhhh 4 hex digits */
for (digcount = 0; digcount < 4; ++digcount) {
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
if (!isxdigit(c))
throw_illegal_escape(interp);
digbuf[digcount] = c;
@@ -2740,7 +2648,7 @@
case 'U':
/* \Uhhhhhhhh 8 hex digits */
for (digcount = 0; digcount < 8; ++digcount) {
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
if (!isxdigit(c))
throw_illegal_escape(interp);
digbuf[digcount] = c;
@@ -2753,7 +2661,7 @@
/* \ooo 1..3 oct digits */
digbuf[0] = c;
for (digcount = 1; digcount < 3; ++digcount) {
- c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc);
+ c = STRING_iter_get_and_advance(interp, src, &itersrc);
if (c < '0' || c > '7')
break;
digbuf[digcount] = c;
@@ -2767,7 +2675,7 @@
next = c;
}
}
- STRING_ITER_SET_AND_ADVANCE(interp, result, &iterdest, next);
+ STRING_iter_set_and_advance(interp, result, &iterdest, next);
} while (pending);
}
result->bufused = iterdest.bytepos;
@@ -2802,9 +2710,8 @@
{
ASSERT_ARGS(Parrot_str_unescape)
- STRING *result;
- const CHARSET *charset;
- const ENCODING *encoding = NULL;
+ STRING *result;
+ const STR_VTABLE *encoding;
/* does the encoding have a character set? */
const char *p = enc_char ? strchr(enc_char, ':') : NULL;
@@ -2823,25 +2730,17 @@
#define MAX_ENCODING_NAME_ALLOWED 63
char buffer[MAX_ENCODING_NAME_ALLOWED + 1];
size_t l = p - enc_char;
- charset = NULL;
if (l < MAX_ENCODING_NAME_ALLOWED) {
memcpy(buffer, enc_char, l);
buffer[l] = '\0';
- encoding = Parrot_find_encoding(interp, buffer);
}
- if (!encoding)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Can't make '%s' encoding strings", enc_char);
-
- charset = Parrot_find_charset(interp, p + 1);
- if (!charset)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Can't make '%s' charset strings", p + 1);
+ else {
+ buffer[0] = '\0';
+ }
- result = Parrot_str_new_init(interp, cstring, clength,
- encoding, charset, flags);
- encoding = Parrot_fixed_8_encoding_ptr;
+ result = string_make(interp, cstring, clength, buffer, flags);
+ encoding = Parrot_ascii_encoding_ptr;
}
else {
result = string_make(interp, cstring, clength, enc_char, flags);
@@ -2880,9 +2779,9 @@
/* Force validating the string */
if (encoding != result->encoding)
- result->strlen = CHARSET_CODEPOINTS(interp, result);
+ result->strlen = STRING_scan(interp, result);
- if (!CHARSET_VALIDATE(interp, result))
+ if (!STRING_validate(interp, result))
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_INVALID_STRING_REPRESENTATION, "Malformed string");
@@ -2912,7 +2811,7 @@
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNEXPECTED_NULL,
"Can't upcase NULL string");
else {
- STRING * const res = CHARSET_UPCASE(interp, s);
+ STRING * const res = STRING_upcase(interp, s);
res->hashval = 0;
return res;
}
@@ -2942,7 +2841,7 @@
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNEXPECTED_NULL,
"Can't downcase NULL string");
else {
- STRING * const res = CHARSET_DOWNCASE(interp, s);
+ STRING * const res = STRING_downcase(interp, s);
res->hashval = 0;
return res;
}
@@ -2972,7 +2871,7 @@
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNEXPECTED_NULL,
"Can't titlecase NULL string");
else {
- STRING * const res = CHARSET_TITLECASE(interp, s);
+ STRING * const res = STRING_titlecase(interp, s);
res->hashval = 0;
return res;
}
@@ -3063,7 +2962,7 @@
if (!Parrot_str_byte_length(interp, s))
return 0;
- return CHARSET_IS_CCLASS(interp, flags, s, offset);
+ return STRING_is_cclass(interp, flags, s, offset);
}
@@ -3090,7 +2989,7 @@
if (STRING_IS_NULL(s))
return -1;
- return CHARSET_FIND_CCLASS(interp, flags, s, offset, count);
+ return STRING_find_cclass(interp, flags, s, offset, count);
}
@@ -3119,7 +3018,7 @@
if (STRING_IS_NULL(s))
return -1;
- return CHARSET_FIND_NOT_CCLASS(interp, flags, s, offset, count);
+ return STRING_find_not_cclass(interp, flags, s, offset, count);
}
@@ -3143,21 +3042,8 @@
INTVAL charset_nr)
{
ASSERT_ARGS(Parrot_str_change_charset)
- const CHARSET *new_charset;
-
- if (STRING_IS_NULL(src))
- return STRINGNULL;
-
- new_charset = Parrot_get_charset(interp, charset_nr);
-
- if (!new_charset)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
- "charset #%d not found", (int) charset_nr);
- if (new_charset == src->charset)
- return src;
-
- return new_charset->to_charset(interp, src);
+ return Parrot_str_change_encoding(interp, src, charset_nr);
}
@@ -3166,8 +3052,7 @@
=item C<STRING* Parrot_str_change_encoding(PARROT_INTERP, STRING *src, INTVAL
encoding_nr)>
-Converts C<src> to the given charset or encoding and returns the result as a
-new string.
+Converts C<src> to the given encoding and returns the result as a new string.
=cut
@@ -3181,7 +3066,7 @@
INTVAL encoding_nr)
{
ASSERT_ARGS(Parrot_str_change_encoding)
- const ENCODING *new_encoding;
+ const STR_VTABLE *new_encoding;
if (STRING_IS_NULL(src))
return STRINGNULL;
@@ -3223,7 +3108,7 @@
if (!src->strlen)
return Parrot_str_new_noinit(interp, enum_stringrep_one, 0);
- return CHARSET_COMPOSE(interp, src);
+ return STRING_compose(interp, src);
}
@@ -3325,7 +3210,7 @@
do {
const String_iter old_iter = iter;
- STRING_ITER_SKIP(interp, str, &iter, 1);
+ STRING_iter_skip(interp, str, &iter, 1);
tstr = Parrot_str_iter_substr(interp, str, &old_iter, &iter);
VTABLE_set_string_keyed_int(interp, res, old_iter.charpos, tstr);
} while (iter.charpos < slen);
Deleted: trunk/src/string/charset.c
==============================================================================
--- trunk/src/string/charset.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,640 +0,0 @@
-/*
-Copyright (C) 2004-2009, Parrot Foundation.
-$Id$
-
-=head1 NAME
-
-src/string/charset.c - global charset functions
-
-=head1 DESCRIPTION
-
-These are Parrot's generic charset handling functions
-
-=over 4
-
-=cut
-
-*/
-
-#define PARROT_NO_EXTERN_CHARSET_PTRS
-#include "parrot/parrot.h"
-
-#include "encoding/fixed_8.h"
-#include "encoding/utf8.h"
-#include "encoding/utf16.h"
-#include "encoding/ucs2.h"
-#include "encoding/ucs4.h"
-
-#include "charset/ascii.h"
-#include "charset/binary.h"
-#include "charset/iso-8859-1.h"
-#include "charset/unicode.h"
-
-const CHARSET *Parrot_iso_8859_1_charset_ptr;
-const CHARSET *Parrot_binary_charset_ptr;
-const CHARSET *Parrot_default_charset_ptr;
-const CHARSET *Parrot_unicode_charset_ptr;
-const CHARSET *Parrot_ascii_charset_ptr;
-
-/* all registered charsets are collected in one global structure */
-
-typedef struct To_converter {
- NOTNULL(const CHARSET *to);
- NOTNULL(charset_converter_t func);
-} To_converter;
-
-typedef struct One_charset {
- NOTNULL(CHARSET *charset);
- STRING *name;
- To_converter *to_converters;
- int n_converters;
-} One_charset;
-
-typedef struct All_charsets {
- One_charset *set;
- int n_charsets;
-} All_charsets;
-
-static All_charsets *all_charsets;
-
-/* HEADERIZER HFILE: include/parrot/charset.h */
-
-/* HEADERIZER BEGIN: static */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-static void Parrot_str_internal_register_charset_names(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-static INTVAL register_charset(PARROT_INTERP,
- ARGIN(const char *charsetname),
- ARGIN(CHARSET *charset))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-static void register_static_converters(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_Parrot_str_internal_register_charset_names \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_register_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(charsetname) \
- , PARROT_ASSERT_ARG(charset))
-#define ASSERT_ARGS_register_static_converters __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
-
-/*
-
-=item C<CHARSET * Parrot_new_charset(PARROT_INTERP)>
-
-Allocates a new C<CHARSET> structure from the system.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_CANNOT_RETURN_NULL
-PARROT_MALLOC
-CHARSET *
-Parrot_new_charset(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_new_charset)
- return mem_gc_allocate_zeroed_typed(interp, CHARSET);
-}
-
-/*
-
-=item C<void Parrot_charsets_encodings_deinit(PARROT_INTERP)>
-
-Deinitializes (unloads) the charset system. Frees all charsets and the array
-that holds the charsets back to the system.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-void
-Parrot_charsets_encodings_deinit(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_charsets_encodings_deinit)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (all_charsets->set[i].n_converters)
- mem_gc_free(interp, all_charsets->set[i].to_converters);
- mem_gc_free(interp, all_charsets->set[i].charset);
- }
- mem_gc_free(interp, all_charsets->set);
- mem_gc_free(interp, all_charsets);
- all_charsets = NULL;
- parrot_deinit_encodings(interp);
-}
-
-/*
-
-=item C<const CHARSET * Parrot_find_charset(PARROT_INTERP, const char
-*charsetname)>
-
-Searches through the list of charsets for the charset given by C<charsetname>.
-Returns the charset if it is found, NULL otherwise.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const CHARSET *
-Parrot_find_charset(SHIM_INTERP, ARGIN(const char *charsetname))
-{
- ASSERT_ARGS(Parrot_find_charset)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (STREQ(all_charsets->set[i].charset->name, charsetname))
- return all_charsets->set[i].charset;
- }
-
- return NULL;
-}
-
-/*
-
-=item C<const CHARSET * Parrot_load_charset(PARROT_INTERP, const char
-*charsetname)>
-
-Throws an exception (Can't load charsets dynamically yet. https://trac.parrot.org/parrot/wiki/StringsTasklist).
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const CHARSET *
-Parrot_load_charset(PARROT_INTERP, ARGIN(const char *charsetname))
-{
- ASSERT_ARGS(Parrot_load_charset)
- UNUSED(charsetname);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Can't load charsets yet");
-}
-
-/*
-
-=item C<INTVAL Parrot_charset_number(PARROT_INTERP, const STRING *charsetname)>
-
-Return the number of the charset or -1 if not found.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_WARN_UNUSED_RESULT
-INTVAL
-Parrot_charset_number(PARROT_INTERP, ARGIN(const STRING *charsetname))
-{
- ASSERT_ARGS(Parrot_charset_number)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (Parrot_str_equal(interp, all_charsets->set[i].name, charsetname))
- return i;
- }
- return -1;
-}
-
-/*
-
-=item C<INTVAL Parrot_charset_number_of_str(PARROT_INTERP, const STRING *src)>
-
-Return the number of the charset of the given string or -1 if not found.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_WARN_UNUSED_RESULT
-INTVAL
-Parrot_charset_number_of_str(SHIM_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(Parrot_charset_number_of_str)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (src->charset == all_charsets->set[i].charset)
- return i;
- }
- return -1;
-}
-
-/*
-
-=item C<STRING * Parrot_charset_name(PARROT_INTERP, INTVAL number_of_charset)>
-
-Returns the name of the charset given by the INTVAL index
-C<number_of_charset>.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-STRING *
-Parrot_charset_name(SHIM_INTERP, INTVAL number_of_charset)
-{
- ASSERT_ARGS(Parrot_charset_name)
- if (number_of_charset < 0 || number_of_charset >= all_charsets->n_charsets)
- return STRINGNULL;
- return all_charsets->set[number_of_charset].name;
-}
-
-/*
-
-=item C<const CHARSET * Parrot_get_charset(PARROT_INTERP, INTVAL
-number_of_charset)>
-
-Returns the charset given by the INTVAL index C<number_of_charset>.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const CHARSET *
-Parrot_get_charset(SHIM_INTERP, INTVAL number_of_charset)
-{
- ASSERT_ARGS(Parrot_get_charset)
- if (number_of_charset < 0 || number_of_charset >= all_charsets->n_charsets)
- return NULL;
- return all_charsets->set[number_of_charset].charset;
-}
-
-/*
-
-=item C<const char * Parrot_charset_c_name(PARROT_INTERP, INTVAL
-number_of_charset)>
-
-Returns a NULL-terminated C string with the name of the charset given by
-INTVAL index C<number_of_charset>.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_CAN_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-const char *
-Parrot_charset_c_name(SHIM_INTERP, INTVAL number_of_charset)
-{
- ASSERT_ARGS(Parrot_charset_c_name)
- if (number_of_charset < 0 || number_of_charset >= all_charsets->n_charsets)
- return NULL;
- return all_charsets->set[number_of_charset].charset->name;
-}
-
-/*
-
-=item C<static INTVAL register_charset(PARROT_INTERP, const char *charsetname,
-CHARSET *charset)>
-
-Adds a new charset C<charset> with name <charsetname> to the list of
-all charsets. Returns 0 and does nothing if a charset with that name
-already exists. Returns 1 otherwise.
-
-=cut
-
-*/
-
-static INTVAL
-register_charset(PARROT_INTERP, ARGIN(const char *charsetname),
- ARGIN(CHARSET *charset))
-{
- ASSERT_ARGS(register_charset)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (STREQ(all_charsets->set[i].charset->name, charsetname))
- return 0;
- }
- /*
- * TODO
- * this needs either a LOCK or we just forbid dynamic
- * loading of charsets from inside threads
- */
- if (!n)
- all_charsets->set = mem_gc_allocate_zeroed_typed(interp, One_charset);
- else
- all_charsets->set = mem_gc_realloc_n_typed_zeroed(interp,
- all_charsets->set, n + 1, n, One_charset);
-
- ++all_charsets->n_charsets;
- all_charsets->set[n].charset = charset;
- all_charsets->set[n].n_converters = 0;
-
- return 1;
-}
-
-/*
-
-=item C<static void Parrot_str_internal_register_charset_names(PARROT_INTERP)>
-
-Helper function for initializing characterset names. We can't create the
-STRING names until the default encodings and charsets are already initted,
-so the name generation is split into a second init stage.
-
-=cut
-
-*/
-
-static void
-Parrot_str_internal_register_charset_names(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_str_internal_register_charset_names)
- int n;
- for (n = 0; n < all_charsets->n_charsets; ++n)
- all_charsets->set[n].name =
- Parrot_str_new_constant(interp, all_charsets->set[n].charset->name);
-}
-
-/*
-
-=item C<static void register_static_converters(PARROT_INTERP)>
-
-Registers several standard converters between common charsets, including:
-
- ISO 8859_1 -> ascii
- ISO 8859_1 -> bin
- ascii -> bin
- ascii -> ISO 8859_1
-
-=cut
-
-*/
-
-static void
-register_static_converters(PARROT_INTERP)
-{
- ASSERT_ARGS(register_static_converters)
- Parrot_register_charset_converter(interp,
- Parrot_iso_8859_1_charset_ptr, Parrot_ascii_charset_ptr,
- charset_cvt_iso_8859_1_to_ascii);
- Parrot_register_charset_converter(interp,
- Parrot_iso_8859_1_charset_ptr, Parrot_binary_charset_ptr,
- charset_cvt_ascii_to_binary);
-
- Parrot_register_charset_converter(interp,
- Parrot_ascii_charset_ptr, Parrot_binary_charset_ptr,
- charset_cvt_ascii_to_binary);
- Parrot_register_charset_converter(interp,
- Parrot_ascii_charset_ptr, Parrot_iso_8859_1_charset_ptr,
- charset_cvt_ascii_to_iso_8859_1);
-}
-
-/*
-
-=item C<INTVAL Parrot_register_charset(PARROT_INTERP, const char *charsetname,
-CHARSET *charset)>
-
-Register a new charset C<charset> with name C<charsetname>. Charset may only
-be one of the 4 following names:
-
- binary
- iso-8859-1
- unicode
- ascii
-
-Attempts to register other charsets are ignored. Returns 0 if the registration
-failed, for any reason.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-INTVAL
-Parrot_register_charset(PARROT_INTERP, ARGIN(const char *charsetname),
- ARGIN(CHARSET *charset))
-{
- ASSERT_ARGS(Parrot_register_charset)
- if (!all_charsets) {
- all_charsets = mem_gc_allocate_zeroed_typed(interp, All_charsets);
- all_charsets->set = NULL;
- all_charsets->n_charsets = 0;
- }
-
- if (STREQ("binary", charsetname)) {
- Parrot_binary_charset_ptr = charset;
- return register_charset(interp, charsetname, charset);
- }
-
- if (STREQ("iso-8859-1", charsetname)) {
- Parrot_iso_8859_1_charset_ptr = charset;
- return register_charset(interp, charsetname, charset);
- }
-
- if (STREQ("unicode", charsetname)) {
- Parrot_unicode_charset_ptr = charset;
- return register_charset(interp, charsetname, charset);
- }
-
- if (STREQ("ascii", charsetname)) {
- if (!Parrot_default_charset_ptr)
- Parrot_default_charset_ptr = charset;
-
- Parrot_ascii_charset_ptr = charset;
- return register_charset(interp, charsetname, charset);
- }
-
- return 0;
-}
-
-/*
-
-=item C<void Parrot_charsets_encodings_init(PARROT_INTERP)>
-
-Creates the initial charsets and encodings, and registers the initial
-charset converters.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-void
-Parrot_charsets_encodings_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_charsets_encodings_init)
- /* the order is crucial here:
- * 1) encodings, default = fixed_8
- * 2) charsets default = ascii */
- Parrot_encoding_fixed_8_init(interp);
- Parrot_encoding_utf8_init(interp);
- Parrot_encoding_ucs2_init(interp);
- Parrot_encoding_utf16_init(interp);
- Parrot_encoding_ucs4_init(interp);
-
- Parrot_charset_ascii_init(interp);
- Parrot_charset_iso_8859_1_init(interp);
- Parrot_charset_binary_init(interp);
- Parrot_charset_unicode_init(interp);
-
- /* Now that the plugins are registered, we can create STRING
- * names for them. */
- Parrot_str_internal_register_encoding_names(interp);
- Parrot_str_internal_register_charset_names(interp);
-
- /* now install charset converters */
- register_static_converters(interp);
-}
-
-/*
-
-=item C<INTVAL Parrot_make_default_charset(PARROT_INTERP, const char
-*charsetname, const CHARSET *charset)>
-
-Sets the current default charset to C<charset> with name C<charsetname>.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-INTVAL
-Parrot_make_default_charset(SHIM_INTERP, SHIM(const char *charsetname),
- ARGIN(const CHARSET *charset))
-{
- ASSERT_ARGS(Parrot_make_default_charset)
- Parrot_default_charset_ptr = charset;
- return 1;
-}
-
-/*
-
-=item C<const CHARSET * Parrot_default_charset(PARROT_INTERP)>
-
-Returns the default charset.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_WARN_UNUSED_RESULT
-PARROT_CAN_RETURN_NULL
-const CHARSET *
-Parrot_default_charset(SHIM_INTERP)
-{
- ASSERT_ARGS(Parrot_default_charset)
- return Parrot_default_charset_ptr;
-}
-
-/*
-
-=item C<charset_converter_t Parrot_find_charset_converter(PARROT_INTERP, const
-CHARSET *lhs, const CHARSET *rhs)>
-
-Finds a converter from charset C<lhs> to charset C<rhs>.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_PURE_FUNCTION
-PARROT_WARN_UNUSED_RESULT
-PARROT_CAN_RETURN_NULL
-charset_converter_t
-Parrot_find_charset_converter(SHIM_INTERP,
- ARGIN(const CHARSET *lhs), ARGIN(const CHARSET *rhs))
-{
- ASSERT_ARGS(Parrot_find_charset_converter)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (lhs == all_charsets->set[i].charset) {
- const One_charset * const left = all_charsets->set + i;
- const int nc = left->n_converters;
- int j;
-
- for (j = 0; j < nc; ++j) {
- if (left->to_converters[j].to == rhs)
- return left->to_converters[j].func;
- }
- }
- }
- return NULL;
-}
-
-/*
-
-=item C<void Parrot_register_charset_converter(PARROT_INTERP, const CHARSET
-*lhs, const CHARSET *rhs, charset_converter_t func)>
-
-Registers a converter C<func> from charset C<lhs> to C<rhs>.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-void
-Parrot_register_charset_converter(PARROT_INTERP,
- ARGIN(const CHARSET *lhs), ARGIN(const CHARSET *rhs),
- ARGIN(charset_converter_t func))
-{
- ASSERT_ARGS(Parrot_register_charset_converter)
- int i;
- const int n = all_charsets->n_charsets;
-
- for (i = 0; i < n; ++i) {
- if (lhs == all_charsets->set[i].charset) {
- One_charset * const left = all_charsets->set + i;
- const int nc = left->n_converters++;
-
- if (nc) {
- left->to_converters = mem_gc_realloc_n_typed_zeroed(interp,
- left->to_converters, nc + 1, nc, To_converter);
- }
- else
- left->to_converters = mem_gc_allocate_zeroed_typed(interp, To_converter);
- left->to_converters[nc].to = rhs;
- left->to_converters[nc].func = func;
- }
- }
-}
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/ascii.c
==============================================================================
--- trunk/src/string/charset/ascii.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,876 +0,0 @@
-/*
-Copyright (C) 2004-2010, Parrot Foundation.
-$Id$
-
-=head1 NAME
-
-src/string/charset/ascii.c
-
-=head1 DESCRIPTION
-
-This file implements the charset functions for ascii data and common
-charset functionality for similar charsets like iso-8859-1.
-
-=over 4
-
-=cut
-
-*/
-
-#include "parrot/parrot.h"
-#include "ascii.h"
-
-/*
- * TODO check interpreter error and warnings setting
- */
-
-#include "tables.h"
-
-/* HEADERIZER HFILE: src/string/charset/ascii.h */
-
-/* HEADERIZER BEGIN: static */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* compose(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* decompose(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static INTVAL find_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-static INTVAL find_not_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-PARROT_WARN_UNUSED_RESULT
-static INTVAL is_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_ascii(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_charset(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL validate(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-#define ASSERT_ARGS_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_decompose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_string_from_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_ascii __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
-
-/*
-
-=item C<STRING * ascii_get_graphemes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns C<count> graphemes of STRING C<src>, beginning at C<offset>. The
-request is forwarded unchanged to C<ENCODING_GET_BYTES>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-STRING *
-ascii_get_graphemes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
-    ASSERT_ARGS(ascii_get_graphemes)
-    STRING * const graphemes = ENCODING_GET_BYTES(interp, src, offset, count);
-
-    return graphemes;
-}
-
-/*
-
-=item C<static STRING * to_ascii(PARROT_INTERP, const STRING *src)>
-
-Returns a new ASCII STRING holding the characters of C<src>. Throws
-C<EXCEPTION_LOSSY_CONVERSION> as soon as a codepoint of 128 or above is
-encountered, since such a character is unconvertible.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-to_ascii(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(to_ascii)
-    const UINTVAL  char_count = src->strlen;
-
-    /* The converted string can't be longer than the source, so a clone
-     * of the source serves as the destination. */
-    STRING * const result     = Parrot_str_clone(interp, src);
-    unsigned char *out        = (unsigned char *)result->strstart;
-    String_iter    it;
-
-    STRING_ITER_INIT(interp, &it);
-    while (it.charpos < char_count) {
-        const UINTVAL cp = STRING_ITER_GET_AND_ADVANCE(interp, src, &it);
-
-        if (cp >= 128)
-            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
-                    "can't convert unicode string to ascii");
-
-        *out = (unsigned char)cp;
-        ++out;
-    }
-
-    result->bufused  = char_count;
-    result->strlen   = char_count;
-
-    /* The charset has to be set before its preferred encoding is looked up. */
-    result->charset  = Parrot_ascii_charset_ptr;
-    result->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, result);
-    return result;
-}
-
-/*
-
-=item C<static STRING * to_charset(PARROT_INTERP, const STRING *src)>
-
-Returns STRING C<src> converted to the ASCII charset. A converter
-registered for the source charset is used when one exists; otherwise the
-conversion falls back to C<to_ascii>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-to_charset(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(to_charset)
-    const charset_converter_t converter =
-        Parrot_find_charset_converter(interp, src->charset, Parrot_ascii_charset_ptr);
-
-    /* No dedicated converter: use the generic codepoint-wise conversion. */
-    if (!converter)
-        return to_ascii(interp, src);
-
-    return converter(interp, src);
-}
-
-/*
-
-=item C<static STRING* compose(PARROT_INTERP, const STRING *src)>
-
-ASCII strings can't be composed, so this simply returns the result of
-C<Parrot_str_copy> on C<src>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-compose(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(compose)
-
-    return Parrot_str_copy(interp, src);
-}
-
-/*
-
-=item C<static STRING* decompose(PARROT_INTERP, const STRING *src)>
-
-ASCII strings can't be decomposed, so this simply returns the result of
-C<Parrot_str_copy> on C<src>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-decompose(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(decompose)
-
-    return Parrot_str_copy(interp, src);
-}
-
-/*
-
-=item C<static STRING* upcase(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of STRING C<src> in which every character has been passed
-through C<toupper>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(upcase)
-    STRING * const copy      = Parrot_str_clone(interp, src);
-    UINTVAL        remaining = src->strlen;
-
-    /* The buffer is left untouched for an empty string. */
-    if (remaining > 0) {
-        char *cur = copy->strstart;
-
-        for (; remaining > 0; --remaining, ++cur)
-            *cur = (char)toupper((unsigned char)*cur);
-    }
-
-    return copy;
-}
-
-/*
-
-=item C<static STRING* downcase(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of STRING C<src> in which every character has been passed
-through C<tolower>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(downcase)
-    STRING * const copy      = Parrot_str_clone(interp, src);
-    UINTVAL        remaining = src->strlen;
-
-    /* The buffer is left untouched for an empty string. */
-    if (remaining > 0) {
-        char *cur = copy->strstart;
-
-        for (; remaining > 0; --remaining, ++cur)
-            *cur = (char)tolower((unsigned char)*cur);
-    }
-
-    return copy;
-}
-
-/*
-
-=item C<static STRING* titlecase(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of STRING C<src> in title case: the first character is
-upper-cased and every following character is lower-cased.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(titlecase)
-    STRING * const copy      = Parrot_str_clone(interp, src);
-    UINTVAL        remaining = src->strlen;
-
-    if (remaining > 0) {
-        char *cur = copy->strstart;
-
-        /* The leading character goes up, everything after it goes down. */
-        *cur = (char)toupper((unsigned char)*cur);
-        while (--remaining > 0) {
-            ++cur;
-            *cur = (char)tolower((unsigned char)*cur);
-        }
-    }
-
-    return copy;
-}
-
-/*
-
-=item C<static STRING* upcase_first(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of STRING C<src> whose first character has been
-upper-cased; the remaining characters are left as they are.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(upcase_first)
-    STRING * const copy = Parrot_str_clone(interp, src);
-    char          *first;
-
-    /* An empty string has no first character to convert. */
-    if (copy->strlen == 0)
-        return copy;
-
-    first  = copy->strstart;
-    *first = (char)toupper((unsigned char)*first);
-    return copy;
-}
-
-/*
-
-=item C<static STRING* downcase_first(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of STRING C<src> whose first character has been
-lower-cased; the remaining characters are left as they are.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(downcase_first)
-    STRING * const copy = Parrot_str_clone(interp, src);
-    char          *first;
-
-    /* An empty string has no first character to convert. */
-    if (copy->strlen == 0)
-        return copy;
-
-    first  = copy->strstart;
-    *first = (char)tolower((unsigned char)*first);
-    return copy;
-}
-
-/*
-
-=item C<static STRING* titlecase_first(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of STRING C<src> whose first character has been converted
-with C<toupper>; the remaining characters are left as they are.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(titlecase_first)
-    STRING * const copy = Parrot_str_clone(interp, src);
-    char          *first;
-
-    /* An empty string has no first character to convert. */
-    if (copy->strlen == 0)
-        return copy;
-
-    first  = copy->strstart;
-    *first = (char)toupper((unsigned char)*first);
-    return copy;
-}
-
-/*
-
-=item C<INTVAL ascii_compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
-
-Compares STRING C<lhs> with STRING C<rhs> over their common length and
-returns -1, 0 or 1 when C<lhs> sorts before, equal to, or after C<rhs>.
-If the common prefix is identical, the shorter string sorts first.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL
-ascii_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
-{
-    ASSERT_ARGS(ascii_compare)
-    const UINTVAL left_len  = lhs->strlen;
-    const UINTVAL right_len = rhs->strlen;
-    const UINTVAL common    = left_len < right_len ? left_len : right_len;
-
-    if (lhs->encoding != rhs->encoding) {
-        /* Different encodings: read lhs by position and rhs through the
-         * iterator, one character at a time. */
-        String_iter it;
-
-        STRING_ITER_INIT(interp, &it);
-        while (it.charpos < common) {
-            const UINTVAL lc = ENCODING_GET_BYTE(interp, lhs, it.charpos);
-            const UINTVAL rc = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &it);
-
-            if (lc != rc)
-                return lc < rc ? -1 : 1;
-        }
-    }
-    else {
-        /* Same encoding: the raw buffers can be compared directly. */
-        const int diff = memcmp(lhs->strstart, rhs->strstart, common);
-
-        if (diff < 0)
-            return -1;
-        if (diff > 0)
-            return 1;
-    }
-
-    /* Equal over the common prefix: the lengths decide. */
-    if (left_len != right_len)
-        return left_len < right_len ? -1 : 1;
-
-    return 0;
-}
-
-/*
-
-=item C<INTVAL mixed_cs_index(PARROT_INTERP, const STRING *src, const STRING
-*search, UINTVAL offs)>
-
-Looks for STRING C<search> in STRING C<src>, starting at character
-position C<offs>, by positioning a string iterator and handing the work
-to C<Parrot_str_iter_index>, whose result is returned. Unlike the
-byte-wise search this is used when the two strings differ in charset.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL
-mixed_cs_index(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search),
-    UINTVAL offs)
-{
-    ASSERT_ARGS(mixed_cs_index)
-    String_iter from;
-    /* Passed on uninitialized; presumably an output of
-     * Parrot_str_iter_index -- TODO confirm. */
-    String_iter to;
-
-    STRING_ITER_INIT(interp, &from);
-    STRING_ITER_SET_POSITION(interp, src, &from, offs);
-
-    return Parrot_str_iter_index(interp, src, &from, &to, search);
-}
-
-/*
-
-=item C<INTVAL ascii_cs_index(PARROT_INTERP, const STRING *src, const STRING
-*search_string, UINTVAL offset)>
-
-Looks for STRING C<search_string> in STRING C<src>, starting at C<offset>.
-Strings of the same charset are searched with C<Parrot_byte_index>;
-strings of different charsets go through C<mixed_cs_index>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL
-ascii_cs_index(PARROT_INTERP, ARGIN(const STRING *src),
-    ARGIN(const STRING *search_string), UINTVAL offset)
-{
-    ASSERT_ARGS(ascii_cs_index)
-
-    if (src->charset == search_string->charset) {
-        /* The byte-wise search relies on the fixed_8 encoding. */
-        PARROT_ASSERT(src->encoding == Parrot_fixed_8_encoding_ptr);
-        return Parrot_byte_index(interp, src, search_string, offset);
-    }
-
-    return mixed_cs_index(interp, src, search_string, offset);
-}
-
-/*
-
-=item C<INTVAL ascii_cs_rindex(PARROT_INTERP, const STRING *src, const STRING
-*search_string, UINTVAL offset)>
-
-Reverse search for STRING C<search_string> in STRING C<src>, starting at
-C<offset>, done by C<Parrot_byte_rindex>. Throws
-C<EXCEPTION_UNIMPLEMENTED> if the two strings differ in charset.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL
-ascii_cs_rindex(PARROT_INTERP, ARGIN(const STRING *src),
-    ARGIN(const STRING *search_string), UINTVAL offset)
-{
-    ASSERT_ARGS(ascii_cs_rindex)
-
-    if (src->charset != search_string->charset)
-        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
-            "Cross-charset index not supported");
-
-    /* The byte-wise search relies on the fixed_8 encoding. */
-    PARROT_ASSERT(src->encoding == Parrot_fixed_8_encoding_ptr);
-    return Parrot_byte_rindex(interp, src, search_string, offset);
-}
-
-/*
-
-=item C<static UINTVAL validate(PARROT_INTERP, const STRING *src)>
-
-Checks every codepoint of STRING C<src>. Returns 0 as soon as a codepoint
-of 0x80 or above is found, and 1 if the whole string is ASCII.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-validate(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(validate)
-    const INTVAL char_count = Parrot_str_length(interp, src);
-    String_iter  it;
-
-    STRING_ITER_INIT(interp, &it);
-    while (it.charpos < char_count) {
-        const UINTVAL cp = STRING_ITER_GET_AND_ADVANCE(interp, src, &it);
-
-        /* Anything above 0x7F is outside the ASCII range. */
-        if (cp > 0x7F)
-            return 0;
-    }
-
-    return 1;
-}
-
-/*
-
-=item C<static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)>
-
-Returns a new one-character "ascii" STRING made from C<codepoint>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-static STRING *
-string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
-{
-    ASSERT_ARGS(string_from_codepoint)
-    /* NOTE(review): the codepoint is narrowed to char without a range
-     * check -- confirm that callers only pass values below 128. */
-    char ch = (char)codepoint;
-
-    return string_make(interp, &ch, 1, "ascii", 0);
-}
-
-/*
-
-=item C<static INTVAL is_cclass(PARROT_INTERP, INTVAL flags, const STRING *src,
-UINTVAL offset)>
-
-Returns 1 if the character of C<src> at C<offset> belongs to one of the
-character classes in C<flags> according to C<Parrot_ascii_typetable>.
-Returns 0 otherwise, including when C<offset> is past the end of the
-string or the codepoint lies outside the table.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static INTVAL
-is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
-{
-    ASSERT_ARGS(is_cclass)
-
-    if (offset < src->strlen) {
-        const UINTVAL cp = ENCODING_GET_CODEPOINT(interp, src, offset);
-
-        /* Only codepoints covered by the table can be in a class. */
-        if (cp < sizeof (Parrot_ascii_typetable) / sizeof (Parrot_ascii_typetable[0])
-        &&  (Parrot_ascii_typetable[cp] & flags))
-            return 1;
-    }
-
-    return 0;
-}
-
-/*
-
-=item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Searches C<src> for a character in the character classes given by
-C<flags>, from C<offset> up to C<offset + count> or the end of the string,
-whichever comes first. The search itself is delegated to the encoding via
-C<ENCODING_FIND_CCLASS>, using C<Parrot_ascii_typetable>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static INTVAL
-find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
-    ASSERT_ARGS(find_cclass)
-    const UINTVAL requested_end = offset + count;
-
-    /* Never search beyond the end of the string. */
-    const UINTVAL limit = requested_end > src->strlen ? src->strlen : requested_end;
-
-    return ENCODING_FIND_CCLASS(interp, src, Parrot_ascii_typetable,
-            flags, offset, limit);
-}
-
-/*
-
-=item C<static INTVAL find_not_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Scans C<src> from C<offset> up to C<offset + count> or the end of the
-string, whichever comes first, and returns the position of the first
-character that is in none of the character classes given by C<flags>.
-A codepoint that lies outside C<Parrot_ascii_typetable> belongs to no
-class and therefore ends the scan. If every scanned character is in the
-class, the end of the scanned range is returned.
-
-=cut
-
-*/
-
-static INTVAL
-find_not_cclass(PARROT_INTERP,
-        INTVAL flags, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
-    ASSERT_ARGS(find_not_cclass)
-    const UINTVAL table_size =
-        sizeof (Parrot_ascii_typetable) / sizeof (Parrot_ascii_typetable[0]);
-    UINTVAL pos = offset;
-    UINTVAL end = offset + count;
-
-    /* Never scan beyond the end of the string. */
-    end = src->strlen < end ? src->strlen : end;
-    for (; pos < end; ++pos) {
-        const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, src, pos);
-
-        /* Check the bound before indexing the table, as is_cclass does;
-         * an out-of-table codepoint is not in any class. */
-        if (codepoint >= table_size
-        ||  (Parrot_ascii_typetable[codepoint] & flags) == 0) {
-            return pos;
-        }
-    }
-    return end;
-}
-
-/*
-
-=item C<size_t ascii_compute_hash(PARROT_INTERP, const STRING *src, size_t
-seed)>
-
-Returns the hash of STRING C<src>, computed byte by byte starting from
-C<seed>: for each byte the running value is multiplied by 33 (itself plus
-itself shifted left by 5) and the byte is added.
-
-=cut
-
-*/
-
-PARROT_PURE_FUNCTION
-size_t
-ascii_compute_hash(SHIM_INTERP, ARGIN(const STRING *src), size_t seed)
-{
-    ASSERT_ARGS(ascii_compute_hash)
-    const char * const bytes = (const char *)src->strstart;
-    const UINTVAL      len   = src->strlen;
-    size_t             hash  = seed;
-    UINTVAL            i;
-
-    PARROT_ASSERT(src->encoding == Parrot_fixed_8_encoding_ptr);
-    for (i = 0; i < len; ++i) {
-        hash += hash << 5;
-        hash += bytes[i];
-    }
-
-    return hash;
-}
-
-/*
-
-=item C<void Parrot_charset_ascii_init(PARROT_INTERP)>
-
-Initializes the ASCII charset by filling a new charset structure with all
-the necessary function pointers, setting fixed_8 as its preferred
-encoding and registering it under the name "ascii".
-
-=cut
-
-*/
-
-void
-Parrot_charset_ascii_init(PARROT_INTERP)
-{
-    ASSERT_ARGS(Parrot_charset_ascii_init)
-    CHARSET * const ascii_cs = Parrot_new_charset(interp);
-
-    /* Template copied into the freshly allocated charset below. */
-    static const CHARSET ascii_template = {
-        "ascii",
-        ascii_get_graphemes,
-        to_charset,
-        compose,
-        decompose,
-        upcase,
-        downcase,
-        titlecase,
-        upcase_first,
-        downcase_first,
-        titlecase_first,
-        ascii_compare,
-        ascii_cs_index,
-        ascii_cs_rindex,
-        validate,
-        is_cclass,
-        find_cclass,
-        find_not_cclass,
-        string_from_codepoint,
-        ascii_compute_hash,
-        NULL
-    };
-
-    STRUCT_COPY_FROM_STRUCT(ascii_cs, ascii_template);
-    ascii_cs->preferred_encoding = Parrot_fixed_8_encoding_ptr;
-    Parrot_register_charset(interp, "ascii", ascii_cs);
-}
-
-/*
-
-=item C<STRING * charset_cvt_ascii_to_binary(PARROT_INTERP, const STRING *src)>
-
-Returns a clone of the ASCII STRING C<src> whose bytes have been copied
-over from C<src> one by one and whose charset is set to binary.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-STRING *
-charset_cvt_ascii_to_binary(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(charset_cvt_ascii_to_binary)
-    STRING * const converted = Parrot_str_clone(interp, src);
-    const UINTVAL  len       = src->strlen;
-    UINTVAL        pos       = 0;
-
-    while (pos < len) {
-        const UINTVAL byte = ENCODING_GET_BYTE(interp, src, pos);
-
-        ENCODING_SET_BYTE(interp, converted, pos, byte);
-        ++pos;
-    }
-
-    converted->charset = Parrot_binary_charset_ptr;
-    return converted;
-}
-
-/*
-
-=item C<STRING * charset_cvt_ascii_to_iso_8859_1(PARROT_INTERP, const STRING
-*src)>
-
-Returns a clone of the ASCII STRING C<src> whose bytes have been copied
-over from C<src> one by one and whose charset is set to ISO-8859-1.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-STRING *
-charset_cvt_ascii_to_iso_8859_1(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(charset_cvt_ascii_to_iso_8859_1)
-    STRING * const converted = Parrot_str_clone(interp, src);
-    const UINTVAL  len       = src->strlen;
-    UINTVAL        pos       = 0;
-
-    while (pos < len) {
-        const UINTVAL byte = ENCODING_GET_BYTE(interp, src, pos);
-
-        ENCODING_SET_BYTE(interp, converted, pos, byte);
-        ++pos;
-    }
-
-    converted->charset = Parrot_iso_8859_1_charset_ptr;
-    return converted;
-}
-
-/*
-
-=back
-
-=cut
-
-*/
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/ascii.h
==============================================================================
--- trunk/src/string/charset/ascii.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,128 +0,0 @@
-/* ascii.h
- * Copyright (C) 2004-2007, Parrot Foundation.
- * SVN Info
- * $Id$
- * Overview:
- * This is the header for the ascii charset functions
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-#ifndef PARROT_CHARSET_ASCII_H_GUARD
-#define PARROT_CHARSET_ASCII_H_GUARD
-
-/*
- * common functions for ascii-ish charsets
- */
-
-/* HEADERIZER BEGIN: src/string/charset/ascii.c */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL ascii_compare(PARROT_INTERP,
- ARGIN(const STRING *lhs),
- ARGIN(const STRING *rhs))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_PURE_FUNCTION
-size_t ascii_compute_hash(SHIM_INTERP,
- ARGIN(const STRING *src),
- size_t seed)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL ascii_cs_index(PARROT_INTERP,
- ARGIN(const STRING *src),
- ARGIN(const STRING *search_string),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL ascii_cs_rindex(PARROT_INTERP,
- ARGIN(const STRING *src),
- ARGIN(const STRING *search_string),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-STRING * ascii_get_graphemes(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-STRING * charset_cvt_ascii_to_binary(PARROT_INTERP,
- ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-STRING * charset_cvt_ascii_to_iso_8859_1(PARROT_INTERP,
- ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-INTVAL mixed_cs_index(PARROT_INTERP,
- ARGIN(const STRING *src),
- ARGIN(const STRING *search),
- UINTVAL offs)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-void Parrot_charset_ascii_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_ascii_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs))
-#define ASSERT_ARGS_ascii_compute_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_ascii_cs_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(search_string))
-#define ASSERT_ARGS_ascii_cs_rindex __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(search_string))
-#define ASSERT_ARGS_ascii_get_graphemes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_charset_cvt_ascii_to_binary __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_charset_cvt_ascii_to_iso_8859_1 \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_mixed_cs_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(search))
-#define ASSERT_ARGS_Parrot_charset_ascii_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/charset/ascii.c */
-
-#endif /* PARROT_CHARSET_ASCII_H_GUARD */
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/binary.c
==============================================================================
--- trunk/src/string/charset/binary.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,494 +0,0 @@
-/*
-Copyright (C) 2004-2010, Parrot Foundation.
-$Id$
-
-=head1 NAME
-
-src/string/charset/binary.c
-
-=head1 DESCRIPTION
-
-This file implements the charset functions for binary data
-
-=over 4
-
-=cut
-
-*/
-
-#include "parrot/parrot.h"
-
-/* In local src/string/charset/ directory */
-#include "ascii.h"
-#include "binary.h"
-
-/* HEADERIZER HFILE: src/string/charset/binary.h */
-
-/* HEADERIZER BEGIN: static */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-static INTVAL compare(SHIM_INTERP,
- ARGIN(const STRING *lhs),
- ARGIN(const STRING *rhs))
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* compose(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* decompose(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-static INTVAL find_cclass(SHIM_INTERP,
- SHIM(INTVAL flags),
- SHIM(const STRING *src),
- UINTVAL offset,
- UINTVAL count);
-
-static INTVAL find_not_cclass(SHIM_INTERP,
- SHIM(INTVAL flags),
- SHIM(const STRING *src),
- UINTVAL offset,
- UINTVAL count);
-
-static INTVAL is_cclass(SHIM_INTERP,
- SHIM(INTVAL flags),
- SHIM(const STRING *src),
- SHIM(UINTVAL offset));
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* to_charset(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-static UINTVAL validate(SHIM_INTERP, SHIM(const STRING *src));
-#define ASSERT_ARGS_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs))
-#define ASSERT_ARGS_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_decompose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-#define ASSERT_ARGS_string_from_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_to_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
-
-/* Drop any earlier definition of EXCEPTION before installing ours. */
-#undef EXCEPTION
-
-/* Shorthand for throwing a Parrot exception; expects a variable named
- * interp to be in scope at the point of use. */
-#define EXCEPTION(code, msg) \
-    Parrot_ex_throw_from_c_args(interp, NULL, (code), (msg))
-
-
-/*
-
-=item C<static STRING* to_charset(PARROT_INTERP, const STRING *src)>
-
-Returns STRING C<src> converted to the binary charset by the converter
-registered for its charset. Throws C<EXCEPTION_UNIMPLEMENTED> if no such
-converter has been registered.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-to_charset(PARROT_INTERP, ARGIN(const STRING *src))
-{
-    ASSERT_ARGS(to_charset)
-    const charset_converter_t converter =
-        Parrot_find_charset_converter(interp, src->charset, Parrot_binary_charset_ptr);
-
-    /* There is no generic fallback for binary data. */
-    if (!converter)
-        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
-            "to_charset for binary not implemented");
-
-    return converter(interp, src);
-}
-
-/*
-
-=item C<static STRING* compose(PARROT_INTERP, const STRING *src)>
-
-Always throws C<EXCEPTION_INVALID_CHARTYPE>: binary data can't be
-composed.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-compose(PARROT_INTERP, SHIM(const STRING *src))
-{
-    ASSERT_ARGS(compose)
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
-        "Can't compose binary data");
-}
-
-/*
-
-=item C<static STRING* decompose(PARROT_INTERP, const STRING *src)>
-
-Always throws C<EXCEPTION_INVALID_CHARTYPE>: binary data can't be
-decomposed.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-decompose(PARROT_INTERP, SHIM(const STRING *src))
-{
-    ASSERT_ARGS(decompose)
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
-        "Can't decompose binary data");
-}
-
-/*
-
-=item C<static STRING* upcase(PARROT_INTERP, const STRING *src)>
-
-Always throws C<EXCEPTION_INVALID_CHARTYPE>: binary data has no upper
-case.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase(PARROT_INTERP, SHIM(const STRING *src))
-{
-    ASSERT_ARGS(upcase)
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
-        "Can't upcase binary data");
-}
-
-/*
-
-=item C<static STRING* downcase(PARROT_INTERP, const STRING *src)>
-
-Always throws C<EXCEPTION_INVALID_CHARTYPE>: binary data has no lower
-case.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase(PARROT_INTERP, SHIM(const STRING *src))
-{
-    ASSERT_ARGS(downcase)
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
-        "Can't downcase binary data");
-}
-
-/*
-
-=item C<static STRING* titlecase(PARROT_INTERP, const STRING *src)>
-
-Always throws C<EXCEPTION_INVALID_CHARTYPE>: binary data has no title
-case.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase(PARROT_INTERP, SHIM(const STRING *src))
-{
-    ASSERT_ARGS(titlecase)
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
-        "Can't titlecase binary data");
-}
-
-/*
-
-=item C<static STRING* upcase_first(PARROT_INTERP, const STRING *src)>
-
-Always throws C<EXCEPTION_INVALID_CHARTYPE>: the first "character" of
-binary data can't be upper-cased.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase_first(PARROT_INTERP, SHIM(const STRING *src))
-{
-    ASSERT_ARGS(upcase_first)
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARTYPE,
-        "Can't upcase binary data");
-}
-
-/*
-
-=item C<static STRING* downcase_first(PARROT_INTERP, const STRING *src)>
-
-Throws an exception because we cannot set the first "character"
-of the binary string to lowercase.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase_first(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(downcase_first)
- EXCEPTION(EXCEPTION_INVALID_CHARTYPE, "Can't downcase binary data");
-}
-
-/*
-
-=item C<static STRING* titlecase_first(PARROT_INTERP, const STRING *src)>
-
-Throws an exception because we can't convert the first "character"
-of binary data to title case.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(titlecase_first)
- EXCEPTION(EXCEPTION_INVALID_CHARTYPE, "Can't titlecase binary data");
-}
-
-/*
-
-=item C<static INTVAL compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
-
-Compare the two buffers, first by size, then with memcmp.
-
-=cut
-
-*/
-
-static INTVAL
-compare(SHIM_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
-{
- ASSERT_ARGS(compare)
- const UINTVAL l_len = lhs->strlen;
- const UINTVAL r_len = rhs->strlen;
- if (l_len != r_len)
- return l_len - r_len;
-
- return memcmp(lhs->strstart, rhs->strstart, l_len);
-}
-
-/*
-
-=item C<static UINTVAL validate(PARROT_INTERP, const STRING *src)>
-
-Returns 1. All sequential data is valid binary data.
-
-=cut
-
-*/
-
-/* Binary's always valid */
-static UINTVAL
-validate(SHIM_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(validate)
- return 1;
-}
-
-/*
-
-=item C<static INTVAL is_cclass(PARROT_INTERP, INTVAL flags, const STRING *src,
-UINTVAL offset)>
-
-Always returns 0: binary data is never matched against a character class.
-
-=cut
-
-*/
-
-static INTVAL
-is_cclass(SHIM_INTERP, SHIM(INTVAL flags), SHIM(const STRING *src), SHIM(UINTVAL offset))
-{
- ASSERT_ARGS(is_cclass)
- return 0;
-}
-
-/*
-
-=item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Performs no search on binary data; always returns C<offset + count>.
-
-=cut
-
-*/
-
-static INTVAL
-find_cclass(SHIM_INTERP, SHIM(INTVAL flags),
- SHIM(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(find_cclass)
- return offset + count;
-}
-
-/*
-
-=item C<static INTVAL find_not_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Performs no search on binary data; always returns C<offset + count>.
-
-=cut
-
-*/
-
-static INTVAL
-find_not_cclass(SHIM_INTERP, SHIM(INTVAL flags),
- SHIM(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(find_not_cclass)
- return offset + count;
-}
-
-/*
-
-=item C<static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)>
-
-Creates a new STRING object from a single codepoint C<codepoint>. Returns
-the new STRING.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
-{
- ASSERT_ARGS(string_from_codepoint)
- STRING *return_string;
- char real_codepoint = (char)codepoint;
- return_string = string_make(interp, &real_codepoint, 1, "binary", 0);
- return return_string;
-}
-
-
-/*
-
-=item C<void Parrot_charset_binary_init(PARROT_INTERP)>
-
-Initialize the binary charset, including function pointers and
-settings.
-
-=cut
-
-*/
-
-void
-Parrot_charset_binary_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_charset_binary_init)
- CHARSET * const return_set = Parrot_new_charset(interp);
- static const CHARSET base_set = {
- "binary",
- ascii_get_graphemes,
- to_charset,
- compose,
- decompose,
- upcase,
- downcase,
- titlecase,
- upcase_first,
- downcase_first,
- titlecase_first,
- compare,
- ascii_cs_index,
- ascii_cs_rindex,
- validate,
- is_cclass,
- find_cclass,
- find_not_cclass,
- string_from_codepoint,
- ascii_compute_hash,
- NULL
- };
-
- STRUCT_COPY_FROM_STRUCT(return_set, base_set);
- return_set->preferred_encoding = Parrot_fixed_8_encoding_ptr;
- Parrot_register_charset(interp, "binary", return_set);
-
- return;
-
-}
-
-/*
-
-=back
-
-=cut
-
-*/
-
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/binary.h
==============================================================================
--- trunk/src/string/charset/binary.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,34 +0,0 @@
-/* binary.h
- * Copyright (C) 2004-2007, Parrot Foundation.
- * SVN Info
- * $Id$
- * Overview:
- * This is the header for the binary charset functions
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-#ifndef PARROT_CHARSET_BINARY_H_GUARD
-#define PARROT_CHARSET_BINARY_H_GUARD
-
-/* HEADERIZER BEGIN: src/string/charset/binary.c */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-void Parrot_charset_binary_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_Parrot_charset_binary_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/charset/binary.c */
-
-#endif /* PARROT_CHARSET_BINARY_H_GUARD */
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/iso-8859-1.c
==============================================================================
--- trunk/src/string/charset/iso-8859-1.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,733 +0,0 @@
-/*
-Copyright (C) 2004-2010, Parrot Foundation.
-$Id$
-
-=head1 NAME
-
-src/string/charset/iso-8859-1.c
-
-=head1 DESCRIPTION
-
-This file implements the charset functions for iso-8859-1 data
-
-=over 4
-
-=cut
-
-*/
-
-#include "parrot/parrot.h"
-#include "iso-8859-1.h"
-#include "ascii.h"
-
-/* HEADERIZER HFILE: src/string/charset/iso-8859-1.h */
-
-/* HEADERIZER BEGIN: static */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-static STRING* compose(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* decompose(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static INTVAL find_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-static INTVAL find_not_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-static INTVAL is_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-static STRING * to_charset(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_iso_8859_1(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_unicode(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static UINTVAL validate(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-#define ASSERT_ARGS_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_decompose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_string_from_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_iso_8859_1 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_unicode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
-
-#include "tables.h"
-
-/*
-
-=item C<static STRING * to_iso_8859_1(PARROT_INTERP, const STRING *src)>
-
-Converts STRING C<src> to iso-8859-1 in STRING C<dest>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-to_iso_8859_1(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(to_iso_8859_1)
- UINTVAL src_len;
- String_iter iter;
- /* iso-8859-1 is never bigger than source */
- STRING * dest = Parrot_str_clone(interp, src);
-
- STRING_ITER_INIT(interp, &iter);
- src_len = src->strlen;
- dest->bufused = src_len;
- while (iter.charpos < src_len) {
- const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- if (c >= 0x100)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
- "lossy conversion to iso-8559-1");
-
- Parrot_fixed_8_encoding_ptr->set_byte(interp, dest, iter.charpos - 1, c);
- }
- dest->charset = Parrot_iso_8859_1_charset_ptr;
- dest->encoding = Parrot_fixed_8_encoding_ptr;
- return dest;
-}
-
-/*
-
-=item C<static STRING * to_unicode(PARROT_INTERP, const STRING *src)>
-
-Converts STRING C<src> to unicode STRING C<dest>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-to_unicode(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(to_unicode)
- STRING * dest = Parrot_str_clone(interp, src);
- UINTVAL offs;
- String_iter iter;
-
- dest->charset = Parrot_unicode_charset_ptr;
- dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest);
- Parrot_gc_reallocate_string_storage(interp, dest, src->strlen);
- STRING_ITER_INIT(interp, &iter);
- while (iter.charpos < src->strlen) {
- const UINTVAL c = ENCODING_GET_BYTE(interp, src, iter.charpos);
-
- if (iter.bytepos >= Buffer_buflen(dest) - 4) {
- UINTVAL need = (UINTVAL)((src->strlen - iter.charpos) * 1.5);
- if (need < 16)
- need = 16;
- Parrot_gc_reallocate_string_storage(interp, dest,
- Buffer_buflen(dest) + need);
- }
- STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, c);
- }
- dest->bufused = iter.bytepos;
- dest->strlen = iter.charpos;
- return dest;
-}
-
-/*
-
-=item C<static STRING * to_charset(PARROT_INTERP, const STRING *src)>
-
-Converts the STRING C<src> to an ISO-8859-1 STRING C<dest>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-static STRING *
-to_charset(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(to_charset)
- const charset_converter_t conversion_func =
- Parrot_find_charset_converter(interp, src->charset, Parrot_iso_8859_1_charset_ptr);
-
- if (conversion_func)
- return conversion_func(interp, src);
- else
- return to_iso_8859_1(interp, src);
-}
-
-
-/*
-
-=item C<static STRING* compose(PARROT_INTERP, const STRING *src)>
-
-ISO-8859-1 does not support composing, so we just copy the STRING C<src> and return the
-copy.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-static STRING*
-compose(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(compose)
-
- STRING * const dest = Parrot_str_copy(interp, src);
-
- return dest;
-}
-
-/*
-
-=item C<static STRING* decompose(PARROT_INTERP, const STRING *src)>
-
-ISO-8859-1 does not support decomposing, so we throw an exception.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-decompose(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(decompose)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "decompose for iso-8859-1 not implemented");
-}
-
-/*
-
-=item C<static STRING* upcase(PARROT_INTERP, const STRING *src)>
-
-Convert all graphemes in the STRING C<src> to upper case, for those
-graphemes that support cases.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(upcase)
- unsigned char *buffer;
- UINTVAL offset = 0;
- STRING *result = Parrot_str_clone(interp, src);
-
- if (!result->strlen)
- return result;
-
- buffer = (unsigned char *)result->strstart;
- for (offset = 0; offset < result->strlen; ++offset) {
- unsigned int c = buffer[offset]; /* XXX use encoding ? */
- if (c >= 0xe0 && c != 0xf7)
- c &= ~0x20;
- else
- c = toupper((unsigned char)c);
- buffer[offset] = (unsigned char)c;
- }
-
- return result;
-}
-
-/*
-
-=item C<static STRING* downcase(PARROT_INTERP, const STRING *src)>
-
-Converts all graphemes in STRING C<src> to lower-case, for those graphemes
-that support cases.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(downcase)
- unsigned char *buffer;
- UINTVAL offset = 0;
- STRING *result = Parrot_str_clone(interp, src);
-
- if (!result->strlen)
- return result;
-
- buffer = (unsigned char *)result->strstart;
- for (offset = 0; offset < result->strlen; ++offset) {
- unsigned int c = buffer[offset];
- if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
- c |= 0x20;
- else
- c = tolower((unsigned char)c);
- buffer[offset] = (unsigned char)c;
- }
-
- return result;
-}
-
-/*
-
-=item C<static STRING* titlecase(PARROT_INTERP, const STRING *src)>
-
-Converts the graphemes in STRING C<src> to title case, for those graphemes
-that support cases.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(titlecase)
- unsigned char *buffer;
- unsigned int c;
- UINTVAL offset;
- STRING *result = Parrot_str_clone(interp, src);
-
- if (!result->strlen)
- return result;
-
- buffer = (unsigned char *)result->strstart;
- c = buffer[0];
- if (c >= 0xe0 && c != 0xf7)
- c &= ~0x20;
- else
- c = toupper((unsigned char)c);
- buffer[0] = (unsigned char)c;
-
- for (offset = 1; offset < result->strlen; ++offset) {
- c = buffer[offset];
- if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
- c |= 0x20;
- else
- c = tolower((unsigned char)c);
- buffer[offset] = (unsigned char)c;
- }
-
- return result;
-}
-
-/*
-
-=item C<static STRING* upcase_first(PARROT_INTERP, const STRING *src)>
-
-Converts the first grapheme in STRING C<src> to upper case, if it
-supports cases.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(upcase_first)
- unsigned char *buffer;
- unsigned int c;
- STRING *result = Parrot_str_clone(interp, src);
-
- if (!result->strlen)
- return result;
-
- buffer = (unsigned char *)result->strstart;
- c = buffer[0];
- if (c >= 0xe0 && c != 0xf7)
- c &= ~0x20;
- else
- c = toupper((unsigned char)c);
- buffer[0] = (unsigned char)c;
-
- return result;
-}
-
-/*
-
-=item C<static STRING* downcase_first(PARROT_INTERP, const STRING *src)>
-
-Converts the first character of the STRING C<src> to lower case, if the
-grapheme supports lower case.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(downcase_first)
- unsigned char *buffer;
- unsigned int c;
- STRING *result = Parrot_str_clone(interp, src);
-
- if (!result->strlen)
- return result;
-
- buffer = (unsigned char *)result->strstart;
- c = buffer[0];
- if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
- c &= ~0x20;
- else
- c = tolower((unsigned char)c);
- buffer[0] = (unsigned char)c;
-
- return result;
-}
-
-/*
-
-=item C<static STRING* titlecase_first(PARROT_INTERP, const STRING *src)>
-
-Converts the first grapheme in STRING C<src> to title case, if the grapheme
-supports case.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(titlecase_first)
- return upcase_first(interp, src);
-}
-
-
-/*
-
-=item C<static UINTVAL validate(PARROT_INTERP, const STRING *src)>
-
-Returns 1 if the STRING C<src> is a valid ISO-8859-1 STRING. Returns 0 otherwise.
-
-=cut
-
-*/
-
-static UINTVAL
-validate(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(validate)
- INTVAL offset;
- const INTVAL length = Parrot_str_length(interp, src);
-
- for (offset = 0; offset < length; ++offset) {
- const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, src, offset);
- if (codepoint >= 0x100)
- return 0;
- }
- return 1;
-}
-
-/*
-
-=item C<static INTVAL is_cclass(PARROT_INTERP, INTVAL flags, const STRING *src,
-UINTVAL offset)>
-
-Returns 1 if C<offset> is within C<src> and the codepoint there matches C<flags>; otherwise 0.
-
-=cut
-
-*/
-
-static INTVAL
-is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(is_cclass)
- UINTVAL codepoint;
-
- if (offset >= src->strlen) return 0;
- codepoint = ENCODING_GET_CODEPOINT(interp, src, offset);
-
- if (codepoint >= sizeof (Parrot_ascii_typetable) /
- sizeof (Parrot_ascii_typetable[0])) {
- return 0;
- }
- return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
-}
-
-/*
-
-=item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Find a character in the given character class. Delegates to the find_cclass
-method of the encoding plugin.
-
-=cut
-
-*/
-
-static INTVAL
-find_cclass(PARROT_INTERP, INTVAL flags,
- ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(find_cclass)
- const UINTVAL pos = offset;
- UINTVAL end = offset + count;
-
- end = src->strlen < end ? src->strlen : end;
- return ENCODING_FIND_CCLASS(interp, src,
- Parrot_iso_8859_1_typetable, flags, pos, end);
-}
-
-/*
-
-=item C<static INTVAL find_not_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Returns the index of the first character not matching C<flags>, or the end of the searched range.
-
-=cut
-
-*/
-
-static INTVAL
-find_not_cclass(PARROT_INTERP, INTVAL flags,
- ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(find_not_cclass)
- UINTVAL pos = offset;
- UINTVAL end = offset + count;
-
- end = src->strlen < end ? src->strlen : end;
- for (; pos < end; ++pos) {
- const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, src, pos);
- if ((Parrot_iso_8859_1_typetable[codepoint] & flags) == 0) {
- return pos;
- }
- }
- return end;
-}
-
-
-/*
-
-=item C<static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)>
-
-Creates a new STRING from the single codepoint C<codepoint>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
-{
- ASSERT_ARGS(string_from_codepoint)
- char real_codepoint = (char)codepoint;
- STRING * const return_string = string_make(interp, &real_codepoint, 1,
- "iso-8859-1", 0);
- return return_string;
-}
-
-/*
-
-=item C<void Parrot_charset_iso_8859_1_init(PARROT_INTERP)>
-
-Initializes the ISO-8859-1 charset by installing all the necessary function pointers.
-
-=cut
-
-*/
-
-void
-Parrot_charset_iso_8859_1_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_charset_iso_8859_1_init)
- CHARSET * const return_set = Parrot_new_charset(interp);
- static const CHARSET base_set = {
- "iso-8859-1",
- ascii_get_graphemes,
- to_charset,
- compose,
- decompose,
- upcase,
- downcase,
- titlecase,
- upcase_first,
- downcase_first,
- titlecase_first,
- ascii_compare,
- ascii_cs_index,
- ascii_cs_rindex,
- validate,
- is_cclass,
- find_cclass,
- find_not_cclass,
- string_from_codepoint,
- ascii_compute_hash,
- NULL
- };
-
- STRUCT_COPY_FROM_STRUCT(return_set, base_set);
- return_set->preferred_encoding = Parrot_fixed_8_encoding_ptr;
- Parrot_register_charset(interp, "iso-8859-1", return_set);
-
- return;
-}
-
-/*
-
-=item C<STRING * charset_cvt_iso_8859_1_to_ascii(PARROT_INTERP, const STRING
-*src)>
-
-Converts STRING C<src> in ISO-8859-1 to ASCII STRING C<dest>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-STRING *
-charset_cvt_iso_8859_1_to_ascii(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(charset_cvt_iso_8859_1_to_ascii)
- UINTVAL offs;
- STRING *dest = Parrot_str_clone(interp, src);
-
- for (offs = 0; offs < src->strlen; ++offs) {
- UINTVAL c = ENCODING_GET_BYTE(interp, src, offs);
- if (c >= 0x80)
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
- "lossy conversion to ascii");
-
- ENCODING_SET_BYTE(interp, dest, offs, c);
- }
- return dest;
-}
-
-/*
-
-=back
-
-=cut
-
-*/
-
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/iso-8859-1.h
==============================================================================
--- trunk/src/string/charset/iso-8859-1.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,46 +0,0 @@
-/* iso_8859_1.h
- * Copyright (C) 2004-2007, Parrot Foundation.
- * SVN Info
- * $Id$
- * Overview:
- * This is the header for the iso-8859-1 charset functions
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-#ifndef PARROT_CHARSET_ISO_8859_1_H_GUARD
-#define PARROT_CHARSET_ISO_8859_1_H_GUARD
-
-/* HEADERIZER BEGIN: src/string/charset/iso-8859-1.c */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-PARROT_CANNOT_RETURN_NULL
-PARROT_WARN_UNUSED_RESULT
-STRING * charset_cvt_iso_8859_1_to_ascii(PARROT_INTERP,
- ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-void Parrot_charset_iso_8859_1_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_charset_cvt_iso_8859_1_to_ascii \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_Parrot_charset_iso_8859_1_init \
- __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/charset/iso-8859-1.c */
-
-#endif /* PARROT_CHARSET_ISO_8859_1_H_GUARD */
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/tables.c
==============================================================================
--- trunk/src/string/charset/tables.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,93 +0,0 @@
-/* $Id$
- * Copyright (C) 2005-2007, Parrot Foundation.
- *
- * DO NOT EDIT THIS FILE DIRECTLY!
- * please update the tools/dev/gen_charset_tables.pl script instead.
- *
- * Created by gen_charset_tables.pl 19534 2007-07-02 02:12:08Z petdance
- * Overview:
- * This file contains various charset tables.
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-/* HEADERIZER HFILE: none */
-
-
-#include "tables.h"
-const INTVAL Parrot_iso_8859_1_typetable[256] = {
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 0-7 */
-0x0200, 0x0320, 0x1220, 0x0220, 0x1220, 0x1220, 0x0200, 0x0200, /* 8-15 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 16-23 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 24-31 */
-0x0160, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 32-39 */
-0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 40-47 */
-0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, /* 48-55 */
-0x28d8, 0x28d8, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 56-63 */
-0x04c0, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28c5, /* 64-71 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 72-79 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 80-87 */
-0x28c5, 0x28c5, 0x28c5, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x24c0, /* 88-95 */
-0x04c0, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28c6, /* 96-103 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
-0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x1220, 0x0200, 0x0200, /* 128-135 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 136-143 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 144-151 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 152-159 */
-0x04e0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 160-167 */
-0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 168-175 */
-0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x28c6, 0x04c0, 0x04c0, /* 176-183 */
-0x04c0, 0x04c0, 0x28c4, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 184-191 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 192-199 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 200-207 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x04c0, /* 208-215 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c6, /* 216-223 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 224-231 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 232-239 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x04c0, /* 240-247 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 248-255 */
-};
-const INTVAL Parrot_ascii_typetable[256] = {
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 0-7 */
-0x0200, 0x0320, 0x1220, 0x0220, 0x1220, 0x1220, 0x0200, 0x0200, /* 8-15 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 16-23 */
-0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, /* 24-31 */
-0x0160, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 32-39 */
-0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 40-47 */
-0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, 0x28d8, /* 48-55 */
-0x28d8, 0x28d8, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x04c0, /* 56-63 */
-0x04c0, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28d5, 0x28c5, /* 64-71 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 72-79 */
-0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, 0x28c5, /* 80-87 */
-0x28c5, 0x28c5, 0x28c5, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x24c0, /* 88-95 */
-0x04c0, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28d6, 0x28c6, /* 96-103 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 104-111 */
-0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, 0x28c6, /* 112-119 */
-0x28c6, 0x28c6, 0x28c6, 0x04c0, 0x04c0, 0x04c0, 0x04c0, 0x0200, /* 120-127 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x1020, 0x0000, 0x0000, /* 128-135 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 136-143 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 144-151 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 152-159 */
-0x0020, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 160-167 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 168-175 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 176-183 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 184-191 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 192-199 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 200-207 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 208-215 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 216-223 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 224-231 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 232-239 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 240-247 */
-0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 248-255 */
-};
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/tables.h
==============================================================================
--- trunk/src/string/charset/tables.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,36 +0,0 @@
-/* $Id$
- * Copyright (C) 2005-2007, Parrot Foundation.
- *
- * DO NOT EDIT THIS FILE DIRECTLY!
- * please update the tools/dev/gen_charset_tables.pl script instead.
- *
- * Created by gen_charset_tables.pl 19534 2007-07-02 02:12:08Z petdance
- * Overview:
- * This file contains various charset tables.
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-/* HEADERIZER HFILE: none */
-
-
-#ifndef PARROT_CHARSET_TABLES_H_GUARD
-#define PARROT_CHARSET_TABLES_H_GUARD
-#include "parrot/cclass.h"
-#include "parrot/parrot.h"
-#define WHITESPACE enum_cclass_whitespace
-#define WORDCHAR enum_cclass_word
-#define PUNCTUATION enum_cclass_punctuation
-#define DIGIT enum_cclass_numeric
-extern const INTVAL Parrot_iso_8859_1_typetable[256];
-extern const INTVAL Parrot_ascii_typetable[256];
-#endif /* PARROT_CHARSET_TABLES_H_GUARD */
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
-
Deleted: trunk/src/string/charset/unicode.c
==============================================================================
--- trunk/src/string/charset/unicode.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,1075 +0,0 @@
-/*
-Copyright (C) 2005-2010, Parrot Foundation.
-$Id$
-
-=head1 NAME
-
-src/string/charset/unicode.c
-
-=head1 DESCRIPTION
-
-This file implements the charset functions for unicode data
-
-=over 4
-
-=cut
-
-*/
-
-#include "parrot/parrot.h"
-#include "unicode.h"
-#include "ascii.h"
-#include "tables.h"
-
-/* HEADERIZER HFILE: src/string/charset/unicode.h */
-
-/* HEADERIZER BEGIN: static */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-static INTVAL compare(PARROT_INTERP,
- ARGIN(const STRING *lhs),
- ARGIN(const STRING *rhs))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* compose(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static size_t compute_hash(PARROT_INTERP,
- ARGIN(const STRING *src),
- size_t seed)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static INTVAL cs_rindex(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(const STRING *search_string),
- SHIM(UINTVAL offset))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* decompose(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-static INTVAL find_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-static INTVAL find_not_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_graphemes(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static INTVAL is_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* to_charset(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-static UINTVAL validate(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-#define ASSERT_ARGS_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs))
-#define ASSERT_ARGS_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_compute_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_cs_rindex __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_decompose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_graphemes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_string_from_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_to_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_u_iscclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
-
-#ifdef EXCEPTION
-# undef EXCEPTION
-#endif
-
-#if PARROT_HAS_ICU
-# include <unicode/ucnv.h>
-# include <unicode/utypes.h>
-# include <unicode/uchar.h>
-# include <unicode/ustring.h>
-# include <unicode/unorm.h>
-#endif
-#define EXCEPTION(err, str) \
- Parrot_ex_throw_from_c_args(interp, NULL, (err), (str))
-
-#define UNIMPL EXCEPTION(EXCEPTION_UNIMPLEMENTED, "unimplemented unicode")
-
-
-/*
-
-=item C<static STRING * get_graphemes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Gets the graphemes from STRING C<src> starting at C<offset>. Gets
-C<count> graphemes total.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_graphemes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(get_graphemes)
- return ENCODING_GET_CODEPOINTS(interp, src, offset, count);
-}
-
-
-/*
-
-=item C<static STRING* to_charset(PARROT_INTERP, const STRING *src)>
-
-Converts the input STRING C<src> to the unicode charset and returns the result.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-to_charset(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(to_charset)
- const charset_converter_t conversion_func =
- Parrot_find_charset_converter(interp, src->charset,
- Parrot_unicode_charset_ptr);
-
- if (conversion_func)
- return conversion_func(interp, src);
-
- return Parrot_utf8_encoding_ptr->to_encoding(interp, src);
-}
-
-
-/*
-
-=item C<static STRING* compose(PARROT_INTERP, const STRING *src)>
-
-If Parrot is built with ICU, composes the STRING C<src>. Attempts to
-normalize the STRING into the ICU default form, NFC.
-
-If Parrot does not have ICU included, throws an exception.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-compose(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(compose)
-#if PARROT_HAS_ICU
- STRING *dest;
- int src_len, dest_len;
- UErrorCode err;
- /*
- U_STABLE int32_t U_EXPORT2
- unorm_normalize(const UChar *source, int32_t sourceLength,
- UNormalizationMode mode, int32_t options,
- UChar *result, int32_t resultLength,
- UErrorCode *status);
- */
- dest_len = src_len = src->strlen;
- dest = Parrot_str_new_init(interp, NULL, src_len * sizeof (UChar),
- src->encoding, src->charset, 0);
-
- err = U_ZERO_ERROR;
- dest_len = unorm_normalize((UChar *)src->strstart, src_len,
- UNORM_DEFAULT, /* default is NFC */
- 0, /* options 0 default - no specific icu
- * version */
- (UChar *)dest->strstart, dest_len, &err);
-
- dest->bufused = dest_len * sizeof (UChar);
-
- if (!U_SUCCESS(err)) {
- err = U_ZERO_ERROR;
- Parrot_gc_reallocate_string_storage(interp, dest, dest->bufused);
- dest_len = unorm_normalize((UChar *)src->strstart, src_len,
- UNORM_DEFAULT, /* default is NFC */
- 0, /* options 0 default - no specific
- * icu version */
- (UChar *)dest->strstart, dest_len, &err);
- PARROT_ASSERT(U_SUCCESS(err));
- dest->bufused = dest_len * sizeof (UChar);
- }
- dest->strlen = dest_len;
- return dest;
-#else
- UNUSED(src);
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
-}
-
-
-/*
-
-=item C<static STRING* decompose(PARROT_INTERP, const STRING *src)>
-
-Decompose function for unicode charset. This function is not yet implemented.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-decompose(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(decompose)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
-}
-
-
-/*
-
-=item C<static STRING* upcase(PARROT_INTERP, const STRING *src)>
-
-Converts the STRING C<src> to all upper-case graphemes, for those characters
-which support upper-case versions.
-
-Throws an exception if ICU is not installed.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(upcase)
-#if PARROT_HAS_ICU
- UErrorCode err;
- int dest_len, src_len, needed;
- STRING *res;
-#endif
-
- if (src->bufused == src->strlen
- && src->encoding == Parrot_utf8_encoding_ptr) {
- return Parrot_ascii_charset_ptr->upcase(interp, src);
- }
-
-#if PARROT_HAS_ICU
- /* to_encoding will allocate new string */
- res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
- /*
- U_CAPI int32_t U_EXPORT2
- u_strToUpper(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode);
- */
- err = U_ZERO_ERROR;
-
- /* use all available space - see below XXX */
- /* TODO downcase, titlecase too */
- dest_len = Buffer_buflen(res) / sizeof (UChar);
- src_len = res->bufused / sizeof (UChar);
-
- /*
- * XXX troubles:
- * t/op/string_cs_45 upcase unicode:"\u01f0"
- * this creates \u004a \u030c J+NON-SPACING HACEK
- * the string needs resizing, *if* the src buffer is
- * too short. *But* with icu 3.2/3.4 the src string is
- * overwritten with a partial result, even though the ICU docs say:
- *
- * The source string and the destination buffer
- * are allowed to overlap.
- *
- * Workaround: 'preflighting' returns needed length
- * Alternative: forget about inplace operation - create new result
- *
- * TODO downcase, titlecase
- */
- needed = u_strToUpper(NULL, 0,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
-
- if (needed > dest_len) {
- Parrot_gc_reallocate_string_storage(interp, res, needed * sizeof (UChar));
- dest_len = needed;
- }
-
- err = U_ZERO_ERROR;
- dest_len = u_strToUpper((UChar *)res->strstart, dest_len,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
- PARROT_ASSERT(U_SUCCESS(err));
- res->bufused = dest_len * sizeof (UChar);
-
- /* downgrade if possible */
- if (dest_len == (int)src->strlen)
- res->encoding = Parrot_ucs2_encoding_ptr;
- else {
- /* string is likely still ucs2 if it was earlier
- * but strlen changed due to combining char
- */
- res->strlen = dest_len;
- }
-
- return res;
-
-#else
- UNUSED(src);
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
-}
-
-
-/*
-
-=item C<static STRING* downcase(PARROT_INTERP, const STRING *src)>
-
-Converts all graphemes to lower-case, for those graphemes which have cases.
-
-Throws an exception if ICU is not installed.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(downcase)
-#if PARROT_HAS_ICU
- UErrorCode err;
- int dest_len, src_len;
- STRING *res;
-#endif
-
- if (src->bufused == src->strlen
- && src->encoding == Parrot_utf8_encoding_ptr) {
- return Parrot_ascii_charset_ptr->downcase(interp, src);
- }
-
-#if PARROT_HAS_ICU
- /* to_encoding will allocate new string */
- res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
- /*
-U_CAPI int32_t U_EXPORT2
-u_strToLower(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode);
- */
- err = U_ZERO_ERROR;
- src_len = res->bufused / sizeof (UChar);
- dest_len = u_strToLower((UChar *)res->strstart, src_len,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
- res->bufused = dest_len * sizeof (UChar);
-
- if (!U_SUCCESS(err)) {
- err = U_ZERO_ERROR;
- Parrot_gc_reallocate_string_storage(interp, res, res->bufused);
- dest_len = u_strToLower((UChar *)res->strstart, dest_len,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
- PARROT_ASSERT(U_SUCCESS(err));
- }
-
- /* downgrade if possible */
- if (dest_len == (int)res->strlen)
- res->encoding = Parrot_ucs2_encoding_ptr;
-
- return res;
-
-#else
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
-}
-
-
-/*
-
-=item C<static STRING* titlecase(PARROT_INTERP, const STRING *src)>
-
-Converts the string to title case, for those characters which support cases.
-
-Throws an exception if ICU is not installed.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(titlecase)
-#if PARROT_HAS_ICU
-
- UErrorCode err;
- int dest_len, src_len;
- STRING *res;
-
- if (src->bufused == src->strlen
- && src->encoding == Parrot_utf8_encoding_ptr) {
- return Parrot_ascii_charset_ptr->titlecase(interp, src);
- }
-
- /* to_encoding will allocate new string */
- res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
-
- /*
-U_CAPI int32_t U_EXPORT2
-u_strToTitle(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UBreakIterator *titleIter,
- const char *locale,
- UErrorCode *pErrorCode);
- */
-
- err = U_ZERO_ERROR;
- src_len = res->bufused / sizeof (UChar);
- dest_len = u_strToTitle((UChar *)res->strstart, src_len,
- (UChar *)res->strstart, src_len,
- NULL, /* default titleiter */
- NULL, /* locale = default */
- &err);
- res->bufused = dest_len * sizeof (UChar);
-
- if (!U_SUCCESS(err)) {
- err = U_ZERO_ERROR;
- Parrot_gc_reallocate_string_storage(interp, res, res->bufused);
- dest_len = u_strToTitle((UChar *)res->strstart, dest_len,
- (UChar *)res->strstart, src_len,
- NULL, NULL,
- &err);
- PARROT_ASSERT(U_SUCCESS(err));
- }
-
- /* downgrade if possible */
- if (dest_len == (int)res->strlen)
- res->encoding = Parrot_ucs2_encoding_ptr;
-
- return res;
-
-#else
- UNUSED(src);
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
-}
-
-
-/*
-
-=item C<static STRING* upcase_first(PARROT_INTERP, const STRING *src)>
-
-Converts the first grapheme in the STRING C<src> to uppercase, if the
-grapheme supports it. Not implemented.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase_first(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(upcase_first)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
-}
-
-
-/*
-
-=item C<static STRING* downcase_first(PARROT_INTERP, const STRING *src)>
-
-Converts the first grapheme in the STRING C<src> to lower-case, if
-the grapheme supports it. Not implemented.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase_first(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(downcase_first)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
-}
-
-
-/*
-
-=item C<static STRING* titlecase_first(PARROT_INTERP, const STRING *src)>
-
-Converts the first grapheme in STRING C<src> to title case, if the
-string supports it. Not implemented.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(titlecase_first)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
-}
-
-
-/*
-
-=item C<static INTVAL compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
-
-Compares two STRINGs, C<lhs> and C<rhs>. Returns -1 if C<lhs> < C<rhs>. Returns
-0 if C<lhs> = C<rhs>. Returns 1 if C<lhs> > C<rhs>.
-
-=cut
-
-*/
-
-static INTVAL
-compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
-{
- ASSERT_ARGS(compare)
- String_iter l_iter, r_iter;
- UINTVAL min_len, l_len, r_len;
-
- /* TODO make optimized equal - strings are equal length then already */
- STRING_ITER_INIT(interp, &l_iter);
- STRING_ITER_INIT(interp, &r_iter);
-
- l_len = lhs->strlen;
- r_len = rhs->strlen;
-
- min_len = l_len > r_len ? r_len : l_len;
-
- while (l_iter.charpos < min_len) {
- const UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, lhs, &l_iter);
- const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &r_iter);
-
- if (cl != cr)
- return cl < cr ? -1 : 1;
- }
-
- if (l_len < r_len)
- return -1;
-
- if (l_len > r_len)
- return 1;
-
- return 0;
-}
-
-
-/*
-
-=item C<static INTVAL cs_rindex(PARROT_INTERP, const STRING *src, const STRING
-*search_string, UINTVAL offset)>
-
-Finds the last index of substring C<search_string> in STRING C<src>,
-starting from C<offset>. Not implemented.
-
-=cut
-
-*/
-
-static INTVAL
-cs_rindex(PARROT_INTERP, SHIM(const STRING *src),
- SHIM(const STRING *search_string), SHIM(UINTVAL offset))
-{
- ASSERT_ARGS(cs_rindex)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
-}
-
-
-/*
-
-=item C<static UINTVAL validate(PARROT_INTERP, const STRING *src)>
-
-Returns 1 if the STRING C<src> is a valid unicode string, returns 0 otherwise.
-
-=cut
-
-*/
-
-static UINTVAL
-validate(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(validate)
- String_iter iter;
- const INTVAL length = Parrot_str_length(interp, src);
-
- STRING_ITER_INIT(interp, &iter);
- while (iter.charpos < length) {
- const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- /* Check for Unicode non-characters */
- if (codepoint >= 0xfdd0
- && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
- && codepoint <= 0x10ffff)
- return 0;
- }
-
- return 1;
-}
-
-
-/*
-
-=item C<static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)>
-
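-Returns 1 if C<codepoint> belongs to a character class selected by C<flags>, 0 otherwise. Without ICU, support is limited and an exception may be thrown.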
-
-=cut
-
-*/
-
-static int
-u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
-{
- ASSERT_ARGS(u_iscclass)
-#if PARROT_HAS_ICU
- UNUSED(interp);
- /* XXX which one
- return u_charDigitValue(codepoint);
- */
- if ((flags & enum_cclass_uppercase) && u_isupper(codepoint)) return 1;
- if ((flags & enum_cclass_lowercase) && u_islower(codepoint)) return 1;
- if ((flags & enum_cclass_alphabetic) && u_isalpha(codepoint)) return 1;
- if ((flags & enum_cclass_numeric) && u_isdigit(codepoint)) return 1;
- if ((flags & enum_cclass_hexadecimal) && u_isxdigit(codepoint)) return 1;
- if ((flags & enum_cclass_whitespace) && u_isspace(codepoint)) return 1;
- if ((flags & enum_cclass_printing) && u_isprint(codepoint)) return 1;
- if ((flags & enum_cclass_graphical) && u_isgraph(codepoint)) return 1;
- if ((flags & enum_cclass_blank) && u_isblank(codepoint)) return 1;
- if ((flags & enum_cclass_control) && u_iscntrl(codepoint)) return 1;
- if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint)) return 1;
- if ((flags & enum_cclass_word) &&
- (u_isalnum(codepoint) || codepoint == '_')) return 1;
-
- return 0;
-#else
- if (codepoint < 256)
- return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
-
- if (flags == enum_cclass_any)
- return 1;
-
- /* All codepoints from u+0100 to u+02af are alphabetic, so we
- * cheat on the WORD and ALPHABETIC properties to include these
- * (and incorrectly exclude all others). This is a stopgap until
- * ICU is everywhere, or we have better non-ICU unicode support. */
- if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
- return (codepoint < 0x2b0);
-
- if (flags & enum_cclass_whitespace) {
- /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
- switch (codepoint) {
- case 0x1680: case 0x180e: case 0x2000: case 0x2001:
- case 0x2002: case 0x2003: case 0x2004: case 0x2005:
- case 0x2006: case 0x2007: case 0x2008: case 0x2009:
- case 0x200a: case 0x2028: case 0x2029: case 0x202f:
- case 0x205f: case 0x3000:
- return 1;
- default:
- break;
- }
- }
-
- if (flags & enum_cclass_numeric) {
- /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
- if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1;
- if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1;
- if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1;
- if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1;
- if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1;
- if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1;
- if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1;
- if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1;
- if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1;
- if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1;
- if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1;
- if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1;
- if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1;
- if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1;
- if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1;
- if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1;
- if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1;
- if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1;
- if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1;
- if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1;
- if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1;
- if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1;
- }
-
- if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-
- return 0;
-#endif
-}
-
-
-/*
-
-=item C<static INTVAL is_cclass(PARROT_INTERP, INTVAL flags, const STRING *src,
-UINTVAL offset)>
-
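-Returns 1 if the character at C<offset> in C<src> is in the character class C<flags>, and 0 otherwise (including when C<offset> is past the end of C<src>).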
-
-=cut
-
-*/
-
-static INTVAL
-is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(is_cclass)
- UINTVAL codepoint;
-
- if (offset >= src->strlen)
- return 0;
-
- codepoint = ENCODING_GET_CODEPOINT(interp, src, offset);
-
- if (codepoint >= 256)
- return u_iscclass(interp, codepoint, flags) != 0;
-
- return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
-}
-
-
-/*
-
-=item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
-Find a character in the given character class.
-
-=cut
-
-*/
-
-static INTVAL
-find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(find_cclass)
- String_iter iter;
- UINTVAL codepoint;
- UINTVAL end = offset + count;
-
- STRING_ITER_INIT(interp, &iter);
- STRING_ITER_SET_POSITION(interp, src, &iter, offset);
-
- end = src->strlen < end ? src->strlen : end;
-
- while (iter.charpos < end) {
- codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- if (codepoint >= 256) {
- if (u_iscclass(interp, codepoint, flags))
- return iter.charpos - 1;
- }
- else {
- if (Parrot_iso_8859_1_typetable[codepoint] & flags)
- return iter.charpos - 1;
- }
- }
-
- return end;
-}
-
-
-/*
-
-=item C<static INTVAL find_not_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
-
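-Finds the first character not in the given character class; returns its index, or the end of the searched range if there is none.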
-
-=cut
-
-*/
-
-static INTVAL
-find_not_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src),
- UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(find_not_cclass)
- String_iter iter;
- UINTVAL codepoint;
- UINTVAL end = offset + count;
- int bit;
-
- if (offset > src->strlen) {
- /* XXX: Throw in this case? */
- return offset + count;
- }
-
- STRING_ITER_INIT(interp, &iter);
-
- if (offset)
- STRING_ITER_SET_POSITION(interp, src, &iter, offset);
-
- end = src->strlen < end ? src->strlen : end;
-
- if (flags == enum_cclass_any)
- return end;
-
- while (iter.charpos < end) {
- codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- if (codepoint >= 256) {
- for (bit = enum_cclass_uppercase;
- bit <= enum_cclass_word ; bit <<= 1) {
- if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
- return iter.charpos - 1;
- }
- }
- else {
- if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
- return iter.charpos - 1;
- }
- }
-
- return end;
-}
-
-
-/*
-
-=item C<static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)>
-
-Returns a one-codepoint string for the given codepoint.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
-{
- ASSERT_ARGS(string_from_codepoint)
- String_iter iter;
- STRING * const dest = string_make(interp, "", 1, "unicode", 0);
-
- dest->strlen = 1;
-
- STRING_ITER_INIT(interp, &iter);
- STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, codepoint);
- dest->bufused = iter.bytepos;
-
- return dest;
-}
-
-
-/*
-
-=item C<static size_t compute_hash(PARROT_INTERP, const STRING *src, size_t
-seed)>
-
-Computes the hash of the given STRING C<src> with starting seed value C<seed>.
-
-=cut
-
-*/
-
-static size_t
-compute_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed)
-{
- ASSERT_ARGS(compute_hash)
- String_iter iter;
- size_t hashval = seed;
-
- STRING_ITER_INIT(interp, &iter);
-
- while (iter.charpos < src->strlen) {
- const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- hashval += hashval << 5;
- hashval += c;
- }
-
- return hashval;
-}
-
-
-/*
-
-=item C<void Parrot_charset_unicode_init(PARROT_INTERP)>
-
-Initializes the Unicode charset by installing all the necessary function
-pointers.
-
-=cut
-
-*/
-
-void
-Parrot_charset_unicode_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_charset_unicode_init)
- CHARSET * const return_set = Parrot_new_charset(interp);
- static const CHARSET base_set = {
- "unicode",
- get_graphemes,
- to_charset,
- compose,
- decompose,
- upcase,
- downcase,
- titlecase,
- upcase_first,
- downcase_first,
- titlecase_first,
- compare,
- mixed_cs_index,
- cs_rindex,
- validate,
- is_cclass,
- find_cclass,
- find_not_cclass,
- string_from_codepoint,
- compute_hash,
- NULL
- };
-
- STRUCT_COPY_FROM_STRUCT(return_set, base_set);
-
- /*
- * for now use utf8
- * TODO replace it with a fixed uint_16 or uint_32 encoding
- * XXX if this is changed, modify string_make so it
- * still takes "utf8" when fed "unicode" as charset!
- */
- return_set->preferred_encoding = Parrot_utf8_encoding_ptr;
- Parrot_register_charset(interp, "unicode", return_set);
-
- return;
-}
-
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Deleted: trunk/src/string/charset/unicode.h
==============================================================================
--- trunk/src/string/charset/unicode.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,40 +0,0 @@
-/* unicode.h
- * Copyright (C) 2005-2007, Parrot Foundation.
- * SVN Info
- * $Id$
- * Overview:
- * This is the header for the unicode charset functions
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-#ifndef PARROT_CHARSET_UNICODE_H_GUARD
-#define PARROT_CHARSET_UNICODE_H_GUARD
-
-/*
- * init function
- */
-
-
-/* HEADERIZER BEGIN: src/string/charset/unicode.c */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-void Parrot_charset_unicode_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_Parrot_charset_unicode_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/charset/unicode.c */
-
-
-#endif /* PARROT_CHARSET_UNICODE_H_GUARD */
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Modified: trunk/src/string/encoding.c
==============================================================================
--- trunk/src/string/encoding.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -16,50 +16,28 @@
*/
-#define PARROT_NO_EXTERN_ENCODING_PTRS
-#include "parrot/parrot.h"
+#include "parrot/encoding.h"
+
+STR_VTABLE *Parrot_default_encoding_ptr = NULL;
+
+static STR_VTABLE **encodings;
+static int n_encodings;
+/* for backwards compatibility */
+static STRING *unicode_str;
+static STRING *fixed_8_str;
/* HEADERIZER HFILE: include/parrot/encoding.h */
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-static INTVAL register_encoding(PARROT_INTERP,
- ARGIN(const char *encodingname),
- ARGIN(ENCODING *encoding))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-#define ASSERT_ARGS_register_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(encodingname) \
- , PARROT_ASSERT_ARG(encoding))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: static */
-ENCODING *Parrot_default_encoding_ptr = NULL;
-ENCODING *Parrot_fixed_8_encoding_ptr = NULL;
-ENCODING *Parrot_utf8_encoding_ptr = NULL;
-ENCODING *Parrot_ucs2_encoding_ptr = NULL;
-ENCODING *Parrot_utf16_encoding_ptr = NULL;
-ENCODING *Parrot_ucs4_encoding_ptr = NULL;
-
-typedef struct One_encoding {
- NOTNULL(ENCODING *encoding);
- STRING *name;
-} One_encoding;
-
-typedef struct All_encodings {
- int n_encodings;
- One_encoding *enc;
-} All_encodings;
-
-static All_encodings *all_encodings;
/*
-=item C<void parrot_deinit_encodings(PARROT_INTERP)>
+=item C<void Parrot_deinit_encodings(PARROT_INTERP)>
Deinitialize encodings and free all memory used by them.
@@ -68,25 +46,20 @@
*/
void
-parrot_deinit_encodings(PARROT_INTERP)
+Parrot_deinit_encodings(PARROT_INTERP)
{
- ASSERT_ARGS(parrot_deinit_encodings)
- const int n = all_encodings->n_encodings;
- int i;
+ ASSERT_ARGS(Parrot_deinit_encodings)
- for (i = 0; i < n; ++i) {
- mem_gc_free(interp, all_encodings->enc[i].encoding);
- }
- mem_gc_free(interp, all_encodings->enc);
- mem_gc_free(interp, all_encodings);
- all_encodings = NULL;
+ mem_gc_free(interp, encodings);
+ encodings = NULL;
+ n_encodings = 0;
}
/*
-=item C<ENCODING * Parrot_new_encoding(PARROT_INTERP)>
+=item C<STR_VTABLE * Parrot_new_encoding(PARROT_INTERP)>
-Allocates the memory for a new C<ENCODING> from the system.
+Allocates the memory for a new string vtable from the system.
=cut
@@ -95,16 +68,16 @@
PARROT_EXPORT
PARROT_MALLOC
PARROT_CANNOT_RETURN_NULL
-ENCODING *
+STR_VTABLE *
Parrot_new_encoding(PARROT_INTERP)
{
ASSERT_ARGS(Parrot_new_encoding)
- return mem_gc_allocate_typed(interp, ENCODING);
+ return mem_gc_allocate_typed(interp, STR_VTABLE);
}
/*
-=item C<const ENCODING * Parrot_find_encoding(PARROT_INTERP, const char
+=item C<const STR_VTABLE * Parrot_find_encoding(PARROT_INTERP, const char
*encodingname)>
Finds an encoding with the name C<encodingname>. Returns the encoding
@@ -118,22 +91,27 @@
PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
PARROT_CAN_RETURN_NULL
-const ENCODING *
+const STR_VTABLE *
Parrot_find_encoding(SHIM_INTERP, ARGIN(const char *encodingname))
{
ASSERT_ARGS(Parrot_find_encoding)
- const int n = all_encodings->n_encodings;
+ const int n = n_encodings;
int i;
for (i = 0; i < n; ++i)
- if (STREQ(all_encodings->enc[i].encoding->name, encodingname))
- return all_encodings->enc[i].encoding;
+ if (STREQ(encodings[i]->name, encodingname))
+ return encodings[i];
+
+ /* backwards compatibility */
+ if (strcmp(encodingname, "unicode") == 0)
+ return Parrot_utf8_encoding_ptr;
+
return NULL;
}
/*
-=item C<const ENCODING * Parrot_load_encoding(PARROT_INTERP, const char
+=item C<const STR_VTABLE * Parrot_load_encoding(PARROT_INTERP, const char
*encodingname)>
Loads an encoding. Currently throws an exception because we cannot load
@@ -154,7 +132,7 @@
PARROT_EXPORT
PARROT_DOES_NOT_RETURN
PARROT_CANNOT_RETURN_NULL
-const ENCODING *
+const STR_VTABLE *
Parrot_load_encoding(PARROT_INTERP, ARGIN(const char *encodingname))
{
ASSERT_ARGS(Parrot_load_encoding)
@@ -181,13 +159,28 @@
Parrot_encoding_number(PARROT_INTERP, ARGIN(const STRING *encodingname))
{
ASSERT_ARGS(Parrot_encoding_number)
- const int n = all_encodings->n_encodings;
+ const int n = n_encodings;
int i;
for (i = 0; i < n; ++i) {
- if (Parrot_str_equal(interp, all_encodings->enc[i].name, encodingname))
+ if (Parrot_str_equal(interp, encodings[i]->name_str, encodingname))
return i;
}
+
+ /* backwards compatibility */
+ if (Parrot_str_equal(interp, encodingname, unicode_str)) {
+ for (i = 0; i < n; ++i) {
+ if (STREQ(encodings[i]->name, "utf8"))
+ return i;
+ }
+ }
+ else if (STRING_equal(interp, encodingname, fixed_8_str)) {
+ for (i = 0; i < n; ++i) {
+ if (STREQ(encodings[i]->name, "ascii"))
+ return i;
+ }
+ }
+
return -1;
}
@@ -197,6 +190,8 @@
Return the number of the encoding of the given string or -1 if not found.
+This could be converted to a macro.
+
=cut
*/
@@ -208,14 +203,8 @@
Parrot_encoding_number_of_str(SHIM_INTERP, ARGIN(const STRING *src))
{
ASSERT_ARGS(Parrot_encoding_number_of_str)
- const int n = all_encodings->n_encodings;
- int i;
- for (i = 0; i < n; ++i) {
- if (src->encoding == all_encodings->enc[i].encoding)
- return i;
- }
- return -1;
+ return src->encoding->num;
}
/*
@@ -225,6 +214,8 @@
Returns the name of a character encoding based on the INTVAL index
C<number_of_encoding> to the All_encodings array.
+This could be converted to a macro.
+
=cut
*/
@@ -237,15 +228,15 @@
Parrot_encoding_name(SHIM_INTERP, INTVAL number_of_encoding)
{
ASSERT_ARGS(Parrot_encoding_name)
- if (number_of_encoding >= all_encodings->n_encodings ||
+ if (number_of_encoding >= n_encodings ||
number_of_encoding < 0)
return NULL;
- return all_encodings->enc[number_of_encoding].name;
+ return encodings[number_of_encoding]->name_str;
}
/*
-=item C<const ENCODING* Parrot_get_encoding(PARROT_INTERP, INTVAL
+=item C<const STR_VTABLE* Parrot_get_encoding(PARROT_INTERP, INTVAL
number_of_encoding)>
Returns the encoding given by the INTVAL index C<number_of_encoding>.
@@ -258,14 +249,14 @@
PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
PARROT_CAN_RETURN_NULL
-const ENCODING*
+const STR_VTABLE*
Parrot_get_encoding(SHIM_INTERP, INTVAL number_of_encoding)
{
ASSERT_ARGS(Parrot_get_encoding)
- if (number_of_encoding >= all_encodings->n_encodings ||
+ if (number_of_encoding >= n_encodings ||
number_of_encoding < 0)
return NULL;
- return all_encodings->enc[number_of_encoding].encoding;
+ return encodings[number_of_encoding];
}
/*
@@ -288,50 +279,10 @@
Parrot_encoding_c_name(SHIM_INTERP, INTVAL number_of_encoding)
{
ASSERT_ARGS(Parrot_encoding_c_name)
- if (number_of_encoding >= all_encodings->n_encodings ||
+ if (number_of_encoding >= n_encodings ||
number_of_encoding < 0)
return NULL;
- return all_encodings->enc[number_of_encoding].encoding->name;
-}
-
-/*
-
-=item C<static INTVAL register_encoding(PARROT_INTERP, const char *encodingname,
-ENCODING *encoding)>
-
-Registers a new character encoding C<encoding> with the given name
-C<encodingname>. Returns 1 if successful, returns 0 otherwise.
-
-=cut
-
-*/
-
-static INTVAL
-register_encoding(PARROT_INTERP, ARGIN(const char *encodingname),
- ARGIN(ENCODING *encoding))
-{
- ASSERT_ARGS(register_encoding)
- const int n = all_encodings->n_encodings;
- int i;
-
- for (i = 0; i < n; ++i) {
- if (STREQ(all_encodings->enc[i].encoding->name, encodingname))
- return 0;
- }
- /*
- * TODO
- * this needs either a LOCK or we just forbid dynamic
- * loading of encodings from inside threads
- */
- if (!n)
- all_encodings->enc = mem_gc_allocate_zeroed_typed(interp, One_encoding);
- else
- all_encodings->enc = mem_gc_realloc_n_typed_zeroed(interp,
- all_encodings->enc, n + 1, n, One_encoding);
- ++all_encodings->n_encodings;
- all_encodings->enc[n].encoding = encoding;
-
- return 1;
+ return encodings[number_of_encoding]->name;
}
/*
@@ -352,15 +303,16 @@
{
ASSERT_ARGS(Parrot_str_internal_register_encoding_names)
int n;
- for (n = 0; n < all_encodings->n_encodings; ++n)
- all_encodings->enc[n].name =
- Parrot_str_new_constant(interp, all_encodings->enc[n].encoding->name);
+ for (n = 0; n < n_encodings; ++n)
+ encodings[n]->name_str =
+ Parrot_str_new_constant(interp, encodings[n]->name);
+ unicode_str = Parrot_str_new_constant(interp, "unicode");
+ fixed_8_str = Parrot_str_new_constant(interp, "fixed_8");
}
/*
-=item C<INTVAL Parrot_register_encoding(PARROT_INTERP, const char *encodingname,
-ENCODING *encoding)>
+=item C<INTVAL Parrot_register_encoding(PARROT_INTERP, STR_VTABLE *encoding)>
Registers a character encoding C<encoding> with name C<encodingname>.
Only allows one of 5 possibilities: fixed_8, utf8, utf16, ucs2 and ucs4.
@@ -371,46 +323,66 @@
PARROT_EXPORT
INTVAL
-Parrot_register_encoding(PARROT_INTERP, ARGIN(const char *encodingname),
- ARGIN(ENCODING *encoding))
+Parrot_register_encoding(PARROT_INTERP, ARGIN(STR_VTABLE *encoding))
{
ASSERT_ARGS(Parrot_register_encoding)
- if (!all_encodings) {
- all_encodings = mem_gc_allocate_zeroed_typed(interp, All_encodings);
- all_encodings->n_encodings = 0;
- all_encodings->enc = NULL;
- }
- if (STREQ("fixed_8", encodingname)) {
- Parrot_fixed_8_encoding_ptr = encoding;
- if (!Parrot_default_encoding_ptr) {
- Parrot_default_encoding_ptr = encoding;
+ int i;
+ int n = n_encodings;
- }
- return register_encoding(interp, encodingname, encoding);
- }
- if (STREQ("utf8", encodingname)) {
- Parrot_utf8_encoding_ptr = encoding;
- return register_encoding(interp, encodingname, encoding);
- }
- if (STREQ("utf16", encodingname)) {
- Parrot_utf16_encoding_ptr = encoding;
- return register_encoding(interp, encodingname, encoding);
- }
- if (STREQ("ucs2", encodingname)) {
- Parrot_ucs2_encoding_ptr = encoding;
- return register_encoding(interp, encodingname, encoding);
- }
- if (STREQ("ucs4", encodingname)) {
- Parrot_ucs4_encoding_ptr = encoding;
- return register_encoding(interp, encodingname, encoding);
+ for (i = 0; i < n_encodings; ++i) {
+ if (STREQ(encodings[i]->name, encoding->name))
+ return 0;
}
- return 0;
+
+ if (!n)
+ encodings = mem_gc_allocate_zeroed_typed(interp, STR_VTABLE *);
+ else
+ encodings = mem_gc_realloc_n_typed_zeroed(interp,
+ encodings, n + 1, n, STR_VTABLE *);
+
+ encoding->num = n;
+ encodings[n] = encoding;
+ ++n_encodings;
+
+ return 1;
+}
+
+/*
+
+=item C<void Parrot_encodings_init(PARROT_INTERP)>
+
+Registers the built-in encodings, makes ASCII the default encoding, and
+creates the STRING names for all registered encodings.
+
+=cut
+
+*/
+
+PARROT_EXPORT
+void
+Parrot_encodings_init(PARROT_INTERP)
+{
+ ASSERT_ARGS(Parrot_encodings_init)
+
+ Parrot_register_encoding(interp, Parrot_ascii_encoding_ptr);
+ Parrot_register_encoding(interp, Parrot_latin1_encoding_ptr);
+ Parrot_register_encoding(interp, Parrot_binary_encoding_ptr);
+ Parrot_register_encoding(interp, Parrot_utf8_encoding_ptr);
+ Parrot_register_encoding(interp, Parrot_utf16_encoding_ptr);
+ Parrot_register_encoding(interp, Parrot_ucs2_encoding_ptr);
+ Parrot_register_encoding(interp, Parrot_ucs4_encoding_ptr);
+
+ Parrot_default_encoding_ptr = Parrot_ascii_encoding_ptr;
+
+ /* Now that the plugins are registered, we can create STRING
+ * names for them. */
+ Parrot_str_internal_register_encoding_names(interp);
}
/*
=item C<INTVAL Parrot_make_default_encoding(PARROT_INTERP, const char
-*encodingname, ENCODING *encoding)>
+*encodingname, STR_VTABLE *encoding)>
Sets the default encoding to C<encoding> with name C<encodingname>.
@@ -421,7 +393,7 @@
PARROT_EXPORT
INTVAL
Parrot_make_default_encoding(SHIM_INTERP, SHIM(const char *encodingname),
- ARGIN(ENCODING *encoding))
+ ARGIN(STR_VTABLE *encoding))
{
ASSERT_ARGS(Parrot_make_default_encoding)
Parrot_default_encoding_ptr = encoding;
@@ -430,7 +402,7 @@
/*
-=item C<const ENCODING * Parrot_default_encoding(PARROT_INTERP)>
+=item C<const STR_VTABLE * Parrot_default_encoding(PARROT_INTERP)>
Gets the default encoding.
@@ -442,39 +414,13 @@
PARROT_PURE_FUNCTION
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
-const ENCODING *
+const STR_VTABLE *
Parrot_default_encoding(SHIM_INTERP)
{
ASSERT_ARGS(Parrot_default_encoding)
return Parrot_default_encoding_ptr;
}
-/*
-
-=item C<encoding_converter_t Parrot_find_encoding_converter(PARROT_INTERP,
-ENCODING *lhs, ENCODING *rhs)>
-
-Finds a converter from encoding C<rhs> to C<lhs>. Not yet implemented, so
-throws an exception.
-
-=cut
-
-*/
-
-PARROT_EXPORT
-PARROT_DOES_NOT_RETURN
-encoding_converter_t
-Parrot_find_encoding_converter(PARROT_INTERP, ARGIN(ENCODING *lhs), ARGIN(ENCODING *rhs))
-{
- ASSERT_ARGS(Parrot_find_encoding_converter)
- UNUSED(lhs);
- UNUSED(rhs);
-
- /* XXX Apparently unwritten https://trac.parrot.org/parrot/wiki/StringsTasklist */
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
- "Can't find encoding converters yet.");
-}
-
/*
* Local variables:
Added: trunk/src/string/encoding/ascii.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ trunk/src/string/encoding/ascii.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -0,0 +1,554 @@
+/*
+Copyright (C) 2004-2010, Parrot Foundation.
+$Id$
+
+=head1 NAME
+
+src/string/encoding/ascii.c
+
+=head1 DESCRIPTION
+
+This file implements encoding functions for ASCII strings.
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "ascii.h"
+#include "shared.h"
+#include "tables.h"
+
+/* HEADERIZER HFILE: src/string/encoding/ascii.h */
+
+/* HEADERIZER BEGIN: static */
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static STRING * ascii_chr(PARROT_INTERP, UINTVAL codepoint)
+ __attribute__nonnull__(1);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* ascii_downcase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* ascii_downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+static INTVAL ascii_find_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+static INTVAL ascii_find_not_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+static INTVAL ascii_is_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* ascii_titlecase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* ascii_titlecase_first(PARROT_INTERP,
+ ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING * ascii_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* ascii_upcase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* ascii_upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL ascii_validate(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+#define ASSERT_ARGS_ascii_chr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_ascii_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ascii_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: static */
+
+
+/*
+
+=item C<static STRING * ascii_to_encoding(PARROT_INTERP, const STRING *src)>
+
+Converts STRING C<src> to a new ASCII-encoded STRING. Throws EXCEPTION_LOSSY_CONVERSION if C<src> contains a codepoint outside the ASCII range.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING *
+ascii_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_to_encoding)
+ STRING *dest;
+
+ if (STRING_max_bytes_per_codepoint(src) == 1) {
+ unsigned char * const src_buf = (unsigned char *)src->strstart;
+ UINTVAL offs;
+
+ for (offs = 0; offs < src->strlen; ++offs) {
+ UINTVAL c = src_buf[offs];
+ if (c >= 0x80)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
+ "lossy conversion to ascii");
+ }
+
+ dest = Parrot_str_clone(interp, src);
+ dest->encoding = Parrot_ascii_encoding_ptr;
+ }
+ else {
+ String_iter iter;
+ unsigned char *p;
+ const UINTVAL len = src->strlen;
+
+ dest = Parrot_str_new_init(interp, NULL, len,
+ Parrot_ascii_encoding_ptr, 0);
+ p = (unsigned char *)dest->strstart;
+ STRING_ITER_INIT(interp, &iter);
+
+ while (iter.charpos < len) {
+ const UINTVAL c = STRING_iter_get_and_advance(interp, src, &iter);
+ if (c >= 0x80)
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
+ "can't convert unicode string to ascii");
+ *p++ = c;
+ }
+
+ dest->bufused = len;
+ dest->strlen = len;
+ }
+
+ return dest;
+}
+
+/*
+
+=item C<static STRING * ascii_chr(PARROT_INTERP, UINTVAL codepoint)>
+
+Creates a new STRING object from a single codepoint C<codepoint>. Returns
+the new STRING.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static STRING *
+ascii_chr(PARROT_INTERP, UINTVAL codepoint)
+{
+ ASSERT_ARGS(ascii_chr)
+ char real_codepoint = (char)codepoint;
+ STRING * const return_string = string_make(interp, &real_codepoint, 1, "ascii", 0);
+ return return_string;
+}
+
+/*
+
+=item C<static UINTVAL ascii_validate(PARROT_INTERP, const STRING *src)>
+
+Verifies that the given string is valid ASCII. Returns 1 if it is ASCII,
+returns 0 otherwise.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+ascii_validate(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_validate)
+ String_iter iter;
+ const UINTVAL length = Parrot_str_length(interp, src);
+
+ STRING_ITER_INIT(interp, &iter);
+ while (iter.charpos < length) {
+ const UINTVAL codepoint = STRING_iter_get_and_advance(interp, src, &iter);
+ if (codepoint >= 0x80)
+ return 0;
+ }
+ return 1;
+}
+
+/*
+
+=item C<static INTVAL ascii_is_cclass(PARROT_INTERP, INTVAL flags, const STRING
+*src, UINTVAL offset)>
+
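+Returns 1 if the character at C<offset> has any of the classes in C<flags>, and 0 otherwise (including when C<offset> is past the end of the string).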
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static INTVAL
+ascii_is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
+{
+ ASSERT_ARGS(ascii_is_cclass)
+ UINTVAL codepoint;
+
+ if (offset >= src->strlen)
+ return 0;
+ codepoint = STRING_ord(interp, src, offset);
+
+ if (codepoint >= sizeof (Parrot_ascii_typetable) / sizeof (Parrot_ascii_typetable[0])) {
+ return 0;
+ }
+ return (Parrot_ascii_typetable[codepoint] & flags) ? 1 : 0;
+}
+
+/*
+
+=item C<static INTVAL ascii_find_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
+Returns the index of the first character at or after C<offset> that is in the
+given character class, or min(C<offset + count>, string length) if none is.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static INTVAL
+ascii_find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+ ASSERT_ARGS(ascii_find_cclass)
+ const unsigned char *contents = (const unsigned char *)src->strstart;
+ UINTVAL pos = offset;
+ UINTVAL end = offset + count;
+
+ end = src->strlen < end ? src->strlen : end;
+ for (; pos < end; ++pos) {
+ if ((Parrot_ascii_typetable[contents[pos]] & flags) != 0) {
+ return pos;
+ }
+ }
+ return end;
+}
+
+/*
+
+=item C<static INTVAL ascii_find_not_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
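+Returns the index of the first character at or after C<offset> that is not in the given character class, or min(C<offset + count>, string length) if there is none.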
+
+=cut
+
+*/
+
+static INTVAL
+ascii_find_not_cclass(PARROT_INTERP,
+ INTVAL flags, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+ ASSERT_ARGS(ascii_find_not_cclass)
+ const unsigned char *contents = (const unsigned char *)src->strstart;
+ UINTVAL pos = offset;
+ UINTVAL end = offset + count;
+
+ end = src->strlen < end ? src->strlen : end;
+ for (; pos < end; ++pos) {
+ if ((Parrot_ascii_typetable[contents[pos]] & flags) == 0) {
+ return pos;
+ }
+ }
+ return end;
+}
+
+/*
+
+=item C<static STRING* ascii_upcase(PARROT_INTERP, const STRING *src)>
+
+Converts the STRING C<src> to all uppercase.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+ascii_upcase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_upcase)
+ STRING * const result = Parrot_str_clone(interp, src);
+ const UINTVAL n = src->strlen;
+
+ if (n) {
+ char * const buffer = result->strstart;
+ UINTVAL offset;
+
+ for (offset = 0; offset < n; ++offset) {
+ buffer[offset] = (char)toupper((unsigned char)buffer[offset]);
+ }
+ }
+
+ return result;
+}
+
+/*
+
+=item C<static STRING* ascii_downcase(PARROT_INTERP, const STRING *src)>
+
+Converts the STRING C<src> to all lower-case.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+ascii_downcase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_downcase)
+ STRING *result = Parrot_str_clone(interp, src);
+ const UINTVAL n = src->strlen;
+
+ if (n) {
+ char * const buffer = result->strstart;
+ UINTVAL offset;
+
+ for (offset = 0; offset < n; ++offset) {
+ buffer[offset] = (char)tolower((unsigned char)buffer[offset]);
+ }
+ }
+
+ return result;
+}
+
+/*
+
+=item C<static STRING* ascii_titlecase(PARROT_INTERP, const STRING *src)>
+
+Converts the STRING given by C<src> to title case, where
+the first character is upper case and all the rest of the characters
+are lower-case.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+ascii_titlecase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_titlecase)
+ STRING *result = Parrot_str_clone(interp, src);
+ const UINTVAL n = src->strlen;
+
+ if (n) {
+ char * const buffer = result->strstart;
+ UINTVAL offset;
+
+ buffer[0] = (char)toupper((unsigned char)buffer[0]);
+ for (offset = 1; offset < n; ++offset) {
+ buffer[offset] = (char)tolower((unsigned char)buffer[offset]);
+ }
+ }
+
+ return result;
+}
+
+/*
+
+=item C<static STRING* ascii_upcase_first(PARROT_INTERP, const STRING *src)>
+
+Sets the first character in the STRING C<src> to upper case,
+but doesn't modify the rest of the string.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+ascii_upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_upcase_first)
+ STRING * const result = Parrot_str_clone(interp, src);
+
+ if (result->strlen > 0) {
+ char * const buffer = result->strstart;
+ buffer[0] = (char)toupper((unsigned char)buffer[0]);
+ }
+
+ return result;
+}
+
+/*
+
+=item C<static STRING* ascii_downcase_first(PARROT_INTERP, const STRING *src)>
+
+Sets the first character of the STRING C<src> to lowercase,
+but doesn't modify the rest of the characters.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+ascii_downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_downcase_first)
+ STRING * const result = Parrot_str_clone(interp, src);
+
+ if (result->strlen > 0) {
+ char * const buffer = result->strstart;
+ buffer[0] = (char)tolower((unsigned char)buffer[0]);
+ }
+
+ return result;
+}
+
+/*
+
+=item C<static STRING* ascii_titlecase_first(PARROT_INTERP, const STRING *src)>
+
+Converts the first letter of STRING C<src> to upper case,
+but doesn't modify the rest of the string.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+ascii_titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(ascii_titlecase_first)
+ STRING * const result = Parrot_str_clone(interp, src);
+
+ if (result->strlen > 0) {
+ char * const buffer = result->strstart;
+ buffer[0] = (char)toupper((unsigned char)buffer[0]);
+ }
+
+ return result;
+}
+
+static STR_VTABLE Parrot_ascii_encoding = {
+ 0,
+ "ascii",
+ NULL,
+ 1, /* Max bytes per codepoint */
+
+ ascii_to_encoding,
+ ascii_chr,
+
+ fixed8_equal,
+ fixed8_compare,
+ fixed8_index,
+ fixed8_rindex,
+ fixed8_hash,
+ ascii_validate,
+
+ fixed8_scan,
+ fixed8_ord,
+ fixed8_substr,
+
+ ascii_is_cclass,
+ ascii_find_cclass,
+ ascii_find_not_cclass,
+
+ encoding_get_graphemes,
+ fixed8_compose,
+ encoding_decompose,
+
+ ascii_upcase,
+ ascii_downcase,
+ ascii_titlecase,
+ ascii_upcase_first,
+ ascii_downcase_first,
+ ascii_titlecase_first,
+
+ fixed8_iter_get,
+ fixed8_iter_skip,
+ fixed8_iter_get_and_advance,
+ fixed8_iter_set_and_advance,
+ fixed8_iter_set_position
+};
+
+STR_VTABLE *Parrot_ascii_encoding_ptr = &Parrot_ascii_encoding;
+
+
+/*
+ * Local variables:
+ * c-file-style: "parrot"
+ * End:
+ * vim: expandtab shiftwidth=4:
+ */
+
Added: trunk/src/string/encoding/ascii.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ trunk/src/string/encoding/ascii.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -0,0 +1,30 @@
+/* ascii.h
+ * Copyright (C) 2004-2010, Parrot Foundation.
+ * SVN Info
+ * $Id$
+ * Overview:
+ * This is the header for the ASCII encoding
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#ifndef PARROT_ENCODING_ASCII_H_GUARD
+#define PARROT_ENCODING_ASCII_H_GUARD
+
+/* HEADERIZER BEGIN: src/string/encoding/ascii.c */
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+
+
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: src/string/encoding/ascii.c */
+
+#endif /* PARROT_ENCODING_ASCII_H_GUARD */
+
+/*
+ * Local variables:
+ * c-file-style: "parrot"
+ * End:
+ * vim: expandtab shiftwidth=4:
+ */
Added: trunk/src/string/encoding/binary.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ trunk/src/string/encoding/binary.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -0,0 +1,275 @@
+/*
+Copyright (C) 2004-2010, Parrot Foundation.
+$Id$
+
+=head1 NAME
+
+src/string/encoding/binary.c
+
+=head1 DESCRIPTION
+
+This file implements encoding functions for binary strings.
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "binary.h"
+#include "shared.h"
+
+/* HEADERIZER HFILE: src/string/encoding/binary.h */
+
+/* HEADERIZER BEGIN: static */
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* binary_change_case(PARROT_INTERP, SHIM(const STRING *src))
+ __attribute__nonnull__(1);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING * binary_chr(PARROT_INTERP, UINTVAL codepoint)
+ __attribute__nonnull__(1);
+
+static INTVAL binary_find_cclass(SHIM_INTERP,
+ SHIM(INTVAL flags),
+ SHIM(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count);
+
+static INTVAL binary_find_not_cclass(SHIM_INTERP,
+ SHIM(INTVAL flags),
+ SHIM(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count);
+
+static INTVAL binary_is_cclass(SHIM_INTERP,
+ SHIM(INTVAL flags),
+ SHIM(const STRING *src),
+ SHIM(UINTVAL offset));
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* binary_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+static UINTVAL binary_validate(SHIM_INTERP, SHIM(const STRING *src));
+#define ASSERT_ARGS_binary_change_case __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_binary_chr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_binary_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
+#define ASSERT_ARGS_binary_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
+#define ASSERT_ARGS_binary_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
+#define ASSERT_ARGS_binary_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_binary_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (0)
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: static */
+
+#ifdef EXCEPTION
+# undef EXCEPTION
+#endif
+
+#define EXCEPTION(err, str) \
+ Parrot_ex_throw_from_c_args(interp, NULL, (err), (str))
+
+
+/*
+
+=item C<static STRING* binary_to_encoding(PARROT_INTERP, const STRING *src)>
+
+Converts the STRING C<src> to STRING C<dest> in binary mode.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+binary_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(binary_to_encoding)
+ STRING *dest;
+
+ dest = Parrot_str_copy(interp, src);
+ dest->encoding = Parrot_binary_encoding_ptr;
+ dest->strlen = dest->bufused;
+ dest->hashval = 0;
+
+ return dest;
+}
+
+
+/*
+
+=item C<static STRING * binary_chr(PARROT_INTERP, UINTVAL codepoint)>
+
+Creates a new STRING object from a single codepoint C<codepoint>. Returns
+the new STRING.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING *
+binary_chr(PARROT_INTERP, UINTVAL codepoint)
+{
+ ASSERT_ARGS(binary_chr)
+ char real_codepoint = (char)codepoint;
+ return string_make(interp, &real_codepoint, 1, "binary", 0);
+}
+
+
+/*
+
+=item C<static UINTVAL binary_validate(PARROT_INTERP, const STRING *src)>
+
+Returns 1. All sequential data is valid binary data.
+
+=cut
+
+*/
+
+/* Binary's always valid */
+static UINTVAL
+binary_validate(SHIM_INTERP, SHIM(const STRING *src))
+{
+ ASSERT_ARGS(binary_validate)
+ return 1;
+}
+
+
+/*
+
+=item C<static INTVAL binary_is_cclass(PARROT_INTERP, INTVAL flags, const STRING
+*src, UINTVAL offset)>
+
+Always returns 0: no position in a binary string is reported as belonging to a character class.
+
+=cut
+
+*/
+
+static INTVAL
+binary_is_cclass(SHIM_INTERP, SHIM(INTVAL flags), SHIM(const STRING *src), SHIM(UINTVAL offset))
+{
+ ASSERT_ARGS(binary_is_cclass)
+ return 0;
+}
+
+
+/*
+
+=item C<static INTVAL binary_find_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
+Always returns C<offset + count>: no character of a binary string is ever reported as a match.
+
+=cut
+
+*/
+
+static INTVAL
+binary_find_cclass(SHIM_INTERP, SHIM(INTVAL flags),
+ SHIM(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+ ASSERT_ARGS(binary_find_cclass)
+ return offset + count;
+}
+
+
+/*
+
+=item C<static INTVAL binary_find_not_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
+Always returns C<offset>: every position in a binary string counts as not being in the class.
+
+=cut
+
+*/
+
+static INTVAL
+binary_find_not_cclass(SHIM_INTERP, SHIM(INTVAL flags),
+ SHIM(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+ ASSERT_ARGS(binary_find_not_cclass)
+ return offset;
+}
+
+
+/*
+
+=item C<static STRING* binary_change_case(PARROT_INTERP, const STRING *src)>
+
+Throws an exception because we cannot change case of a binary string.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+binary_change_case(PARROT_INTERP, SHIM(const STRING *src))
+{
+ ASSERT_ARGS(binary_change_case)
+ EXCEPTION(EXCEPTION_INVALID_CHARTYPE, "Can't change case of binary data");
+}
+
+
+static STR_VTABLE Parrot_binary_encoding = {
+ 0,
+ "binary",
+ NULL,
+ 1, /* Max bytes per codepoint */
+
+ binary_to_encoding,
+ binary_chr,
+
+ fixed8_equal,
+ fixed8_compare,
+ fixed8_index,
+ fixed8_rindex,
+ fixed8_hash,
+ binary_validate,
+
+ fixed8_scan,
+ fixed8_ord,
+ fixed8_substr,
+
+ binary_is_cclass,
+ binary_find_cclass,
+ binary_find_not_cclass,
+
+ encoding_get_graphemes,
+ fixed8_compose,
+ encoding_decompose,
+
+ binary_change_case,
+ binary_change_case,
+ binary_change_case,
+ binary_change_case,
+ binary_change_case,
+ binary_change_case,
+
+ fixed8_iter_get,
+ fixed8_iter_skip,
+ fixed8_iter_get_and_advance,
+ fixed8_iter_set_and_advance,
+ fixed8_iter_set_position
+};
+
+STR_VTABLE *Parrot_binary_encoding_ptr = &Parrot_binary_encoding;
+
+
+/*
+ * Local variables:
+ * c-file-style: "parrot"
+ * End:
+ * vim: expandtab shiftwidth=4:
+ */
Copied and modified: trunk/src/string/encoding/binary.h (from r48832, trunk/src/string/charset/binary.h)
==============================================================================
--- trunk/src/string/charset/binary.h Tue Sep 7 22:20:33 2010 (r48832, copy source)
+++ trunk/src/string/encoding/binary.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -10,21 +10,17 @@
* References:
*/
-#ifndef PARROT_CHARSET_BINARY_H_GUARD
-#define PARROT_CHARSET_BINARY_H_GUARD
+#ifndef PARROT_ENCODING_BINARY_H_GUARD
+#define PARROT_ENCODING_BINARY_H_GUARD
-/* HEADERIZER BEGIN: src/string/charset/binary.c */
+/* HEADERIZER BEGIN: src/string/encoding/binary.c */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-void Parrot_charset_binary_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-#define ASSERT_ARGS_Parrot_charset_binary_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/charset/binary.c */
+/* HEADERIZER END: src/string/encoding/binary.c */
-#endif /* PARROT_CHARSET_BINARY_H_GUARD */
+#endif /* PARROT_ENCODING_BINARY_H_GUARD */
/*
* Local variables:
Deleted: trunk/src/string/encoding/fixed_8.c
==============================================================================
--- trunk/src/string/encoding/fixed_8.c Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,578 +0,0 @@
-/*
-Copyright (C) 2004-2010, Parrot Foundation.
-$Id$
-
-=head1 NAME
-
-src/string/encoding/fixed_8.c
-
-=head1 DESCRIPTION
-
-This file implements the encoding functions for fixed-width 8-bit codepoints
-
-=over 4
-
-=cut
-
-*/
-
-#include "parrot/parrot.h"
-#include "fixed_8.h"
-
-/* HEADERIZER HFILE: src/string/encoding/fixed_8.h */
-
-/* HEADERIZER BEGIN: static */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-static UINTVAL bytes(SHIM_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(2);
-
-static UINTVAL codepoints(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL find_cclass(SHIM_INTERP,
- ARGIN(const STRING *s),
- ARGIN(const INTVAL *typetable),
- INTVAL flags,
- UINTVAL pos,
- UINTVAL end)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-static UINTVAL fixed8_iter_get(PARROT_INTERP,
- ARGIN(const STRING *str),
- ARGIN(const String_iter *iter),
- INTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-static UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP,
- ARGIN(const STRING *str),
- ARGMOD(String_iter *iter))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
-static void fixed8_iter_set_and_advance(PARROT_INTERP,
- ARGMOD(STRING *str),
- ARGMOD(String_iter *iter),
- UINTVAL c)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*str)
- FUNC_MODIFIES(*iter);
-
-static void fixed8_iter_set_position(SHIM_INTERP,
- ARGIN(const STRING *str),
- ARGMOD(String_iter *iter),
- UINTVAL pos)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
-static void fixed8_iter_skip(SHIM_INTERP,
- ARGIN(const STRING *str),
- ARGMOD(String_iter *iter),
- INTVAL skip)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
-static size_t fixed_8_hash(SHIM_INTERP,
- ARGIN(const STRING *s),
- size_t hashval)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL get_byte(SHIM_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_bytes(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL get_codepoint(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_codepoints(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static void set_byte(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL byte)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_DOES_NOT_RETURN
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_encoding(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(s) \
- , PARROT_ASSERT_ARG(typetable))
-#define ASSERT_ARGS_fixed8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(str) \
- , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(str) \
- , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(str) \
- , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(str) \
- , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(str) \
- , PARROT_ASSERT_ARG(iter))
-#define ASSERT_ARGS_fixed_8_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(s))
-#define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
-
-#define UNIMPL Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, \
- "unimpl fixed_8")
-
-/*
-
-=item C<static STRING * to_encoding(PARROT_INTERP, const STRING *src)>
-
-Converts the string C<src> to this particular encoding. If C<dest> is
-provided, it will contain the result. Otherwise this function operates in
-place.
-
-
-=cut
-
-*/
-
-PARROT_DOES_NOT_RETURN
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-to_encoding(PARROT_INTERP, SHIM(const STRING *src))
-{
- ASSERT_ARGS(to_encoding)
- UNIMPL;
-}
-
-
-/*
-
-=item C<static UINTVAL get_codepoint(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
-
-codepoints are bytes, so delegate
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-get_codepoint(PARROT_INTERP, ARGIN(const STRING *src),
- UINTVAL offset)
-{
- ASSERT_ARGS(get_codepoint)
- return get_byte(interp, src, offset);
-}
-
-
-/*
-
-=item C<static UINTVAL find_cclass(PARROT_INTERP, const STRING *s, const INTVAL
-*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
-
-codepoints are bytes, so delegate
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-find_cclass(SHIM_INTERP, ARGIN(const STRING *s), ARGIN(const INTVAL *typetable),
-INTVAL flags, UINTVAL pos, UINTVAL end)
-{
- ASSERT_ARGS(find_cclass)
- const unsigned char *contents = (const unsigned char *)s->strstart;
- for (; pos < end; ++pos) {
- if ((typetable[contents[pos]] & flags) != 0) {
- return pos;
- }
- }
- return end;
-}
-
-/*
-
-=item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
-
-Returns the byte in string C<src> at position C<offset>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(get_byte)
- const unsigned char *contents = (const unsigned char *)src->strstart;
-
- if (offset >= src->bufused) {
-/* Parrot_ex_throw_from_c_args(interp, NULL, 0,
- "get_byte past the end of the buffer (%i of %i)",
- offset, src->bufused); */
- return 0;
- }
-
- return contents[offset];
-}
-
-/*
-
-=item C<static void set_byte(PARROT_INTERP, const STRING *src, UINTVAL offset,
-UINTVAL byte)>
-
-Sets, in string C<src> at position C<offset>, the byte C<byte>.
-
-=cut
-
-*/
-
-static void
-set_byte(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL byte)
-{
- ASSERT_ARGS(set_byte)
- unsigned char *contents;
-
- if (offset >= src->bufused)
- Parrot_ex_throw_from_c_args(interp, NULL, 0,
- "set_byte past the end of the buffer");
-
- contents = (unsigned char *)src->strstart;
- contents[offset] = (unsigned char)byte;
-}
-
-/*
-
-=item C<static STRING * get_codepoints(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the codepoints in string C<src> at position C<offset> and length
-C<count>. (Delegates to C<get_bytes>.)
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(get_codepoints)
- STRING * const return_string = get_bytes(interp, src, offset, count);
- return_string->charset = src->charset;
- return return_string;
-}
-
-/*
-
-=item C<static STRING * get_bytes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the bytes in string C<src> at position C<offset> and length C<count>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_bytes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(get_bytes)
- STRING * const return_string = Parrot_str_copy(interp, src);
-
- return_string->encoding = src->encoding;
- return_string->charset = src->charset;
-
- return_string->strstart = (char *)return_string->strstart + offset ;
- return_string->bufused = count;
-
- return_string->strlen = count;
- return_string->hashval = 0;
-
- return return_string;
-}
-
-
-/*
-
-=item C<static UINTVAL codepoints(PARROT_INTERP, const STRING *src)>
-
-Returns the number of codepoints in string C<src>.
-
-=cut
-
-*/
-
-static UINTVAL
-codepoints(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(codepoints)
- return bytes(interp, src);
-}
-
-/*
-
-=item C<static UINTVAL bytes(PARROT_INTERP, const STRING *src)>
-
-Returns the number of bytes in string C<src>.
-
-=cut
-
-*/
-
-static UINTVAL
-bytes(SHIM_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(bytes)
- return src->bufused;
-}
-
-/*
- * iterator functions
- */
-
-/*
-
-=item C<static UINTVAL fixed8_iter_get(PARROT_INTERP, const STRING *str, const
-String_iter *iter, INTVAL offset)>
-
-Get the character at C<iter> plus C<offset>.
-
-=cut
-
-*/
-
-static UINTVAL
-fixed8_iter_get(PARROT_INTERP,
- ARGIN(const STRING *str), ARGIN(const String_iter *iter), INTVAL offset)
-{
- ASSERT_ARGS(fixed8_iter_get)
- return get_byte(interp, str, iter->charpos + offset);
-}
-
-/*
-
-=item C<static void fixed8_iter_skip(PARROT_INTERP, const STRING *str,
-String_iter *iter, INTVAL skip)>
-
-Moves the string iterator C<i> by C<skip> characters.
-
-=cut
-
-*/
-
-static void
-fixed8_iter_skip(SHIM_INTERP,
- ARGIN(const STRING *str), ARGMOD(String_iter *iter), INTVAL skip)
-{
- ASSERT_ARGS(fixed8_iter_skip)
- iter->bytepos += skip;
- iter->charpos += skip;
- PARROT_ASSERT(iter->bytepos <= Buffer_buflen(str));
-}
-
-/*
-
-=item C<static UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP, const STRING
-*str, String_iter *iter)>
-
-Moves the string iterator C<i> to the next codepoint.
-
-=cut
-
-*/
-
-static UINTVAL
-fixed8_iter_get_and_advance(PARROT_INTERP,
- ARGIN(const STRING *str), ARGMOD(String_iter *iter))
-{
- ASSERT_ARGS(fixed8_iter_get_and_advance)
- const UINTVAL c = get_byte(interp, str, iter->charpos++);
- iter->bytepos++;
- return c;
-}
-
-/*
-
-=item C<static void fixed8_iter_set_and_advance(PARROT_INTERP, STRING *str,
-String_iter *iter, UINTVAL c)>
-
-With the string iterator C<i>, appends the codepoint C<c> and advances to the
-next position in the string.
-
-=cut
-
-*/
-
-static void
-fixed8_iter_set_and_advance(PARROT_INTERP,
- ARGMOD(STRING *str), ARGMOD(String_iter *iter), UINTVAL c)
-{
- ASSERT_ARGS(fixed8_iter_set_and_advance)
- set_byte(interp, str, iter->charpos++, c);
- iter->bytepos++;
-}
-
-/*
-
-=item C<static void fixed8_iter_set_position(PARROT_INTERP, const STRING *str,
-String_iter *iter, UINTVAL pos)>
-
-Moves the string iterator C<i> to the position C<n> in the string.
-
-=cut
-
-*/
-
-static void
-fixed8_iter_set_position(SHIM_INTERP,
- ARGIN(const STRING *str), ARGMOD(String_iter *iter), UINTVAL pos)
-{
- ASSERT_ARGS(fixed8_iter_set_position)
- iter->bytepos = iter->charpos = pos;
- PARROT_ASSERT(pos <= Buffer_buflen(str));
-}
-
-/*
-
-=item C<static size_t fixed_8_hash(PARROT_INTERP, const STRING *s, size_t
-hashval)>
-
-Returns the hashed value of the string, given a seed in hashval.
-
-=cut
-
-*/
-
-static size_t
-fixed_8_hash(SHIM_INTERP, ARGIN(const STRING *s), size_t hashval)
-{
- ASSERT_ARGS(fixed_8_hash)
- const unsigned char *pos = (const unsigned char *)s->strstart;
- UINTVAL len = s->strlen;
-
- while (len--) {
- hashval += hashval << 5;
- hashval += *(pos++);
- }
-
- return hashval;
-}
-
-
-/*
-
-=item C<void Parrot_encoding_fixed_8_init(PARROT_INTERP)>
-
-Initializes the fixed-8 encoding.
-
-=cut
-
-*/
-
-void
-Parrot_encoding_fixed_8_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_encoding_fixed_8_init)
- ENCODING * const return_encoding = Parrot_new_encoding(interp);
-
- ENCODING base_encoding = {
- "fixed_8",
- 1, /* Max bytes per codepoint */
- to_encoding,
- get_codepoint,
- get_byte,
- set_byte,
- get_codepoints,
- get_bytes,
- codepoints,
- bytes,
- find_cclass,
- fixed_8_hash,
- fixed8_iter_get,
- fixed8_iter_skip,
- fixed8_iter_get_and_advance,
- fixed8_iter_set_and_advance,
- fixed8_iter_set_position
- };
-
- STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
- Parrot_register_encoding(interp, "fixed_8", return_encoding);
-
- return;
-}
-
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
-
Deleted: trunk/src/string/encoding/fixed_8.h
==============================================================================
--- trunk/src/string/encoding/fixed_8.h Tue Sep 7 22:58:38 2010 (r48832)
+++ /dev/null 00:00:00 1970 (deleted)
@@ -1,34 +0,0 @@
-/* fixed_8.h
- * Copyright (C) 2004-2007, Parrot Foundation.
- * SVN Info
- * $Id$
- * Overview:
- * This is the header for the 8-bit fixed-width encoding
- * Data Structure and Algorithms:
- * History:
- * Notes:
- * References:
- */
-
-#ifndef PARROT_ENCODING_FIXED_8_H_GUARD
-#define PARROT_ENCODING_FIXED_8_H_GUARD
-
-/* HEADERIZER BEGIN: src/string/encoding/fixed_8.c */
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-
-void Parrot_encoding_fixed_8_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-
-#define ASSERT_ARGS_Parrot_encoding_fixed_8_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: src/string/encoding/fixed_8.c */
-
-#endif /* PARROT_ENCODING_FIXED_8_H_GUARD */
-
-/*
- * Local variables:
- * c-file-style: "parrot"
- * End:
- * vim: expandtab shiftwidth=4:
- */
Added: trunk/src/string/encoding/latin1.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ trunk/src/string/encoding/latin1.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -0,0 +1,582 @@
+/*
+Copyright (C) 2004-2010, Parrot Foundation.
+$Id$
+
+=head1 NAME
+
+src/string/encoding/latin1.c
+
+=head1 DESCRIPTION
+
+This file implements encoding functions for ISO-8859-1 strings.
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "latin1.h"
+#include "shared.h"
+#include "tables.h"
+
+/* HEADERIZER HFILE: src/string/encoding/latin1.h */
+
+/* HEADERIZER BEGIN: static */
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+
+PARROT_CANNOT_RETURN_NULL
+static STRING * latin1_chr(PARROT_INTERP, UINTVAL codepoint)
+ __attribute__nonnull__(1);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* latin1_downcase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* latin1_downcase_first(PARROT_INTERP,
+ ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+static INTVAL latin1_find_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+static INTVAL latin1_find_not_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+static INTVAL latin1_is_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* latin1_titlecase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* latin1_titlecase_first(PARROT_INTERP,
+ ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static STRING * latin1_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* latin1_upcase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+static STRING* latin1_upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+static UINTVAL latin1_validate(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+#define ASSERT_ARGS_latin1_chr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_latin1_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_latin1_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: static */
+
+
+/*
+
+=item C<static STRING * latin1_to_encoding(PARROT_INTERP, const STRING *src)>
+
+Returns C<src> converted to ISO-8859-1. Throws EXCEPTION_LOSSY_CONVERSION for codepoints >= 0x100.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+static STRING *
+latin1_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_to_encoding)
+    STRING *dest;
+
+    if (STRING_max_bytes_per_codepoint(src) == 1) {
+        dest           = Parrot_str_clone(interp, src);    /* one byte per codepoint already: */
+        dest->encoding = Parrot_latin1_encoding_ptr;       /* clone, then switch the encoding */
+    }
+    else {
+        String_iter    iter;
+        unsigned char *p;
+        const UINTVAL  len = src->strlen;
+
+        dest = Parrot_str_new_init(interp, NULL, len,
+                Parrot_latin1_encoding_ptr, 0);
+        p    = (unsigned char *)dest->strstart;
+        STRING_ITER_INIT(interp, &iter);
+
+        while (iter.charpos < len) {
+            const UINTVAL c = STRING_iter_get_and_advance(interp, src, &iter);
+            if (c >= 0x100)     /* not representable in a single ISO-8859-1 byte */
+                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
+                    "lossy conversion to iso-8859-1");
+            *p++ = (unsigned char)c;
+        }
+
+        dest->bufused = len;    /* one output byte was written per input character */
+        dest->strlen  = len;
+    }
+
+    return dest;
+}
+
+
+/*
+
+=item C<static STRING * latin1_chr(PARROT_INTERP, UINTVAL codepoint)>
+
+Creates a new STRING from the single codepoint C<codepoint>.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING *
+latin1_chr(PARROT_INTERP, UINTVAL codepoint)
+{
+    ASSERT_ARGS(latin1_chr)
+    char    octet = (char)codepoint;    /* the single byte that makes up the new string */
+    STRING *chr_str;
+    chr_str = string_make(interp, &octet, 1, "iso-8859-1", 0);
+    PARROT_ASSERT(codepoint < 0x100);   /* the codepoint must fit into that one byte */
+    return chr_str;
+}
+
+
+/*
+
+=item C<static UINTVAL latin1_validate(PARROT_INTERP, const STRING *src)>
+
+Returns 1 if the STRING C<src> is a valid ISO-8859-1 STRING. Returns 0 otherwise.
+
+=cut
+
+*/
+
+static UINTVAL
+latin1_validate(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_validate)
+    const INTVAL n_chars = Parrot_str_length(interp, src);
+    INTVAL       idx;
+
+    for (idx = 0; idx < n_chars; ++idx) {
+        const UINTVAL cp = STRING_ord(interp, src, idx);
+        if (cp > 0xFF)              /* does not fit into a single ISO-8859-1 byte */
+            return 0;
+    }
+    return 1;
+}
+
+
+/*
+
+=item C<static INTVAL latin1_is_cclass(PARROT_INTERP, INTVAL flags, const STRING
+*src, UINTVAL offset)>
+
+Returns 1 if the character at C<offset> in C<src> is in a class of C<flags>, 0 otherwise.
+
+=cut
+
+*/
+
+static INTVAL
+latin1_is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
+{
+    ASSERT_ARGS(latin1_is_cclass)
+    UINTVAL codepoint;
+
+    if (offset >= src->strlen) return 0;    /* past the end: nothing to classify */
+    codepoint = STRING_ord(interp, src, offset);
+
+    if (codepoint >= sizeof (Parrot_iso_8859_1_typetable) /
+                     sizeof (Parrot_iso_8859_1_typetable[0])) {
+        return 0;                           /* outside the table that is indexed below */
+    }
+    return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
+}
+
+
+/*
+
+=item C<static INTVAL latin1_find_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
+Scans C<src> from C<offset> for at most C<count> characters (clipped to the string length) and
+returns the index of the first one in a class of C<flags>, or the end of the scanned range if none.
+
+=cut
+
+*/
+
+static INTVAL
+latin1_find_cclass(PARROT_INTERP, INTVAL flags,
+        ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+    ASSERT_ARGS(latin1_find_cclass)
+    const unsigned char * const bytes = (const unsigned char *)src->strstart;
+    UINTVAL                     limit = offset + count;
+    UINTVAL                     idx;
+
+    if (src->strlen < limit)
+        limit = src->strlen;            /* never scan past the end of the string */
+    for (idx = offset; idx < limit; ++idx)
+        if (Parrot_iso_8859_1_typetable[bytes[idx]] & flags)
+            return idx;                 /* first character in one of the classes */
+
+    return limit;                       /* no match inside the scanned range */
+}
+
+
+/*
+
+=item C<static INTVAL latin1_find_not_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
+Returns the index of the first scanned character in no class of C<flags>, or the end of the range.
+
+=cut
+
+*/
+
+static INTVAL
+latin1_find_not_cclass(PARROT_INTERP, INTVAL flags,
+        ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+    ASSERT_ARGS(latin1_find_not_cclass)
+    const unsigned char * const bytes = (const unsigned char *)src->strstart;
+    UINTVAL                     limit = offset + count;
+    UINTVAL                     idx;
+
+    if (src->strlen < limit)
+        limit = src->strlen;            /* never scan past the end of the string */
+    for (idx = offset; idx < limit; ++idx)
+        if (!(Parrot_iso_8859_1_typetable[bytes[idx]] & flags))
+            return idx;                 /* first character outside all the classes */
+
+    return limit;                       /* every scanned character matched */
+}
+
+
+/*
+
+=item C<static STRING* latin1_upcase(PARROT_INTERP, const STRING *src)>
+
+Returns a copy of the STRING C<src> with the letters 0xE0-0xFE (except 0xF7) mapped
+to their upper-case forms 0xC0-0xDE; every other byte is passed through toupper().
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+latin1_upcase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_upcase)
+    unsigned char *buffer;
+    UINTVAL        offset;
+    STRING        *result = Parrot_str_clone(interp, src);  /* src is const: edit a clone */
+
+    if (!result->strlen)
+        return result;
+
+    buffer = (unsigned char *)result->strstart;
+    for (offset = 0; offset < result->strlen; ++offset) {
+        unsigned int c = buffer[offset]; /* XXX use encoding ? */
+        if (c >= 0xe0 && c <= 0xfe && c != 0xf7)    /* 0xff has no Latin-1 upper case */
+            c &= ~0x20;                             /* 0xe0-0xfe -> 0xc0-0xde */
+        else
+            c = toupper((unsigned char)c);
+        buffer[offset] = (unsigned char)c;
+    }
+
+    return result;
+}
+
+
+/*
+
+=item C<static STRING* latin1_downcase(PARROT_INTERP, const STRING *src)>
+
+Converts all graphemes in STRING C<src> to lower-case, for those graphemes
+that support cases.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+latin1_downcase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_downcase)
+    STRING * const        lowered = Parrot_str_clone(interp, src);
+    const UINTVAL         n_chars = lowered->strlen;
+    unsigned char * const bytes   = (unsigned char *)lowered->strstart;
+    UINTVAL               idx;
+
+    /* 0xC0-0xDE, except 0xD7, are handled here by setting bit 0x20; every */
+    /* other byte value is handed to tolower(). The clone is edited in */
+    /* place, one byte per character. */
+    for (idx = 0; idx < n_chars; ++idx) {
+        const unsigned int ch = bytes[idx];
+
+        if (ch >= 0xc0 && ch <= 0xde && ch != 0xd7)
+            bytes[idx] = (unsigned char)(ch | 0x20);
+        else
+            bytes[idx] = (unsigned char)tolower((unsigned char)ch);
+    }
+
+    return lowered;
+}
+
+
+/*
+
+=item C<static STRING* latin1_titlecase(PARROT_INTERP, const STRING *src)>
+
+Returns a copy of the STRING C<src> with its first character converted to upper
+case and all following characters converted to lower case.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+latin1_titlecase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_titlecase)
+    unsigned char *buffer;
+    unsigned int   c;
+    UINTVAL        offset;
+    STRING        *result = Parrot_str_clone(interp, src);  /* src is const: edit a clone */
+
+    if (!result->strlen)
+        return result;
+
+    buffer = (unsigned char *)result->strstart;
+    c      = buffer[0];                             /* first character: upper case */
+    if (c >= 0xe0 && c <= 0xfe && c != 0xf7)        /* 0xff has no Latin-1 upper case */
+        c &= ~0x20;                                 /* 0xe0-0xfe -> 0xc0-0xde */
+    else
+        c = toupper((unsigned char)c);
+    buffer[0] = (unsigned char)c;
+
+    for (offset = 1; offset < result->strlen; ++offset) {   /* the rest: lower case */
+        c = buffer[offset];
+        if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
+            c |= 0x20;                              /* 0xc0-0xde -> 0xe0-0xfe */
+        else
+            c = tolower((unsigned char)c);
+        buffer[offset] = (unsigned char)c;
+    }
+
+    return result;
+}
+
+
+/*
+
+=item C<static STRING* latin1_upcase_first(PARROT_INTERP, const STRING *src)>
+
+Returns a copy of the STRING C<src> whose first character is converted to upper
+case; the remaining characters are copied unchanged.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+latin1_upcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_upcase_first)
+    unsigned char *buffer;
+    unsigned int   c;
+    STRING        *result = Parrot_str_clone(interp, src);  /* src is const: edit a clone */
+
+    if (!result->strlen)
+        return result;                              /* empty string: no first character */
+
+    buffer = (unsigned char *)result->strstart;
+    c      = buffer[0];
+    if (c >= 0xe0 && c <= 0xfe && c != 0xf7)        /* 0xff has no Latin-1 upper case */
+        c &= ~0x20;                                 /* 0xe0-0xfe -> 0xc0-0xde */
+    else
+        c = toupper((unsigned char)c);
+    buffer[0] = (unsigned char)c;
+
+    return result;
+}
+
+
+/*
+
+=item C<static STRING* latin1_downcase_first(PARROT_INTERP, const STRING *src)>
+
+Returns a copy of the STRING C<src> whose first character is converted to lower
+case; the remaining characters are copied unchanged.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+latin1_downcase_first(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(latin1_downcase_first)
+    unsigned char *buffer;
+    unsigned int   c;
+    STRING        *result = Parrot_str_clone(interp, src);  /* src is const: edit a clone */
+
+    if (!result->strlen)
+        return result;                              /* empty string: no first character */
+
+    buffer = (unsigned char *)result->strstart;
+    c      = buffer[0];
+    if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
+        c |= 0x20;      /* set (not clear) bit 0x20: 0xc0-0xde -> 0xe0-0xfe, as latin1_downcase */
+    else
+        c = tolower((unsigned char)c);
+    buffer[0] = (unsigned char)c;
+
+    return result;
+}
+
+
+/*
+
+=item C<static STRING* latin1_titlecase_first(PARROT_INTERP, const STRING *src)>
+
+Converts the first grapheme in STRING C<src> to title case, if the grapheme
+supports case.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+static STRING*
+latin1_titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(latin1_titlecase_first)
+ return latin1_upcase_first(interp, src); /* title-casing only the first character is done by upper-casing it */
+}
+
+
+static STR_VTABLE Parrot_latin1_encoding = { /* positional initializer: keep entries in STR_VTABLE member order */
+ 0,
+ "iso-8859-1",
+ NULL,
+ 1, /* Max bytes per codepoint */
+
+ latin1_to_encoding,
+ latin1_chr,
+
+ fixed8_equal, /* fixed8_* and encoding_* entries are not defined in this file; presumably declared in shared.h */
+ fixed8_compare,
+ fixed8_index,
+ fixed8_rindex,
+ fixed8_hash,
+ latin1_validate,
+
+ fixed8_scan,
+ fixed8_ord,
+ fixed8_substr,
+
+ latin1_is_cclass, /* character-class queries backed by Parrot_iso_8859_1_typetable */
+ latin1_find_cclass,
+ latin1_find_not_cclass,
+
+ encoding_get_graphemes,
+ fixed8_compose,
+ encoding_decompose,
+
+ latin1_upcase, /* case conversion: each returns a modified clone of its argument */
+ latin1_downcase,
+ latin1_titlecase,
+ latin1_upcase_first,
+ latin1_downcase_first,
+ latin1_titlecase_first,
+
+ fixed8_iter_get, /* string iterator callbacks */
+ fixed8_iter_skip,
+ fixed8_iter_get_and_advance,
+ fixed8_iter_set_and_advance,
+ fixed8_iter_set_position
+};
+
+STR_VTABLE *Parrot_latin1_encoding_ptr = &Parrot_latin1_encoding; /* non-static handle; also assigned to dest->encoding in latin1_to_encoding() */
+
+
+/*
+ * Local variables:
+ * c-file-style: "parrot"
+ * End:
+ * vim: expandtab shiftwidth=4:
+ */
Added: trunk/src/string/encoding/latin1.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ trunk/src/string/encoding/latin1.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -0,0 +1,30 @@
+/* latin1.h
+ * Copyright (C) 2004-2007, Parrot Foundation.
+ * SVN Info
+ * $Id$
+ * Overview:
+ * This is the header for the ISO-8859-1 (Latin-1) encoding functions
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#ifndef PARROT_ENCODING_LATIN1_H_GUARD
+#define PARROT_ENCODING_LATIN1_H_GUARD
+
+/* HEADERIZER BEGIN: src/string/encoding/latin1.c */
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+
+
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: src/string/encoding/latin1.c */
+
+#endif /* PARROT_ENCODING_LATIN1_H_GUARD */
+
+/*
+ * Local variables:
+ * c-file-style: "parrot"
+ * End:
+ * vim: expandtab shiftwidth=4:
+ */
Copied and modified: trunk/src/string/encoding/shared.c (from r48832, trunk/src/string/charset/unicode.c)
==============================================================================
--- trunk/src/string/charset/unicode.c Tue Sep 7 22:20:33 2010 (r48832, copy source)
+++ trunk/src/string/encoding/shared.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -1,14 +1,18 @@
/*
-Copyright (C) 2005-2010, Parrot Foundation.
+Copyright (C) 2004-2010, Parrot Foundation.
$Id$
=head1 NAME
-src/string/charset/unicode.c
+src/string/encoding/shared.c
=head1 DESCRIPTION
-This file implements the charset functions for unicode data
+This file implements general encoding functions for strings.
+
+Functions starting with encoding_ work with any type of string.
+Functions starting with fixed8_ work with fixed8 strings.
+Functions starting with unicode_ work with unicode strings.
=over 4
@@ -17,313 +21,171 @@
*/
#include "parrot/parrot.h"
-#include "unicode.h"
-#include "ascii.h"
#include "tables.h"
+#include "shared.h"
+
+#if PARROT_HAS_ICU
+# include <unicode/ucnv.h>
+# include <unicode/utypes.h>
+# include <unicode/uchar.h>
+# include <unicode/ustring.h>
+# include <unicode/unorm.h>
+#endif
-/* HEADERIZER HFILE: src/string/charset/unicode.h */
+/* HEADERIZER HFILE: src/string/encoding/shared.h */
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-static INTVAL compare(PARROT_INTERP,
- ARGIN(const STRING *lhs),
- ARGIN(const STRING *rhs))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* compose(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static size_t compute_hash(PARROT_INTERP,
- ARGIN(const STRING *src),
- size_t seed)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static INTVAL cs_rindex(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(const STRING *search_string),
- SHIM(UINTVAL offset))
- __attribute__nonnull__(1);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* decompose(PARROT_INTERP, SHIM(const STRING *src))
+static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
__attribute__nonnull__(1);
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING* downcase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
+#define ASSERT_ARGS_u_iscclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: static */
-static INTVAL find_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-
-static INTVAL find_not_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
+/* UNIMPL: throw EXCEPTION_UNIMPLEMENTED from C. Expands to a call that uses
+ * a variable named "interp", so it can only be used where one is in scope.
+ * NOTE(review): the message still says "fixed_8", but in this file the macro
+ * is used by encoding_rindex and encoding_decompose, which are not specific
+ * to fixed-8 strings -- confirm whether the text should be updated. */
+#define UNIMPL Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED, \
+ "unimpl fixed_8")
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_graphemes(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static INTVAL is_cclass(PARROT_INTERP,
- INTVAL flags,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(3);
-PARROT_CANNOT_RETURN_NULL
-static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
- __attribute__nonnull__(1);
+/*
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
+=item C<INTVAL encoding_equal(PARROT_INTERP, const STRING *lhs, const STRING
+*rhs)>
-PARROT_CANNOT_RETURN_NULL
-static STRING* titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
+Compares two STRINGs, C<lhs> and C<rhs>. If STRING C<lhs> == C<rhs>,
+returns 1. If C<lhs> != C<rhs> returns 0.
-PARROT_CANNOT_RETURN_NULL
-static STRING* to_charset(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
+=cut
-static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
- __attribute__nonnull__(1);
+*/
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_equal(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+{
+ ASSERT_ARGS(encoding_equal)
+ String_iter l_iter, r_iter;
+ const UINTVAL len = STRING_length(lhs);
-PARROT_CANNOT_RETURN_NULL
-static STRING* upcase_first(PARROT_INTERP, SHIM(const STRING *src))
- __attribute__nonnull__(1);
+ if (len != STRING_length(rhs))
+ return 0;
+ if (len == 0)
+ return 1;
+ if (lhs == rhs)
+ return 1;
+ if (lhs->hashval && rhs->hashval && lhs->hashval != rhs->hashval)
+ return 0;
-static UINTVAL validate(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-#define ASSERT_ARGS_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(lhs) \
- , PARROT_ASSERT_ARG(rhs))
-#define ASSERT_ARGS_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_compute_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_cs_rindex __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_decompose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_graphemes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_string_from_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_to_charset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_u_iscclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-/* HEADERIZER END: static */
+ STRING_ITER_INIT(interp, &l_iter);
+ STRING_ITER_INIT(interp, &r_iter);
-#ifdef EXCEPTION
-# undef EXCEPTION
-#endif
+ while (l_iter.charpos < len) {
+ const UINTVAL cl = STRING_iter_get_and_advance(interp, lhs, &l_iter);
+ const UINTVAL cr = STRING_iter_get_and_advance(interp, rhs, &r_iter);
-#if PARROT_HAS_ICU
-# include <unicode/ucnv.h>
-# include <unicode/utypes.h>
-# include <unicode/uchar.h>
-# include <unicode/ustring.h>
-# include <unicode/unorm.h>
-#endif
-#define EXCEPTION(err, str) \
- Parrot_ex_throw_from_c_args(interp, NULL, (err), (str))
+ if (cl != cr)
+ return 0;
+ }
-#define UNIMPL EXCEPTION(EXCEPTION_UNIMPLEMENTED, "unimplemented unicode")
+ return 1;
+}
/*
-=item C<static STRING * get_graphemes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
+=item C<INTVAL encoding_compare(PARROT_INTERP, const STRING *lhs, const STRING
+*rhs)>
-Gets the graphemes from STRING C<src> starting at C<offset>. Gets
-C<count> graphemes total.
+Compares two STRINGs, C<lhs> and C<rhs>. Returns -1 if C<lhs> < C<rhs>. Returns
+0 if C<lhs> = C<rhs>. Returns 1 if C<lhs> > C<rhs>.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_graphemes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
{
- ASSERT_ARGS(get_graphemes)
- return ENCODING_GET_CODEPOINTS(interp, src, offset, count);
-}
-
+ ASSERT_ARGS(encoding_compare)
+ String_iter l_iter, r_iter;
+ UINTVAL min_len, l_len, r_len;
-/*
+ STRING_ITER_INIT(interp, &l_iter);
+ STRING_ITER_INIT(interp, &r_iter);
-=item C<static STRING* to_charset(PARROT_INTERP, const STRING *src)>
+ l_len = lhs->strlen;
+ r_len = rhs->strlen;
-Converts input STRING C<src> to unicode STRING C<dest>.
+ min_len = l_len > r_len ? r_len : l_len;
-=cut
+ while (l_iter.charpos < min_len) {
+ const UINTVAL cl = STRING_iter_get_and_advance(interp, lhs, &l_iter);
+ const UINTVAL cr = STRING_iter_get_and_advance(interp, rhs, &r_iter);
-*/
+ if (cl != cr)
+ return cl < cr ? -1 : 1;
+ }
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-to_charset(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(to_charset)
- const charset_converter_t conversion_func =
- Parrot_find_charset_converter(interp, src->charset,
- Parrot_unicode_charset_ptr);
+ if (l_len < r_len)
+ return -1;
- if (conversion_func)
- return conversion_func(interp, src);
+ if (l_len > r_len)
+ return 1;
- return Parrot_utf8_encoding_ptr->to_encoding(interp, src);
+ return 0;
}
/*
-=item C<static STRING* compose(PARROT_INTERP, const STRING *src)>
-
-If Parrot is built with ICU, composes the STRING C<src>. Attempts to
-denormalize the STRING into the ICU default, NFC.
+=item C<INTVAL encoding_index(PARROT_INTERP, const STRING *src, const STRING
+*search, UINTVAL offs)>
-If Parrot does not have ICU included, throws an exception.
+Searches for the first instance of STRING C<search> in STRING C<src>.
+returns the position where the substring is found if it is indeed found.
+Returns -1 otherwise. Operates on different types of strings, not just
+ASCII.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-compose(PARROT_INTERP, ARGIN(const STRING *src))
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_index(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search),
+ UINTVAL offs)
{
- ASSERT_ARGS(compose)
-#if PARROT_HAS_ICU
- STRING *dest;
- int src_len, dest_len;
- UErrorCode err;
- /*
- U_STABLE int32_t U_EXPORT2
- unorm_normalize(const UChar *source, int32_t sourceLength,
- UNormalizationMode mode, int32_t options,
- UChar *result, int32_t resultLength,
- UErrorCode *status);
- */
- dest_len = src_len = src->strlen;
- dest = Parrot_str_new_init(interp, NULL, src_len * sizeof (UChar),
- src->encoding, src->charset, 0);
-
- err = U_ZERO_ERROR;
- dest_len = unorm_normalize((UChar *)src->strstart, src_len,
- UNORM_DEFAULT, /* default is NFC */
- 0, /* options 0 default - no specific icu
- * version */
- (UChar *)dest->strstart, dest_len, &err);
+ ASSERT_ARGS(encoding_index)
+ String_iter start, end;
- dest->bufused = dest_len * sizeof (UChar);
+ STRING_ITER_INIT(interp, &start);
+ STRING_iter_set_position(interp, src, &start, offs);
- if (!U_SUCCESS(err)) {
- err = U_ZERO_ERROR;
- Parrot_gc_reallocate_string_storage(interp, dest, dest->bufused);
- dest_len = unorm_normalize((UChar *)src->strstart, src_len,
- UNORM_DEFAULT, /* default is NFC */
- 0, /* options 0 default - no specific
- * icu version */
- (UChar *)dest->strstart, dest_len, &err);
- PARROT_ASSERT(U_SUCCESS(err));
- dest->bufused = dest_len * sizeof (UChar);
- }
- dest->strlen = dest_len;
- return dest;
-#else
- UNUSED(src);
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ return Parrot_str_iter_index(interp, src, &start, &end, search);
}
/*
-=item C<static STRING* decompose(PARROT_INTERP, const STRING *src)>
+=item C<INTVAL encoding_rindex(PARROT_INTERP, const STRING *src, const STRING
+*search_string, UINTVAL offset)>
-Decompose function for unicode charset. This function is not yet implemented.
+Finds the last index of substring C<search_string> in STRING C<src>,
+starting from C<offset>. Not implemented.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-decompose(PARROT_INTERP, SHIM(const STRING *src))
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_rindex(PARROT_INTERP, SHIM(const STRING *src),
+ SHIM(const STRING *search_string), SHIM(UINTVAL offset))
{
- ASSERT_ARGS(decompose)
+ ASSERT_ARGS(encoding_rindex)
/* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
UNIMPL;
}
@@ -331,380 +193,796 @@
/*
-=item C<static STRING* upcase(PARROT_INTERP, const STRING *src)>
-
-Converts the STRING C<src> to all upper-case graphemes, for those characters
-which support upper-case versions.
+=item C<size_t encoding_hash(PARROT_INTERP, const STRING *src, size_t seed)>
-Throws an exception if ICU is not installed.
+Computes the hash of the given STRING C<src> with starting seed value C<seed>.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase(PARROT_INTERP, ARGIN(const STRING *src))
+PARROT_WARN_UNUSED_RESULT
+size_t
+encoding_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed)
{
- ASSERT_ARGS(upcase)
-#if PARROT_HAS_ICU
- UErrorCode err;
- int dest_len, src_len, needed;
- STRING *res;
-#endif
-
- if (src->bufused == src->strlen
- && src->encoding == Parrot_utf8_encoding_ptr) {
- return Parrot_ascii_charset_ptr->upcase(interp, src);
- }
-
-#if PARROT_HAS_ICU
- /* to_encoding will allocate new string */
- res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
- /*
- U_CAPI int32_t U_EXPORT2
- u_strToUpper(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode);
- */
- err = U_ZERO_ERROR;
-
- /* use all available space - see below XXX */
- /* TODO downcase, titlecase too */
- dest_len = Buffer_buflen(res) / sizeof (UChar);
- src_len = res->bufused / sizeof (UChar);
-
- /*
- * XXX troubles:
- * t/op/string_cs_45 upcase unicode:"\u01f0"
- * this creates \u004a \u030c J+NON-SPACING HACEK
- * the string needs resizing, *if* the src buffer is
- * too short. *But* with icu 3.2/3.4 the src string is
- * overwritten with partial result, despite the icu docs sayeth:
- *
- * The source string and the destination buffer
- * are allowed to overlap.
- *
- * Workaround: 'preflighting' returns needed length
- * Alternative: forget about inplace operation - create new result
- *
- * TODO downcase, titlecase
- */
- needed = u_strToUpper(NULL, 0,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
-
- if (needed > dest_len) {
- Parrot_gc_reallocate_string_storage(interp, res, needed * sizeof (UChar));
- dest_len = needed;
- }
+ ASSERT_ARGS(encoding_hash)
+ String_iter iter;
+ size_t hashval = seed;
- err = U_ZERO_ERROR;
- dest_len = u_strToUpper((UChar *)res->strstart, dest_len,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
- PARROT_ASSERT(U_SUCCESS(err));
- res->bufused = dest_len * sizeof (UChar);
+ STRING_ITER_INIT(interp, &iter);
- /* downgrade if possible */
- if (dest_len == (int)src->strlen)
- res->encoding = Parrot_ucs2_encoding_ptr;
- else {
- /* string is likely still ucs2 if it was earlier
- * but strlen changed due to combining char
- */
- res->strlen = dest_len;
+ while (iter.charpos < src->strlen) {
+ const UINTVAL c = STRING_iter_get_and_advance(interp, src, &iter);
+ hashval += hashval << 5;
+ hashval += c;
}
- return res;
-
-#else
- UNUSED(src);
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+ return hashval;
}
/*
-=item C<static STRING* downcase(PARROT_INTERP, const STRING *src)>
-
-Converts all graphemes to lower-case, for those graphemes which have cases.
+=item C<static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)>
-Throws an exception if ICU is not installed.
+Returns Boolean.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase(PARROT_INTERP, ARGIN(const STRING *src))
+static int
+u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
- ASSERT_ARGS(downcase)
-#if PARROT_HAS_ICU
- UErrorCode err;
- int dest_len, src_len;
- STRING *res;
-#endif
-
- if (src->bufused == src->strlen
- && src->encoding == Parrot_utf8_encoding_ptr) {
- return Parrot_ascii_charset_ptr->downcase(interp, src);
- }
-
+ ASSERT_ARGS(u_iscclass)
#if PARROT_HAS_ICU
- /* to_encoding will allocate new string */
- res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
- /*
-U_CAPI int32_t U_EXPORT2
-u_strToLower(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode);
+ UNUSED(interp);
+ /* XXX which one
+ return u_charDigitValue(codepoint);
+ */
+ if ((flags & enum_cclass_uppercase) && u_isupper(codepoint)) return 1;
+ if ((flags & enum_cclass_lowercase) && u_islower(codepoint)) return 1;
+ if ((flags & enum_cclass_alphabetic) && u_isalpha(codepoint)) return 1;
+ if ((flags & enum_cclass_numeric) && u_isdigit(codepoint)) return 1;
+ if ((flags & enum_cclass_hexadecimal) && u_isxdigit(codepoint)) return 1;
+ if ((flags & enum_cclass_whitespace) && u_isspace(codepoint)) return 1;
+ if ((flags & enum_cclass_printing) && u_isprint(codepoint)) return 1;
+ if ((flags & enum_cclass_graphical) && u_isgraph(codepoint)) return 1;
+ if ((flags & enum_cclass_blank) && u_isblank(codepoint)) return 1;
+ if ((flags & enum_cclass_control) && u_iscntrl(codepoint)) return 1;
+ if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint)) return 1;
+ if ((flags & enum_cclass_word) &&
+ (u_isalnum(codepoint) || codepoint == '_')) return 1;
+
+ return 0;
+#else
+ if (codepoint < 256)
+ return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
+
+ if (flags == enum_cclass_any)
+ return 1;
+
+ /* All codepoints from u+0100 to u+02af are alphabetic, so we
+ * cheat on the WORD and ALPHABETIC properties to include these
+ * (and incorrectly exclude all others). This is a stopgap until
+ * ICU is everywhere, or we have better non-ICU unicode support. */
+ if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
+ return (codepoint < 0x2b0);
+
+ if (flags & enum_cclass_whitespace) {
+ /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
+ switch (codepoint) {
+ case 0x1680: case 0x180e: case 0x2000: case 0x2001:
+ case 0x2002: case 0x2003: case 0x2004: case 0x2005:
+ case 0x2006: case 0x2007: case 0x2008: case 0x2009:
+ case 0x200a: case 0x2028: case 0x2029: case 0x202f:
+ case 0x205f: case 0x3000:
+ return 1;
+ default:
+ break;
+ }
+ }
+
+ if (flags & enum_cclass_numeric) {
+ /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
+ if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1;
+ if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1;
+ if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1;
+ if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1;
+ if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1;
+ if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1;
+ if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1;
+ if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1;
+ if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1;
+ if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1;
+ if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1;
+ if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1;
+ if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1;
+ if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1;
+ if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1;
+ if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1;
+ if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1;
+ if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1;
+ if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1;
+ if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1;
+ if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1;
+ if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1;
+ }
+
+ if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+
+ return 0;
+#endif
+}
+
+
+/*
+
+=item C<UINTVAL encoding_scan(PARROT_INTERP, const STRING *src)>
+
+Returns the number of codepoints in string C<src>.
+
+=cut
+
+*/
+
+UINTVAL
+encoding_scan(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(encoding_scan)
+ String_iter iter;
+ /*
+ * this is used to initially calculate src->strlen,
+ * therefore we must scan the whole string
*/
- err = U_ZERO_ERROR;
- src_len = res->bufused / sizeof (UChar);
- dest_len = u_strToLower((UChar *)res->strstart, src_len,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
- res->bufused = dest_len * sizeof (UChar);
+ STRING_ITER_INIT(interp, &iter);
+ while (iter.bytepos < src->bufused)
+ STRING_iter_get_and_advance(interp, src, &iter);
+ return iter.charpos;
+}
- if (!U_SUCCESS(err)) {
- err = U_ZERO_ERROR;
- Parrot_gc_reallocate_string_storage(interp, res, res->bufused);
- dest_len = u_strToLower((UChar *)res->strstart, dest_len,
- (UChar *)res->strstart, src_len,
- NULL, /* locale = default */
- &err);
- PARROT_ASSERT(U_SUCCESS(err));
+
+/*
+
+=item C<STRING * encoding_substr(PARROT_INTERP, const STRING *src, UINTVAL
+offset, UINTVAL count)>
+
+Returns a copy of C<src> (made with C<Parrot_str_copy>) narrowed to the
+C<count> codepoints that begin at codepoint position C<offset>. The byte
+positions of both ends are found by moving a string iterator over C<src>.
+No range checking is done here, and the cached hash value of the result is
+reset to 0.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+STRING *
+encoding_substr(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+    ASSERT_ARGS(encoding_substr)
+
+    STRING * const result = Parrot_str_copy(interp, src);
+    String_iter    pos;
+    UINTVAL        first_byte;
+
+    STRING_ITER_INIT(interp, &pos);
+
+    /* A freshly initialised iterator is used as-is for offset 0. */
+    if (offset != 0)
+        STRING_iter_set_position(interp, src, &pos, offset);
+
+    first_byte       = pos.bytepos;
+    result->strstart = (char *)result->strstart + first_byte;
+
+    /* Move on to one past the last requested codepoint; with a count of 0
+     * the iterator stays put and the result is empty. */
+    if (count != 0)
+        STRING_iter_set_position(interp, src, &pos, offset + count);
+
+    result->bufused = pos.bytepos - first_byte;
+    result->strlen  = count;
+    result->hashval = 0;
+
+    return result;
+}
+
+
+/*
+
+=item C<INTVAL encoding_is_cclass(PARROT_INTERP, INTVAL flags, const STRING
+*src, UINTVAL offset)>
+
+Returns 1 if the codepoint at position C<offset> of C<src> belongs to at
+least one of the character classes in C<flags>, and 0 otherwise. An
+C<offset> at or beyond the end of the string yields 0. Codepoints below 256
+are looked up in C<Parrot_iso_8859_1_typetable>; all others are handed to
+C<u_iscclass>.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
+{
+    ASSERT_ARGS(encoding_is_cclass)
+
+    if (offset < src->strlen) {
+        const UINTVAL cp = STRING_ord(interp, src, offset);
+
+        if (cp < 256)
+            return (Parrot_iso_8859_1_typetable[cp] & flags) ? 1 : 0;
+
+        return u_iscclass(interp, cp, flags) != 0;
+    }
+
+    return 0;
+}
+
+
+/*
+
+=item C<INTVAL encoding_find_cclass(PARROT_INTERP, INTVAL flags, const STRING
+*src, UINTVAL offset, UINTVAL count)>
+
+Searches C<src> for the first codepoint that belongs to one of the character
+classes in C<flags>, looking at no more than C<count> codepoints starting at
+position C<offset>. Returns the position of the match. If nothing matches,
+returns the end of the searched range, i.e. C<offset + count> clamped to the
+length of the string. When C<offset> lies at or beyond that end the string
+is not touched at all.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src),
+        UINTVAL offset, UINTVAL count)
+{
+    ASSERT_ARGS(encoding_find_cclass)
+    String_iter iter;
+    UINTVAL     codepoint;
+    UINTVAL     end = offset + count;
+
+    /* Clamp the search window to the string before touching the iterator. */
+    end = src->strlen < end ? src->strlen : end;
+
+    /* Empty window: the loop below would not run and "end" would be
+     * returned anyway. Returning here avoids positioning the iterator at
+     * an offset that may lie past the end of the string. */
+    if (offset >= end)
+        return end;
+
+    STRING_ITER_INIT(interp, &iter);
+    STRING_iter_set_position(interp, src, &iter, offset);
+
+    while (iter.charpos < end) {
+        codepoint = STRING_iter_get_and_advance(interp, src, &iter);
+        if (codepoint >= 256) {
+            if (u_iscclass(interp, codepoint, flags))
+                return iter.charpos - 1;
+        }
+        else {
+            if (Parrot_iso_8859_1_typetable[codepoint] & flags)
+                return iter.charpos - 1;
+        }
     }
- /* downgrade if possible */
- if (dest_len == (int)res->strlen)
- res->encoding = Parrot_ucs2_encoding_ptr;
+    return end;
+}
- return res;
-#else
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+/*
+
+=item C<INTVAL encoding_find_not_cclass(PARROT_INTERP, INTVAL flags, const
+STRING *src, UINTVAL offset, UINTVAL count)>
+
+Searches C<src> for the first codepoint that belongs to none of the
+character classes in C<flags>, looking at no more than C<count> codepoints
+starting at position C<offset>. Returns the position of that codepoint. If
+every codepoint in the range is in one of the classes, or C<flags> is
+C<enum_cclass_any>, returns the end of the searched range, i.e.
+C<offset + count> clamped to the length of the string. If C<offset> is
+greater than the length of the string, C<offset + count> is returned
+unclamped.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+encoding_find_not_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src),
+        UINTVAL offset, UINTVAL count)
+{
+    ASSERT_ARGS(encoding_find_not_cclass)
+    String_iter iter;
+    UINTVAL     codepoint;
+    UINTVAL     end = offset + count;
+    int         bit;
+
+    if (offset > src->strlen) {
+        /* XXX: Throw in this case? */
+        return offset + count;
+    }
+
+    STRING_ITER_INIT(interp, &iter);
+
+    if (offset)
+        STRING_iter_set_position(interp, src, &iter, offset);
+
+    end = src->strlen < end ? src->strlen : end;
+
+    if (flags == enum_cclass_any)
+        return end;
+
+    while (iter.charpos < end) {
+        codepoint = STRING_iter_get_and_advance(interp, src, &iter);
+        if (codepoint >= 256) {
+            int in_class = 0;
+
+            /* u_iscclass is queried one class bit at a time. The codepoint
+             * counts as "in the class" as soon as any requested bit
+             * matches, which is the same any-bit test the table lookup
+             * below applies to codepoints under 256. */
+            for (bit = enum_cclass_uppercase;
+                    bit <= enum_cclass_word; bit <<= 1) {
+                if ((bit & flags) && u_iscclass(interp, codepoint, bit)) {
+                    in_class = 1;
+                    break;
+                }
+            }
+
+            if (!in_class)
+                return iter.charpos - 1;
+        }
+        else {
+            if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
+                return iter.charpos - 1;
+        }
+    }
+
+    return end;
+}
+
+
+/*
+
+=item C<STRING * encoding_get_graphemes(PARROT_INTERP, const STRING *src,
+UINTVAL offset, UINTVAL count)>
+
+Returns C<count> graphemes of the STRING C<src>, starting at C<offset>.
+For now this simply forwards to C<STRING_substr>, so the units returned are
+codepoints rather than true graphemes.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+encoding_get_graphemes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+{
+    ASSERT_ARGS(encoding_get_graphemes)
+    STRING * const graphemes = STRING_substr(interp, src, offset, count);
+
+    return graphemes;
+}
+
+
+/*
+
+=item C<STRING* encoding_decompose(PARROT_INTERP, const STRING *src)>
+
+Decompose function. This function is not yet implemented: C<src> is ignored
+and the body consists solely of the C<UNIMPL> macro, which throws
+C<EXCEPTION_UNIMPLEMENTED>.
+
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+STRING*
+encoding_decompose(PARROT_INTERP, SHIM(const STRING *src))
+{
+ ASSERT_ARGS(encoding_decompose)
+ /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
+ UNIMPL;
+}
+
+
+/*
+
+=item C<INTVAL fixed8_equal(PARROT_INTERP, const STRING *lhs, const STRING
+*rhs)>
+
+Tests a fixed8 string C<lhs> for equality with another string C<rhs>, which
+may use a different encoding. Returns 1 if both hold the same sequence of
+codepoints and 0 otherwise. Differing lengths, or differing non-zero cached
+hash values, settle the answer without looking at the contents.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+fixed8_equal(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+{
+    ASSERT_ARGS(fixed8_equal)
+    const UINTVAL len = STRING_length(lhs);
+
+    if (len != STRING_length(rhs))
+        return 0;
+    if (len == 0 || lhs == rhs)
+        return 1;
+    if (lhs->hashval && rhs->hashval && lhs->hashval != rhs->hashval)
+        return 0;
+
+    if (STRING_max_bytes_per_codepoint(rhs) != 1) {
+        /* rhs may use several bytes per codepoint: walk it with an
+         * iterator and index lhs by the iterator's character position. */
+        const unsigned char * const l_bytes = (unsigned char *)lhs->strstart;
+        String_iter                 r_iter;
+
+        STRING_ITER_INIT(interp, &r_iter);
+
+        while (r_iter.charpos < len) {
+            const UINTVAL l_cp = l_bytes[r_iter.charpos];
+            const UINTVAL r_cp = STRING_iter_get_and_advance(interp, rhs, &r_iter);
+
+            if (l_cp != r_cp)
+                return 0;
+        }
+
+        return 1;
+    }
+
+    /* Both sides are one byte per codepoint: compare the raw buffers. */
+    return memcmp(lhs->strstart, rhs->strstart, len) == 0;
+}
+
+
+/*
+
+=item C<INTVAL fixed8_compare(PARROT_INTERP, const STRING *lhs, const STRING
+*rhs)>
+
+Orders a fixed8 string C<lhs> against another string C<rhs>, which may use
+a different encoding. Returns -1 if C<lhs> sorts before C<rhs>, 1 if it
+sorts after, and 0 if they are equal. The common prefix is compared
+codepoint by codepoint; if it is identical, the shorter string sorts first.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+fixed8_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+{
+    ASSERT_ARGS(fixed8_compare)
+    const UINTVAL l_len  = lhs->strlen;
+    const UINTVAL r_len  = rhs->strlen;
+    const UINTVAL common = l_len < r_len ? l_len : r_len;
+
+    if (STRING_max_bytes_per_codepoint(rhs) != 1) {
+        /* rhs may use several bytes per codepoint: walk it with an
+         * iterator and index lhs by the iterator's character position. */
+        const unsigned char * const l_bytes = (unsigned char *)lhs->strstart;
+        String_iter                 r_iter;
+
+        STRING_ITER_INIT(interp, &r_iter);
+
+        while (r_iter.charpos < common) {
+            const UINTVAL l_cp = l_bytes[r_iter.charpos];
+            const UINTVAL r_cp = STRING_iter_get_and_advance(interp, rhs, &r_iter);
+
+            if (l_cp != r_cp)
+                return l_cp < r_cp ? -1 : 1;
+        }
+    }
+    else {
+        /* Both sides are one byte per codepoint: compare the raw buffers
+         * and normalise memcmp's result to -1 / 1. */
+        const int diff = memcmp(lhs->strstart, rhs->strstart, common);
+
+        if (diff < 0)
+            return -1;
+        if (diff > 0)
+            return 1;
+    }
+
+    /* Common prefix is identical; the lengths decide. */
+    if (l_len == r_len)
+        return 0;
+
+    return l_len < r_len ? -1 : 1;
+}
+
+
+/*
+
+=item C<INTVAL fixed8_index(PARROT_INTERP, const STRING *src, const STRING
+*search_string, UINTVAL offset)>
+
+Searches the fixed8 STRING C<src> for STRING C<search_string>, starting at
+C<offset>, and returns the result of the underlying search routine. When
+C<search_string> is also one byte per codepoint the search is done by
+C<Parrot_byte_index>; otherwise it is handed to the generic
+C<encoding_index>.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+fixed8_index(PARROT_INTERP, ARGIN(const STRING *src),
+        ARGIN(const STRING *search_string), UINTVAL offset)
+{
+    ASSERT_ARGS(fixed8_index)
+
+    if (STRING_max_bytes_per_codepoint(search_string) == 1) {
+        PARROT_ASSERT(STRING_max_bytes_per_codepoint(src) == 1);
+        return Parrot_byte_index(interp, src, search_string, offset);
+    }
+
+    return encoding_index(interp, src, search_string, offset);
+}
+
+
+/*
+
+=item C<INTVAL fixed8_rindex(PARROT_INTERP, const STRING *src, const STRING
+*search_string, UINTVAL offset)>
+
+Reverse search of the fixed8 STRING C<src> for STRING C<search_string>,
+starting at C<offset>; the work is done by C<Parrot_byte_rindex>. Throws
+C<EXCEPTION_UNIMPLEMENTED> if C<search_string> is not one byte per
+codepoint.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+fixed8_rindex(PARROT_INTERP, ARGIN(const STRING *src),
+        ARGIN(const STRING *search_string), UINTVAL offset)
+{
+    ASSERT_ARGS(fixed8_rindex)
+
+    if (STRING_max_bytes_per_codepoint(search_string) != 1) {
+        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
+            "Cross-charset rindex not supported");
+    }
+
+    PARROT_ASSERT(STRING_max_bytes_per_codepoint(src) == 1);
+
+    return Parrot_byte_rindex(interp, src, search_string, offset);
+}
+
+
+/*
+
+=item C<size_t fixed8_hash(PARROT_INTERP, const STRING *s, size_t hashval)>
+
+Returns the hash of the string C<s>, starting from the seed C<hashval>.
+Each of the first C<strlen> bytes is folded in as
+C<hashval = hashval * 33 + byte>.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+size_t
+fixed8_hash(SHIM_INTERP, ARGIN(const STRING *s), size_t hashval)
+{
+    ASSERT_ARGS(fixed8_hash)
+    const unsigned char * const bytes = (const unsigned char *)s->strstart;
+    const UINTVAL               n     = s->strlen;
+    UINTVAL                     i;
+
+    for (i = 0; i < n; ++i)
+        hashval = hashval + (hashval << 5) + bytes[i];
+
+    return hashval;
+}
+
+
+/*
+
+=item C<UINTVAL fixed8_scan(PARROT_INTERP, const STRING *src)>
+
+Returns the number of codepoints in string C<src>. No scanning is needed
+for fixed encodings: the used byte count C<bufused> is returned directly.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+UINTVAL
+fixed8_scan(PARROT_INTERP, ARGIN(const STRING *src))
+{
+    ASSERT_ARGS(fixed8_scan)
+    const UINTVAL codepoints = src->bufused;
+
+    return codepoints;
+}
+
+
+/*
+
+=item C<UINTVAL fixed8_ord(PARROT_INTERP, const STRING *src, UINTVAL offset)>
+
+Returns the codepoint at position C<offset> of the fixed8 string C<src>,
+which is simply the byte stored there. An C<offset> at or beyond the used
+part of the buffer yields 0 instead of an exception.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+UINTVAL
+fixed8_ord(PARROT_INTERP, ARGIN(const STRING *src),
+        UINTVAL offset)
+{
+    ASSERT_ARGS(fixed8_ord)
+
+    if (offset < src->bufused) {
+        const unsigned char * const bytes = (unsigned char *)src->strstart;
+
+        return bytes[offset];
+    }
+
+    /* Past the end of the buffer. A throw for this case
+     * ("fixed8_ord past the end of the buffer (%i of %i)") was commented
+     * out when this function was added; 0 is returned instead. */
+    return 0;
 }
/*
-=item C<static STRING* titlecase(PARROT_INTERP, const STRING *src)>
+=item C<STRING * fixed8_substr(PARROT_INTERP, const STRING *src, UINTVAL offset,
+UINTVAL count)>
-Converts the string to title case, for those characters which support cases.
-
-Throws an exception if ICU is not installed.
+Returns the codepoints in string C<src> at position C<offset> and length
+C<count>.
=cut
*/
+PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase(PARROT_INTERP, ARGIN(const STRING *src))
+STRING *
+fixed8_substr(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
{
- ASSERT_ARGS(titlecase)
-#if PARROT_HAS_ICU
+ ASSERT_ARGS(fixed8_substr)
+ STRING * const return_string = Parrot_str_copy(interp, src);
- UErrorCode err;
- int dest_len, src_len;
- STRING *res;
+ return_string->encoding = src->encoding;
- if (src->bufused == src->strlen
- && src->encoding == Parrot_utf8_encoding_ptr) {
- return Parrot_ascii_charset_ptr->titlecase(interp, src);
- }
+ return_string->strstart = (char *)return_string->strstart + offset ;
+ return_string->bufused = count;
- /* to_encoding will allocate new string */
- res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
+ return_string->strlen = count;
+ return_string->hashval = 0;
- /*
-U_CAPI int32_t U_EXPORT2
-u_strToTitle(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- UBreakIterator *titleIter,
- const char *locale,
- UErrorCode *pErrorCode);
- */
+ return return_string;
+}
- err = U_ZERO_ERROR;
- src_len = res->bufused / sizeof (UChar);
- dest_len = u_strToTitle((UChar *)res->strstart, src_len,
- (UChar *)res->strstart, src_len,
- NULL, /* default titleiter */
- NULL, /* locale = default */
- &err);
- res->bufused = dest_len * sizeof (UChar);
- if (!U_SUCCESS(err)) {
- err = U_ZERO_ERROR;
- Parrot_gc_reallocate_string_storage(interp, res, res->bufused);
- dest_len = u_strToTitle((UChar *)res->strstart, dest_len,
- (UChar *)res->strstart, src_len,
- NULL, NULL,
- &err);
- PARROT_ASSERT(U_SUCCESS(err));
- }
+/*
- /* downgrade if possible */
- if (dest_len == (int)res->strlen)
- res->encoding = Parrot_ucs2_encoding_ptr;
+=item C<STRING* fixed8_compose(PARROT_INTERP, const STRING *src)>
- return res;
+Can't compose ASCII strings, so performs a string copy on it and
+returns the new string.
-#else
- UNUSED(src);
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
+=cut
+
+*/
+
+PARROT_CANNOT_RETURN_NULL
+STRING*
+fixed8_compose(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(fixed8_compose)
+
+ return Parrot_str_copy(interp, src);
}
/*
-=item C<static STRING* upcase_first(PARROT_INTERP, const STRING *src)>
+=item C<UINTVAL fixed8_iter_get(PARROT_INTERP, const STRING *str, const
+String_iter *iter, INTVAL offset)>
-Converts the first grapheme in the STRING C<src> to uppercase, if the
-grapheme supports it. Not implemented.
+Get the character at C<iter> plus C<offset>.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-upcase_first(PARROT_INTERP, SHIM(const STRING *src))
+UINTVAL
+fixed8_iter_get(PARROT_INTERP,
+ ARGIN(const STRING *str), ARGIN(const String_iter *iter), INTVAL offset)
{
- ASSERT_ARGS(upcase_first)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
+ ASSERT_ARGS(fixed8_iter_get)
+ return fixed8_ord(interp, str, iter->charpos + offset);
}
/*
-=item C<static STRING* downcase_first(PARROT_INTERP, const STRING *src)>
+=item C<void fixed8_iter_skip(PARROT_INTERP, const STRING *str, String_iter
+*iter, INTVAL skip)>
-Converts the first grapheme in the STRING C<src> to lower-case, if
-the grapheme supports it. Not implemented
+Moves the string iterator C<iter> by C<skip> characters.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-downcase_first(PARROT_INTERP, SHIM(const STRING *src))
+void
+fixed8_iter_skip(SHIM_INTERP,
+ ARGIN(const STRING *str), ARGMOD(String_iter *iter), INTVAL skip)
{
- ASSERT_ARGS(downcase_first)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
+ ASSERT_ARGS(fixed8_iter_skip)
+ iter->bytepos += skip;
+ iter->charpos += skip;
+ PARROT_ASSERT(iter->bytepos <= Buffer_buflen(str));
}
/*
-=item C<static STRING* titlecase_first(PARROT_INTERP, const STRING *src)>
+=item C<UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP, const STRING *str,
+String_iter *iter)>
-Converts the first grapheme in STRING C<src> to title case, if the
-string supports it. Not implemented.
+Returns the character at the iterator C<iter> and moves it to the next codepoint.
=cut
*/
-PARROT_CANNOT_RETURN_NULL
-static STRING*
-titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
+UINTVAL
+fixed8_iter_get_and_advance(PARROT_INTERP,
+ ARGIN(const STRING *str), ARGMOD(String_iter *iter))
{
- ASSERT_ARGS(titlecase_first)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
+ ASSERT_ARGS(fixed8_iter_get_and_advance)
+ const UINTVAL c = fixed8_ord(interp, str, iter->charpos++);
+ iter->bytepos++;
+ return c;
}
/*
-=item C<static INTVAL compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
+=item C<void fixed8_iter_set_and_advance(PARROT_INTERP, STRING *str, String_iter
+*iter, UINTVAL c)>
-Compares two STRINGs, C<lhs> and C<rhs>. Returns -1 if C<lhs> < C<rhs>. Returns
-0 if C<lhs> = C<rhs>. Returns 1 if C<lhs> > C<rhs>.
+Stores the codepoint C<c> at the position of the string iterator C<iter> and
+advances C<iter> to the next position in the string.
=cut
*/
-static INTVAL
-compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+void
+fixed8_iter_set_and_advance(PARROT_INTERP,
+ ARGMOD(STRING *str), ARGMOD(String_iter *iter), UINTVAL c)
{
- ASSERT_ARGS(compare)
- String_iter l_iter, r_iter;
- UINTVAL min_len, l_len, r_len;
-
- /* TODO make optimized equal - strings are equal length then already */
- STRING_ITER_INIT(interp, &l_iter);
- STRING_ITER_INIT(interp, &r_iter);
+ ASSERT_ARGS(fixed8_iter_set_and_advance)
+ unsigned char *buf = (unsigned char *)str->strstart;
+ buf[iter->charpos++] = c;
+ iter->bytepos++;
+}
- l_len = lhs->strlen;
- r_len = rhs->strlen;
- min_len = l_len > r_len ? r_len : l_len;
+/*
- while (l_iter.charpos < min_len) {
- const UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, lhs, &l_iter);
- const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &r_iter);
+=item C<void fixed8_iter_set_position(PARROT_INTERP, const STRING *str,
+String_iter *iter, UINTVAL pos)>
- if (cl != cr)
- return cl < cr ? -1 : 1;
- }
+Moves the string iterator C<iter> to the position C<pos> in the string.
- if (l_len < r_len)
- return -1;
+=cut
- if (l_len > r_len)
- return 1;
+*/
- return 0;
+void
+fixed8_iter_set_position(SHIM_INTERP,
+ ARGIN(const STRING *str), ARGMOD(String_iter *iter), UINTVAL pos)
+{
+ ASSERT_ARGS(fixed8_iter_set_position)
+ iter->bytepos = iter->charpos = pos;
+ PARROT_ASSERT(pos <= Buffer_buflen(str));
}
/*
-=item C<static INTVAL cs_rindex(PARROT_INTERP, const STRING *src, const STRING
-*search_string, UINTVAL offset)>
+=item C<STRING * unicode_chr(PARROT_INTERP, UINTVAL codepoint)>
-Finds the last index of substring C<search_string> in STRING C<src>,
-starting from C<offset>. Not implemented.
+Returns a one-codepoint string for the given codepoint.
=cut
*/
-static INTVAL
-cs_rindex(PARROT_INTERP, SHIM(const STRING *src),
- SHIM(const STRING *search_string), SHIM(UINTVAL offset))
+PARROT_CANNOT_RETURN_NULL
+STRING *
+unicode_chr(PARROT_INTERP, UINTVAL codepoint)
{
- ASSERT_ARGS(cs_rindex)
- /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
- UNIMPL;
+ ASSERT_ARGS(unicode_chr)
+ String_iter iter;
+ STRING * const dest = string_make(interp, "", 1, "unicode", 0);
+
+ dest->strlen = 1;
+
+ STRING_ITER_INIT(interp, &iter);
+ STRING_iter_set_and_advance(interp, dest, &iter, codepoint);
+ dest->bufused = iter.bytepos;
+
+ return dest;
}
/*
-=item C<static UINTVAL validate(PARROT_INTERP, const STRING *src)>
+=item C<UINTVAL unicode_validate(PARROT_INTERP, const STRING *src)>
Returns 1 if the STRING C<src> is a valid unicode string, returns 0 otherwise.
@@ -712,16 +990,16 @@
*/
-static UINTVAL
-validate(PARROT_INTERP, ARGIN(const STRING *src))
+UINTVAL
+unicode_validate(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(validate)
+ ASSERT_ARGS(unicode_validate)
String_iter iter;
- const INTVAL length = Parrot_str_length(interp, src);
+ const UINTVAL length = Parrot_str_length(interp, src);
STRING_ITER_INIT(interp, &iter);
while (iter.charpos < length) {
- const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
+ const UINTVAL codepoint = STRING_iter_get_and_advance(interp, src, &iter);
/* Check for Unicode non-characters */
if (codepoint >= 0xfdd0
&& (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
@@ -735,335 +1013,369 @@
/*
-=item C<static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)>
+=item C<STRING* unicode_compose(PARROT_INTERP, const STRING *src)>
-Returns Boolean.
+If Parrot is built with ICU, composes the STRING C<src>. Attempts to
+normalize the STRING into the ICU default, NFC.
+
+If Parrot does not have ICU included, throws an exception.
=cut
*/
-static int
-u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
+PARROT_CANNOT_RETURN_NULL
+STRING*
+unicode_compose(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(u_iscclass)
+ ASSERT_ARGS(unicode_compose)
#if PARROT_HAS_ICU
- UNUSED(interp);
- /* XXX which one
- return u_charDigitValue(codepoint);
- */
- if ((flags & enum_cclass_uppercase) && u_isupper(codepoint)) return 1;
- if ((flags & enum_cclass_lowercase) && u_islower(codepoint)) return 1;
- if ((flags & enum_cclass_alphabetic) && u_isalpha(codepoint)) return 1;
- if ((flags & enum_cclass_numeric) && u_isdigit(codepoint)) return 1;
- if ((flags & enum_cclass_hexadecimal) && u_isxdigit(codepoint)) return 1;
- if ((flags & enum_cclass_whitespace) && u_isspace(codepoint)) return 1;
- if ((flags & enum_cclass_printing) && u_isprint(codepoint)) return 1;
- if ((flags & enum_cclass_graphical) && u_isgraph(codepoint)) return 1;
- if ((flags & enum_cclass_blank) && u_isblank(codepoint)) return 1;
- if ((flags & enum_cclass_control) && u_iscntrl(codepoint)) return 1;
- if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint)) return 1;
- if ((flags & enum_cclass_word) &&
- (u_isalnum(codepoint) || codepoint == '_')) return 1;
+ STRING *dest;
+ int src_len, dest_len;
+ UErrorCode err;
+ /*
+ U_STABLE int32_t U_EXPORT2
+ unorm_normalize(const UChar *source, int32_t sourceLength,
+ UNormalizationMode mode, int32_t options,
+ UChar *result, int32_t resultLength,
+ UErrorCode *status);
+ */
+ dest_len = src_len = src->strlen;
+ dest = Parrot_str_new_init(interp, NULL, src_len * sizeof (UChar),
+ src->encoding, 0);
- return 0;
+ err = U_ZERO_ERROR;
+ dest_len = unorm_normalize((UChar *)src->strstart, src_len,
+ UNORM_DEFAULT, /* default is NFC */
+ 0, /* options 0 default - no specific icu
+ * version */
+ (UChar *)dest->strstart, dest_len, &err);
+
+ dest->bufused = dest_len * sizeof (UChar);
+
+ if (!U_SUCCESS(err)) {
+ err = U_ZERO_ERROR;
+ Parrot_gc_reallocate_string_storage(interp, dest, dest->bufused);
+ dest_len = unorm_normalize((UChar *)src->strstart, src_len,
+ UNORM_DEFAULT, /* default is NFC */
+ 0, /* options 0 default - no specific
+ * icu version */
+ (UChar *)dest->strstart, dest_len, &err);
+ PARROT_ASSERT(U_SUCCESS(err));
+ dest->bufused = dest_len * sizeof (UChar);
+ }
+ dest->strlen = dest_len;
+ return dest;
#else
- if (codepoint < 256)
- return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
+ UNUSED(src);
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+#endif
+}
- if (flags == enum_cclass_any)
- return 1;
- /* All codepoints from u+0100 to u+02af are alphabetic, so we
- * cheat on the WORD and ALPHABETIC properties to include these
- * (and incorrectly exclude all others). This is a stopgap until
- * ICU is everywhere, or we have better non-ICU unicode support. */
- if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
- return (codepoint < 0x2b0);
+/*
- if (flags & enum_cclass_whitespace) {
- /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
- switch (codepoint) {
- case 0x1680: case 0x180e: case 0x2000: case 0x2001:
- case 0x2002: case 0x2003: case 0x2004: case 0x2005:
- case 0x2006: case 0x2007: case 0x2008: case 0x2009:
- case 0x200a: case 0x2028: case 0x2029: case 0x202f:
- case 0x205f: case 0x3000:
- return 1;
- default:
- break;
- }
- }
+=item C<STRING* unicode_upcase(PARROT_INTERP, const STRING *src)>
- if (flags & enum_cclass_numeric) {
- /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
- if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1;
- if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1;
- if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1;
- if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1;
- if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1;
- if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1;
- if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1;
- if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1;
- if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1;
- if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1;
- if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1;
- if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1;
- if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1;
- if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1;
- if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1;
- if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1;
- if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1;
- if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1;
- if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1;
- if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1;
- if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1;
- if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1;
- }
+Converts the STRING C<src> to all upper-case graphemes, for those characters
+which support upper-case versions.
- if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
+Without ICU, throws an exception unless C<src> is UTF-8 with C<bufused> equal to C<strlen>.
- return 0;
-#endif
-}
+=cut
+*/
-/*
+PARROT_CANNOT_RETURN_NULL
+STRING*
+unicode_upcase(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(unicode_upcase)
+#if PARROT_HAS_ICU
+ UErrorCode err;
+ int dest_len, src_len, needed;
+ STRING *res;
+#endif
-=item C<static INTVAL is_cclass(PARROT_INTERP, INTVAL flags, const STRING *src,
-UINTVAL offset)>
+ if (src->bufused == src->strlen
+ && src->encoding == Parrot_utf8_encoding_ptr) {
+ return Parrot_ascii_encoding_ptr->upcase(interp, src);
+ }
-Returns Boolean.
+#if PARROT_HAS_ICU
+ /* to_encoding will allocate new string */
+ res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
+ /*
+ U_CAPI int32_t U_EXPORT2
+ u_strToUpper(UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ const char *locale,
+ UErrorCode *pErrorCode);
+ */
+ err = U_ZERO_ERROR;
-=cut
+ /* use all available space - see below XXX */
+ /* TODO downcase, titlecase too */
+ dest_len = Buffer_buflen(res) / sizeof (UChar);
+ src_len = res->bufused / sizeof (UChar);
-*/
+ /*
+ * XXX troubles:
+ * t/op/string_cs_45 upcase unicode:"\u01f0"
+ * this creates \u004a \u030c J+NON-SPACING HACEK
+ * the string needs resizing, *if* the src buffer is
+ * too short. *But* with icu 3.2/3.4 the src string is
+ * overwritten with partial result, despite the icu docs sayeth:
+ *
+ * The source string and the destination buffer
+ * are allowed to overlap.
+ *
+ * Workaround: 'preflighting' returns needed length
+ * Alternative: forget about inplace operation - create new result
+ *
+ * TODO downcase, titlecase
+ */
+ needed = u_strToUpper(NULL, 0,
+ (UChar *)res->strstart, src_len,
+ NULL, /* locale = default */
+ &err);
-static INTVAL
-is_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(is_cclass)
- UINTVAL codepoint;
+ if (needed > dest_len) {
+ Parrot_gc_reallocate_string_storage(interp, res, needed * sizeof (UChar));
+ dest_len = needed;
+ }
- if (offset >= src->strlen)
- return 0;
+ err = U_ZERO_ERROR;
+ dest_len = u_strToUpper((UChar *)res->strstart, dest_len,
+ (UChar *)res->strstart, src_len,
+ NULL, /* locale = default */
+ &err);
+ PARROT_ASSERT(U_SUCCESS(err));
+ res->bufused = dest_len * sizeof (UChar);
- codepoint = ENCODING_GET_CODEPOINT(interp, src, offset);
+ /* downgrade if possible */
+ if (dest_len == (int)src->strlen)
+ res->encoding = Parrot_ucs2_encoding_ptr;
+ else {
+ /* string is likely still ucs2 if it was earlier
+ * but strlen changed due to combining char
+ */
+ res->strlen = dest_len;
+ }
- if (codepoint >= 256)
- return u_iscclass(interp, codepoint, flags) != 0;
+ return res;
- return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
+#else
+ UNUSED(src);
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+#endif
}
/*
-=item C<static INTVAL find_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
+=item C<STRING* unicode_downcase(PARROT_INTERP, const STRING *src)>
-Find a character in the given character class.
+Converts all graphemes to lower-case, for those graphemes which have cases.
+
+Without ICU, throws an exception unless C<src> is UTF-8 with C<bufused> equal to C<strlen>.
=cut
*/
-static INTVAL
-find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+PARROT_CANNOT_RETURN_NULL
+STRING*
+unicode_downcase(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(find_cclass)
- String_iter iter;
- UINTVAL codepoint;
- UINTVAL end = offset + count;
+ ASSERT_ARGS(unicode_downcase)
+#if PARROT_HAS_ICU
+ UErrorCode err;
+ int dest_len, src_len;
+ STRING *res;
+#endif
- STRING_ITER_INIT(interp, &iter);
- STRING_ITER_SET_POSITION(interp, src, &iter, offset);
+ if (src->bufused == src->strlen
+ && src->encoding == Parrot_utf8_encoding_ptr) {
+ return Parrot_ascii_encoding_ptr->downcase(interp, src);
+ }
- end = src->strlen < end ? src->strlen : end;
+#if PARROT_HAS_ICU
+ /* to_encoding will allocate new string */
+ res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
+ /*
+U_CAPI int32_t U_EXPORT2
+u_strToLower(UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ const char *locale,
+ UErrorCode *pErrorCode);
+ */
+ err = U_ZERO_ERROR;
+ src_len = res->bufused / sizeof (UChar);
+ dest_len = u_strToLower((UChar *)res->strstart, src_len,
+ (UChar *)res->strstart, src_len,
+ NULL, /* locale = default */
+ &err);
+ res->bufused = dest_len * sizeof (UChar);
- while (iter.charpos < end) {
- codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- if (codepoint >= 256) {
- if (u_iscclass(interp, codepoint, flags))
- return iter.charpos - 1;
- }
- else {
- if (Parrot_iso_8859_1_typetable[codepoint] & flags)
- return iter.charpos - 1;
- }
+ if (!U_SUCCESS(err)) {
+ err = U_ZERO_ERROR;
+ Parrot_gc_reallocate_string_storage(interp, res, res->bufused);
+ dest_len = u_strToLower((UChar *)res->strstart, dest_len,
+ (UChar *)res->strstart, src_len,
+ NULL, /* locale = default */
+ &err);
+ PARROT_ASSERT(U_SUCCESS(err));
}
- return end;
+ /* downgrade if possible */
+ if (dest_len == (int)res->strlen)
+ res->encoding = Parrot_ucs2_encoding_ptr;
+
+ return res;
+
+#else
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+#endif
}
/*
-=item C<static INTVAL find_not_cclass(PARROT_INTERP, INTVAL flags, const STRING
-*src, UINTVAL offset, UINTVAL count)>
+=item C<STRING* unicode_titlecase(PARROT_INTERP, const STRING *src)>
-Returns C<INTVAL>.
+Converts the string to title case, for those characters which support cases.
+
+Throws an exception if ICU is not installed.
=cut
*/
-static INTVAL
-find_not_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src),
- UINTVAL offset, UINTVAL count)
+PARROT_CANNOT_RETURN_NULL
+STRING*
+unicode_titlecase(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(find_not_cclass)
- String_iter iter;
- UINTVAL codepoint;
- UINTVAL end = offset + count;
- int bit;
+ ASSERT_ARGS(unicode_titlecase)
+#if PARROT_HAS_ICU
- if (offset > src->strlen) {
- /* XXX: Throw in this case? */
- return offset + count;
- }
+ UErrorCode err;
+ int dest_len, src_len;
+ STRING *res;
- STRING_ITER_INIT(interp, &iter);
+ if (src->bufused == src->strlen
+ && src->encoding == Parrot_utf8_encoding_ptr) {
+ return Parrot_ascii_encoding_ptr->titlecase(interp, src);
+ }
- if (offset)
- STRING_ITER_SET_POSITION(interp, src, &iter, offset);
+ /* to_encoding will allocate new string */
+ res = Parrot_utf16_encoding_ptr->to_encoding(interp, src);
- end = src->strlen < end ? src->strlen : end;
+ /*
+U_CAPI int32_t U_EXPORT2
+u_strToTitle(UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UBreakIterator *titleIter,
+ const char *locale,
+ UErrorCode *pErrorCode);
+ */
- if (flags == enum_cclass_any)
- return end;
+ err = U_ZERO_ERROR;
+ src_len = res->bufused / sizeof (UChar);
+ dest_len = u_strToTitle((UChar *)res->strstart, src_len,
+ (UChar *)res->strstart, src_len,
+ NULL, /* default titleiter */
+ NULL, /* locale = default */
+ &err);
+ res->bufused = dest_len * sizeof (UChar);
- while (iter.charpos < end) {
- codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- if (codepoint >= 256) {
- for (bit = enum_cclass_uppercase;
- bit <= enum_cclass_word ; bit <<= 1) {
- if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
- return iter.charpos - 1;
- }
- }
- else {
- if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
- return iter.charpos - 1;
- }
+ if (!U_SUCCESS(err)) {
+ err = U_ZERO_ERROR;
+ Parrot_gc_reallocate_string_storage(interp, res, res->bufused);
+ dest_len = u_strToTitle((UChar *)res->strstart, dest_len,
+ (UChar *)res->strstart, src_len,
+ NULL, NULL,
+ &err);
+ PARROT_ASSERT(U_SUCCESS(err));
}
- return end;
+ /* downgrade if possible */
+ if (dest_len == (int)res->strlen)
+ res->encoding = Parrot_ucs2_encoding_ptr;
+
+ return res;
+
+#else
+ UNUSED(src);
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+#endif
}
/*
-=item C<static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)>
+=item C<STRING* unicode_upcase_first(PARROT_INTERP, const STRING *src)>
-Returns a one-codepoint string for the given codepoint.
+Converts the first grapheme in the STRING C<src> to uppercase, if the
+grapheme supports it. Not implemented.
=cut
*/
PARROT_CANNOT_RETURN_NULL
-static STRING *
-string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
+STRING*
+unicode_upcase_first(PARROT_INTERP, SHIM(const STRING *src))
{
- ASSERT_ARGS(string_from_codepoint)
- String_iter iter;
- STRING * const dest = string_make(interp, "", 1, "unicode", 0);
-
- dest->strlen = 1;
-
- STRING_ITER_INIT(interp, &iter);
- STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, codepoint);
- dest->bufused = iter.bytepos;
-
- return dest;
+ ASSERT_ARGS(unicode_upcase_first)
+ /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
+ UNIMPL;
}
/*
-=item C<static size_t compute_hash(PARROT_INTERP, const STRING *src, size_t
-seed)>
+=item C<STRING* unicode_downcase_first(PARROT_INTERP, const STRING *src)>
-Computes the hash of the given STRING C<src> with starting seed value C<seed>.
+Converts the first grapheme in the STRING C<src> to lower-case, if
+the grapheme supports it. Not implemented.
=cut
*/
-static size_t
-compute_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed)
+PARROT_CANNOT_RETURN_NULL
+STRING*
+unicode_downcase_first(PARROT_INTERP, SHIM(const STRING *src))
{
- ASSERT_ARGS(compute_hash)
- String_iter iter;
- size_t hashval = seed;
-
- STRING_ITER_INIT(interp, &iter);
-
- while (iter.charpos < src->strlen) {
- const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter);
- hashval += hashval << 5;
- hashval += c;
- }
-
- return hashval;
+ ASSERT_ARGS(unicode_downcase_first)
+ /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
+ UNIMPL;
}
/*
-=item C<void Parrot_charset_unicode_init(PARROT_INTERP)>
+=item C<STRING* unicode_titlecase_first(PARROT_INTERP, const STRING *src)>
-Initializes the Unicode charset by installing all the necessary function
-pointers.
+Converts the first grapheme in STRING C<src> to title case, if the
+string supports it. Not implemented.
=cut
*/
-void
-Parrot_charset_unicode_init(PARROT_INTERP)
+PARROT_CANNOT_RETURN_NULL
+STRING*
+unicode_titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
{
- ASSERT_ARGS(Parrot_charset_unicode_init)
- CHARSET * const return_set = Parrot_new_charset(interp);
- static const CHARSET base_set = {
- "unicode",
- get_graphemes,
- to_charset,
- compose,
- decompose,
- upcase,
- downcase,
- titlecase,
- upcase_first,
- downcase_first,
- titlecase_first,
- compare,
- mixed_cs_index,
- cs_rindex,
- validate,
- is_cclass,
- find_cclass,
- find_not_cclass,
- string_from_codepoint,
- compute_hash,
- NULL
- };
-
- STRUCT_COPY_FROM_STRUCT(return_set, base_set);
-
- /*
- * for now use utf8
- * TODO replace it with a fixed uint_16 or uint_32 encoding
- * XXX if this is changed, modify string_make so it
- * still takes "utf8" when fed "unicode" as charset!
- */
- return_set->preferred_encoding = Parrot_utf8_encoding_ptr;
- Parrot_register_charset(interp, "unicode", return_set);
-
- return;
+ ASSERT_ARGS(unicode_titlecase_first)
+ /* TODO: https://trac.parrot.org/parrot/wiki/StringsTasklist Implement this. */
+ UNIMPL;
}
@@ -1073,3 +1385,4 @@
* End:
* vim: expandtab shiftwidth=4:
*/
+
Added: trunk/src/string/encoding/shared.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ trunk/src/string/encoding/shared.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -0,0 +1,369 @@
+/* shared.h
+ * Copyright (C) 2004-2007, Parrot Foundation.
+ * SVN Info
+ * $Id$
+ * Overview:
+ * This is the header for the functions shared between the string encodings
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#ifndef PARROT_ENCODING_SHARED_H_GUARD
+#define PARROT_ENCODING_SHARED_H_GUARD
+
+/* HEADERIZER BEGIN: src/string/encoding/shared.c */
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_compare(PARROT_INTERP,
+ ARGIN(const STRING *lhs),
+ ARGIN(const STRING *rhs))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* encoding_decompose(PARROT_INTERP, SHIM(const STRING *src))
+ __attribute__nonnull__(1);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_equal(PARROT_INTERP,
+ ARGIN(const STRING *lhs),
+ ARGIN(const STRING *rhs))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_find_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_find_not_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * encoding_get_graphemes(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+size_t encoding_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_index(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ ARGIN(const STRING *search),
+ UINTVAL offs)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_is_cclass(PARROT_INTERP,
+ INTVAL flags,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL encoding_rindex(PARROT_INTERP,
+ SHIM(const STRING *src),
+ SHIM(const STRING *search_string),
+ NULLOK(UINTVAL offset))
+ __attribute__nonnull__(1);
+
+UINTVAL encoding_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+STRING * encoding_substr(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL fixed8_compare(PARROT_INTERP,
+ ARGIN(const STRING *lhs),
+ ARGIN(const STRING *rhs))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* fixed8_compose(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL fixed8_equal(PARROT_INTERP,
+ ARGIN(const STRING *lhs),
+ ARGIN(const STRING *rhs))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+size_t fixed8_hash(SHIM_INTERP, ARGIN(const STRING *s), size_t hashval)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL fixed8_index(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ ARGIN(const STRING *search_string),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+UINTVAL fixed8_iter_get(PARROT_INTERP,
+ ARGIN(const STRING *str),
+ ARGIN(const String_iter *iter),
+ INTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP,
+ ARGIN(const STRING *str),
+ ARGMOD(String_iter *iter))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3)
+ FUNC_MODIFIES(*iter);
+
+void fixed8_iter_set_and_advance(PARROT_INTERP,
+ ARGMOD(STRING *str),
+ ARGMOD(String_iter *iter),
+ UINTVAL c)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3)
+ FUNC_MODIFIES(*str)
+ FUNC_MODIFIES(*iter);
+
+void fixed8_iter_set_position(SHIM_INTERP,
+ ARGIN(const STRING *str),
+ ARGMOD(String_iter *iter),
+ UINTVAL pos)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3)
+ FUNC_MODIFIES(*iter);
+
+void fixed8_iter_skip(SHIM_INTERP,
+ ARGIN(const STRING *str),
+ ARGMOD(String_iter *iter),
+ INTVAL skip)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3)
+ FUNC_MODIFIES(*iter);
+
+PARROT_WARN_UNUSED_RESULT
+UINTVAL fixed8_ord(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+INTVAL fixed8_rindex(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ ARGIN(const STRING *search_string),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ __attribute__nonnull__(3);
+
+PARROT_WARN_UNUSED_RESULT
+UINTVAL fixed8_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+STRING * fixed8_substr(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+STRING * unicode_chr(PARROT_INTERP, UINTVAL codepoint)
+ __attribute__nonnull__(1);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_compose(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_downcase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_downcase_first(PARROT_INTERP, SHIM(const STRING *src))
+ __attribute__nonnull__(1);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_titlecase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
+ __attribute__nonnull__(1);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_upcase(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_CANNOT_RETURN_NULL
+STRING* unicode_upcase_first(PARROT_INTERP, SHIM(const STRING *src))
+ __attribute__nonnull__(1);
+
+UINTVAL unicode_validate(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+#define ASSERT_ARGS_encoding_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(lhs) \
+ , PARROT_ASSERT_ARG(rhs))
+#define ASSERT_ARGS_encoding_decompose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_encoding_equal __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(lhs) \
+ , PARROT_ASSERT_ARG(rhs))
+#define ASSERT_ARGS_encoding_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_find_not_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_get_graphemes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src) \
+ , PARROT_ASSERT_ARG(search))
+#define ASSERT_ARGS_encoding_is_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_rindex __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_encoding_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_encoding_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_fixed8_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(lhs) \
+ , PARROT_ASSERT_ARG(rhs))
+#define ASSERT_ARGS_fixed8_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_fixed8_equal __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(lhs) \
+ , PARROT_ASSERT_ARG(rhs))
+#define ASSERT_ARGS_fixed8_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(s))
+#define ASSERT_ARGS_fixed8_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src) \
+ , PARROT_ASSERT_ARG(search_string))
+#define ASSERT_ARGS_fixed8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(str) \
+ , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(str) \
+ , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(str) \
+ , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(str) \
+ , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(str) \
+ , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_ord __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_fixed8_rindex __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src) \
+ , PARROT_ASSERT_ARG(search_string))
+#define ASSERT_ARGS_fixed8_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_fixed8_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_unicode_chr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_unicode_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_unicode_downcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_unicode_downcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_unicode_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_unicode_titlecase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_unicode_upcase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_unicode_upcase_first __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_unicode_validate __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
+/* HEADERIZER END: src/string/encoding/shared.c */
+
+#endif /* PARROT_ENCODING_SHARED_H_GUARD */
+
+/*
+ * Local variables:
+ * c-file-style: "parrot"
+ * End:
+ * vim: expandtab shiftwidth=4:
+ */
Copied and modified: trunk/src/string/encoding/tables.c (from r48832, trunk/src/string/charset/tables.c)
==============================================================================
Copied and modified: trunk/src/string/encoding/tables.h (from r48832, trunk/src/string/charset/tables.h)
==============================================================================
Modified: trunk/src/string/encoding/ucs2.c
==============================================================================
--- trunk/src/string/encoding/ucs2.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/ucs2.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -20,6 +20,7 @@
#include "parrot/parrot.h"
#include "../unicode.h"
+#include "shared.h"
#if !PARROT_HAS_ICU
PARROT_DOES_NOT_RETURN
@@ -36,66 +37,6 @@
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL bytes(SHIM_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL codepoints(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL find_cclass(PARROT_INTERP,
- ARGIN(const STRING *s),
- ARGIN(const INTVAL *typetable),
- INTVAL flags,
- UINTVAL pos,
- UINTVAL end)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-static UINTVAL get_byte(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(UINTVAL offset))
- __attribute__nonnull__(1);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_bytes(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(UINTVAL offset),
- SHIM(UINTVAL count))
- __attribute__nonnull__(1);
-
-static UINTVAL get_codepoint(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_codepoints(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static void set_byte(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(UINTVAL offset),
- SHIM(UINTVAL byte))
- __attribute__nonnull__(1);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
static size_t ucs2_hash(PARROT_INTERP,
ARGIN(const STRING *s),
size_t hashval)
@@ -146,30 +87,32 @@
__attribute__nonnull__(3)
FUNC_MODIFIES(*i);
-#define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(s) \
- , PARROT_ASSERT_ARG(typetable))
-#define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_get_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
+static UINTVAL ucs2_ord(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL ucs2_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static STRING * ucs2_substr(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static STRING * ucs2_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
#define ASSERT_ARGS_ucs2_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(s))
@@ -193,6 +136,18 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_ucs2_ord __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ucs2_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ucs2_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ucs2_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: static */
@@ -207,7 +162,7 @@
/*
-=item C<static STRING * to_encoding(PARROT_INTERP, const STRING *src)>
+=item C<static STRING * ucs2_to_encoding(PARROT_INTERP, const STRING *src)>
Converts the string C<src> to this particular encoding. If C<dest> is
provided, it will contain the result. Otherwise this function operates in
@@ -220,9 +175,9 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
-to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ucs2_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(to_encoding)
+ ASSERT_ARGS(ucs2_to_encoding)
STRING * const result =
Parrot_utf16_encoding_ptr->to_encoding(interp, src);
@@ -236,98 +191,57 @@
/*
-=item C<static UINTVAL get_codepoint(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
+=item C<static UINTVAL ucs2_scan(PARROT_INTERP, const STRING *src)>
-Returns the codepoint in string C<src> at position C<offset>.
+Returns the number of codepoints in string C<src>.
=cut
*/
+PARROT_WARN_UNUSED_RESULT
static UINTVAL
-get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
+ucs2_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(get_codepoint)
+ ASSERT_ARGS(ucs2_scan)
#if PARROT_HAS_ICU
- const UChar * const s = (const UChar*) src->strstart;
UNUSED(interp);
- return s[offset];
+ return src->bufused / sizeof (UChar);
#else
- UNUSED(offset);
UNUSED(src);
no_ICU_lib(interp);
#endif
}
-
-/*
-
-=item C<static UINTVAL find_cclass(PARROT_INTERP, const STRING *s, const INTVAL
-*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
-
-Stub, the charset level handles this for unicode strings.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-find_cclass(PARROT_INTERP, ARGIN(const STRING *s), ARGIN(const INTVAL *typetable),
-INTVAL flags, UINTVAL pos, UINTVAL end)
-{
- UNUSED(s);
- UNUSED(typetable);
- UNUSED(flags);
- UNUSED(pos);
- UNUSED(end);
-
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No find_cclass support in unicode encoding plugins");
-}
-
/*
-=item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
+=item C<static UINTVAL ucs2_ord(PARROT_INTERP, const STRING *src, UINTVAL
offset)>
-Returns the byte in string C<src> at position C<offset>.
+Returns the codepoint in string C<src> at position C<offset>.
=cut
*/
static UINTVAL
-get_byte(PARROT_INTERP, SHIM(const STRING *src), SHIM(UINTVAL offset))
-{
- ASSERT_ARGS(get_byte)
- UNIMPL;
-}
-
-/*
-
-=item C<static void set_byte(PARROT_INTERP, const STRING *src, UINTVAL offset,
-UINTVAL byte)>
-
-Sets, in string C<src> at position C<offset>, the byte C<byte>.
-
-=cut
-
-*/
-
-static void
-set_byte(PARROT_INTERP, SHIM(const STRING *src), SHIM(UINTVAL offset),
- SHIM(UINTVAL byte))
+ucs2_ord(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
{
- ASSERT_ARGS(set_byte)
- UNIMPL;
+ ASSERT_ARGS(ucs2_ord)
+#if PARROT_HAS_ICU
+ const UChar * const s = (const UChar*) src->strstart;
+ UNUSED(interp);
+ return s[offset];
+#else
+ UNUSED(offset);
+ UNUSED(src);
+ no_ICU_lib(interp);
+#endif
}
/*
-=item C<static STRING * get_codepoints(PARROT_INTERP, const STRING *src, UINTVAL
+=item C<static STRING * ucs2_substr(PARROT_INTERP, const STRING *src, UINTVAL
offset, UINTVAL count)>
Returns the codepoints in string C<src> at position C<offset> and length
@@ -340,98 +254,27 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
-get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+ucs2_substr(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
{
- ASSERT_ARGS(get_codepoints)
+ ASSERT_ARGS(ucs2_substr)
STRING * const return_string = Parrot_str_copy(interp, src);
#if PARROT_HAS_ICU
return_string->strstart = (char*)src->strstart + offset * sizeof (UChar);
- return_string->bufused = count * sizeof (UChar);
-#else
- {
- String_iter iter;
- UINTVAL start;
-
- STRING_ITER_INIT(interp, &iter);
- ucs2_iter_set_position(interp, src, &iter, offset);
- start = iter.bytepos;
- return_string->strstart = (char *)return_string->strstart + start;
- ucs2_iter_set_position(interp, src, &iter, offset + count);
- return_string->bufused = iter.bytepos - start;
- }
-#endif
- return_string->strlen = count;
- return_string->hashval = 0;
+ return_string->bufused = count * sizeof (UChar);
+ return_string->strlen = count;
+ return_string->hashval = 0;
return return_string;
-}
-
-/*
-
-=item C<static STRING * get_bytes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the bytes in string C<src> at position C<offset> and length C<count>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_bytes(PARROT_INTERP, SHIM(const STRING *src), SHIM(UINTVAL offset),
- SHIM(UINTVAL count))
-{
- ASSERT_ARGS(get_bytes)
- UNIMPL;
-}
-
-
-/*
-
-=item C<static UINTVAL codepoints(PARROT_INTERP, const STRING *src)>
-
-Returns the number of codepoints in string C<src>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-codepoints(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(codepoints)
-#if PARROT_HAS_ICU
- UNUSED(interp);
- return src->bufused / sizeof (UChar);
#else
UNUSED(src);
+ UNUSED(offset);
+ UNUSED(count);
no_ICU_lib(interp);
#endif
}
/*
-=item C<static UINTVAL bytes(PARROT_INTERP, const STRING *src)>
-
-Returns the number of bytes in string C<src>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-bytes(SHIM_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(bytes)
- return src->bufused;
-}
-
-/*
-
=item C<static UINTVAL ucs2_iter_get(PARROT_INTERP, const STRING *str, const
String_iter *i, INTVAL offset)>
@@ -446,7 +289,7 @@
ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
{
ASSERT_ARGS(ucs2_iter_get)
- return get_codepoint(interp, str, i->charpos + offset);
+ return ucs2_ord(interp, str, i->charpos + offset);
}
/*
@@ -607,46 +450,50 @@
#endif
}
-/*
-
-=item C<void Parrot_encoding_ucs2_init(PARROT_INTERP)>
+static STR_VTABLE Parrot_ucs2_encoding = {
+ 0,
+ "ucs2",
+ NULL,
+ 2, /* Max bytes per codepoint */
+
+ ucs2_to_encoding,
+ unicode_chr,
+
+ encoding_equal,
+ encoding_compare,
+ encoding_index,
+ encoding_rindex,
+ encoding_hash,
+ unicode_validate,
+
+ ucs2_scan,
+ ucs2_ord,
+ ucs2_substr,
+
+ encoding_is_cclass,
+ encoding_find_cclass,
+ encoding_find_not_cclass,
+
+ encoding_get_graphemes,
+ unicode_compose,
+ encoding_decompose,
+
+ unicode_upcase,
+ unicode_downcase,
+ unicode_titlecase,
+ unicode_upcase_first,
+ unicode_downcase_first,
+ unicode_titlecase_first,
+
+ ucs2_iter_get,
+ ucs2_iter_skip,
+ ucs2_iter_get_and_advance,
+ ucs2_iter_set_and_advance,
+ ucs2_iter_set_position
+};
-Initializes the UCS-2 encoding.
+STR_VTABLE *Parrot_ucs2_encoding_ptr = &Parrot_ucs2_encoding;
-=cut
-
-*/
-
-void
-Parrot_encoding_ucs2_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_encoding_ucs2_init)
- ENCODING * const return_encoding = Parrot_new_encoding(interp);
-
- static const ENCODING base_encoding = {
- "ucs2",
- 2, /* Max bytes per codepoint 0 .. 0x10ffff */
- to_encoding,
- get_codepoint,
- get_byte,
- set_byte,
- get_codepoints,
- get_bytes,
- codepoints,
- bytes,
- find_cclass,
- ucs2_hash,
- ucs2_iter_get,
- ucs2_iter_skip,
- ucs2_iter_get_and_advance,
- ucs2_iter_set_and_advance,
- ucs2_iter_set_position
- };
- STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
- Parrot_register_encoding(interp, "ucs2", return_encoding);
-
- return;
-}
/*
Modified: trunk/src/string/encoding/ucs2.h
==============================================================================
--- trunk/src/string/encoding/ucs2.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/ucs2.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -16,11 +16,7 @@
/* HEADERIZER BEGIN: src/string/encoding/ucs2.c */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-void Parrot_encoding_ucs2_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-#define ASSERT_ARGS_Parrot_encoding_ucs2_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding/ucs2.c */
Modified: trunk/src/string/encoding/ucs4.c
==============================================================================
--- trunk/src/string/encoding/ucs4.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/ucs4.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -20,6 +20,7 @@
#include "parrot/parrot.h"
#include "../unicode.h"
+#include "shared.h"
#if !PARROT_HAS_ICU
PARROT_DOES_NOT_RETURN
@@ -36,66 +37,6 @@
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL bytes(SHIM_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL codepoints(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL find_cclass(PARROT_INTERP,
- ARGIN(const STRING *s),
- ARGIN(const INTVAL *typetable),
- INTVAL flags,
- UINTVAL pos,
- UINTVAL end)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-static UINTVAL get_byte(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(UINTVAL offset))
- __attribute__nonnull__(1);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_bytes(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(UINTVAL offset),
- SHIM(UINTVAL count))
- __attribute__nonnull__(1);
-
-static UINTVAL get_codepoint(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_codepoints(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static void set_byte(PARROT_INTERP,
- SHIM(const STRING *src),
- SHIM(UINTVAL offset),
- SHIM(UINTVAL byte))
- __attribute__nonnull__(1);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
static size_t ucs4_hash(PARROT_INTERP,
ARGIN(const STRING *s),
size_t hashval)
@@ -146,30 +87,32 @@
__attribute__nonnull__(3)
FUNC_MODIFIES(*i);
-#define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(s) \
- , PARROT_ASSERT_ARG(typetable))
-#define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_get_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
+static UINTVAL ucs4_ord(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL ucs4_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static STRING * ucs4_substr(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static STRING * ucs4_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
#define ASSERT_ARGS_ucs4_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(s))
@@ -193,6 +136,18 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_ucs4_ord __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ucs4_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ucs4_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_ucs4_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: static */
@@ -202,9 +157,10 @@
# include <unicode/ustring.h>
#endif
+
/*
-=item C<static STRING * to_encoding(PARROT_INTERP, const STRING *src)>
+=item C<static STRING * ucs4_to_encoding(PARROT_INTERP, const STRING *src)>
Converts the string C<src> to this particular encoding.
@@ -215,9 +171,9 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
-to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ucs4_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(to_encoding)
+ ASSERT_ARGS(ucs4_to_encoding)
#if PARROT_HAS_ICU
if (src->encoding == Parrot_ucs4_encoding_ptr) {
return Parrot_str_clone(interp, src);
@@ -225,11 +181,12 @@
else {
UINTVAL len = Parrot_str_length(interp, src);
STRING *res = Parrot_str_new_init(interp, NULL, len * sizeof (UChar32),
- Parrot_ucs4_encoding_ptr, Parrot_unicode_charset_ptr, 0);
+ Parrot_ucs4_encoding_ptr, 0);
UChar32 *buf = (UChar32 *) res->strstart;
UINTVAL offs;
+ /* TODO: use an iterator */
for (offs = 0; offs < len; offs++){
- buf[offs] = src->encoding->get_codepoint(interp, src, offs);
+ buf[offs] = STRING_ord(interp, src, offs);
};
res->strlen = len;
res->bufused = len * sizeof (UChar32);
@@ -243,27 +200,26 @@
}
+
/*
-=item C<static UINTVAL get_codepoint(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
+=item C<static UINTVAL ucs4_scan(PARROT_INTERP, const STRING *src)>
-Returns the codepoint in string C<src> at position C<offset>.
+Returns the number of codepoints in string C<src>.
=cut
*/
+PARROT_WARN_UNUSED_RESULT
static UINTVAL
-get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
+ucs4_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(get_codepoint)
+ ASSERT_ARGS(ucs4_scan)
#if PARROT_HAS_ICU
- const UChar32 * const s = (const UChar32*) src->strstart;
UNUSED(interp);
- return s[offset];
+ return src->bufused / sizeof (UChar32);
#else
- UNUSED(offset);
UNUSED(src);
no_ICU_lib(interp);
#endif
@@ -272,77 +228,34 @@
/*
-=item C<static UINTVAL find_cclass(PARROT_INTERP, const STRING *s, const INTVAL
-*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
-
-Stub, the charset level handles this for unicode strings.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-find_cclass(PARROT_INTERP, ARGIN(const STRING *s), ARGIN(const INTVAL *typetable),
-INTVAL flags, UINTVAL pos, UINTVAL end)
-{
- ASSERT_ARGS(find_cclass)
-
- UNUSED(s);
- UNUSED(typetable);
- UNUSED(flags);
- UNUSED(pos);
- UNUSED(end);
-
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No find_cclass support in unicode encoding plugins");
-}
-
-/*
-
-=item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
+=item C<static UINTVAL ucs4_ord(PARROT_INTERP, const STRING *src, UINTVAL
offset)>
-Returns the byte in string C<src> at position C<offset>.
+Returns the codepoint in string C<src> at position C<offset>.
=cut
*/
static UINTVAL
-get_byte(PARROT_INTERP, SHIM(const STRING *src), SHIM(UINTVAL offset))
+ucs4_ord(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
{
- ASSERT_ARGS(get_byte)
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No get_byte for UCS-4");
+ ASSERT_ARGS(ucs4_ord)
+#if PARROT_HAS_ICU
+ const UChar32 * const s = (const UChar32*) src->strstart;
+ UNUSED(interp);
+ return s[offset];
+#else
+ UNUSED(offset);
+ UNUSED(src);
+ no_ICU_lib(interp);
+#endif
}
-/*
-
-=item C<static void set_byte(PARROT_INTERP, const STRING *src, UINTVAL offset,
-UINTVAL byte)>
-
-Sets, in string C<src> at position C<offset>, the byte C<byte>.
-
-=cut
-
-*/
-
-static void
-set_byte(PARROT_INTERP, SHIM(const STRING *src), SHIM(UINTVAL offset),
- SHIM(UINTVAL byte))
-{
- ASSERT_ARGS(set_byte)
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No set_byte for UCS-4");
-}
/*
-=item C<static STRING * get_codepoints(PARROT_INTERP, const STRING *src, UINTVAL
+=item C<static STRING * ucs4_substr(PARROT_INTERP, const STRING *src, UINTVAL
offset, UINTVAL count)>
Returns the C<count> codepoints stored at position C<offset> in string
@@ -355,12 +268,12 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
-get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+ucs4_substr(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
{
- ASSERT_ARGS(get_codepoints)
+ ASSERT_ARGS(ucs4_substr)
#if PARROT_HAS_ICU
return Parrot_str_new_init(interp, (char*)src->strstart + offset * sizeof (UChar32),
- count * sizeof (UChar32), src->encoding, src->charset, 0);
+ count * sizeof (UChar32), src->encoding, 0);
#else
UNUSED(src);
UNUSED(offset);
@@ -369,71 +282,6 @@
#endif
}
-/*
-
-=item C<static STRING * get_bytes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the bytes in string C<src> at position C<offset> and length C<count>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_bytes(PARROT_INTERP, SHIM(const STRING *src), SHIM(UINTVAL offset),
- SHIM(UINTVAL count))
-{
- ASSERT_ARGS(get_bytes)
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No get_bytes for UCS-4");
-}
-
-
-/*
-
-=item C<static UINTVAL codepoints(PARROT_INTERP, const STRING *src)>
-
-Returns the number of codepoints in string C<src>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-codepoints(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(codepoints)
-#if PARROT_HAS_ICU
- UNUSED(interp);
- return src->bufused / sizeof (UChar32);
-#else
- UNUSED(src);
- no_ICU_lib(interp);
-#endif
-}
-
-/*
-
-=item C<static UINTVAL bytes(PARROT_INTERP, const STRING *src)>
-
-Returns the number of bytes in string C<src>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-bytes(SHIM_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(bytes)
- return src->bufused;
-}
/*
@@ -451,9 +299,10 @@
ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset)
{
ASSERT_ARGS(ucs4_iter_get)
- return get_codepoint(interp, str, i->charpos + offset);
+ return ucs4_ord(interp, str, i->charpos + offset);
}
+
/*
=item C<static void ucs4_iter_skip(PARROT_INTERP, const STRING *str, String_iter
@@ -482,6 +331,7 @@
#endif
}
+
/*
=item C<static UINTVAL ucs4_iter_get_and_advance(PARROT_INTERP, const STRING
@@ -512,6 +362,7 @@
#endif
}
+
/*
=item C<static void ucs4_iter_set_and_advance(PARROT_INTERP, STRING *str,
@@ -542,6 +393,7 @@
#endif
}
+
/*
=item C<static void ucs4_iter_set_position(PARROT_INTERP, const STRING *str,
@@ -570,7 +422,7 @@
#endif
}
-#if PARROT_HAS_ICU
+
/*
=item C<static size_t ucs4_hash(PARROT_INTERP, const STRING *s, size_t hashval)>
@@ -596,52 +448,52 @@
return hashval;
}
-#endif
-/*
-
-=item C<void Parrot_encoding_ucs4_init(PARROT_INTERP)>
-
-Initializes the UCS-4 encoding.
-
-=cut
-
-*/
-void
-Parrot_encoding_ucs4_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_encoding_ucs4_init)
- ENCODING * const return_encoding = Parrot_new_encoding(interp);
+static STR_VTABLE Parrot_ucs4_encoding = {
+ 0,
+ "ucs4",
+ NULL,
+ 4, /* Max bytes per codepoint */
+
+ ucs4_to_encoding,
+ unicode_chr,
+
+ encoding_equal,
+ encoding_compare,
+ encoding_index,
+ encoding_rindex,
+ ucs4_hash,
+ unicode_validate,
+
+ ucs4_scan,
+ ucs4_ord,
+ ucs4_substr,
+
+ encoding_is_cclass,
+ encoding_find_cclass,
+ encoding_find_not_cclass,
+
+ encoding_get_graphemes,
+ unicode_compose,
+ encoding_decompose,
+
+ unicode_upcase,
+ unicode_downcase,
+ unicode_titlecase,
+ unicode_upcase_first,
+ unicode_downcase_first,
+ unicode_titlecase_first,
+
+ ucs4_iter_get,
+ ucs4_iter_skip,
+ ucs4_iter_get_and_advance,
+ ucs4_iter_set_and_advance,
+ ucs4_iter_set_position
+};
- static const ENCODING base_encoding = {
- "ucs4",
- 4, /* Max bytes per codepoint */
- to_encoding,
- get_codepoint,
- get_byte,
- set_byte,
- get_codepoints,
- get_bytes,
- codepoints,
- bytes,
- find_cclass,
-#if PARROT_HAS_ICU
- ucs4_hash,
-#else
- NULL,
-#endif
- ucs4_iter_get,
- ucs4_iter_skip,
- ucs4_iter_get_and_advance,
- ucs4_iter_set_and_advance,
- ucs4_iter_set_position
- };
- STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
- Parrot_register_encoding(interp, "ucs4", return_encoding);
+STR_VTABLE *Parrot_ucs4_encoding_ptr = &Parrot_ucs4_encoding;
- return;
-}
/*
Modified: trunk/src/string/encoding/ucs4.h
==============================================================================
--- trunk/src/string/encoding/ucs4.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/ucs4.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -9,11 +9,7 @@
/* HEADERIZER BEGIN: src/string/encoding/ucs4.c */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-void Parrot_encoding_ucs4_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-#define ASSERT_ARGS_Parrot_encoding_ucs4_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding/ucs4.c */
Modified: trunk/src/string/encoding/utf16.c
==============================================================================
--- trunk/src/string/encoding/utf16.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/utf16.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -20,74 +20,13 @@
#include "parrot/parrot.h"
#include "../unicode.h"
+#include "shared.h"
/* HEADERIZER HFILE: src/string/encoding/utf16.h */
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL bytes(SHIM_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL codepoints(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL find_cclass(PARROT_INTERP,
- ARGIN(const STRING *s),
- ARGIN(const INTVAL *typetable),
- INTVAL flags,
- UINTVAL pos,
- UINTVAL end)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3);
-
-static UINTVAL get_byte(SHIM_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_bytes(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static UINTVAL get_codepoint(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_codepoints(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static void set_byte(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL byte)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING * to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
static UINTVAL utf16_iter_get(PARROT_INTERP,
ARGIN(const STRING *str),
ARGIN(const String_iter *i),
@@ -133,32 +72,32 @@
__attribute__nonnull__(3)
FUNC_MODIFIES(*i);
-#define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(s) \
- , PARROT_ASSERT_ARG(typetable))
-#define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
+static UINTVAL utf16_ord(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL utf16_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static STRING * utf16_substr(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset,
+ UINTVAL count)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+PARROT_WARN_UNUSED_RESULT
+PARROT_CANNOT_RETURN_NULL
+static STRING * utf16_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
#define ASSERT_ARGS_utf16_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(str) \
@@ -179,6 +118,18 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_utf16_ord __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf16_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf16_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf16_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: static */
@@ -195,7 +146,7 @@
/*
-=item C<static STRING * to_encoding(PARROT_INTERP, const STRING *src)>
+=item C<static STRING * utf16_to_encoding(PARROT_INTERP, const STRING *src)>
Converts the string C<src> to this particular encoding. If C<dest> is
provided, it will contain the result. Otherwise this function operates in
@@ -209,9 +160,9 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
-to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+utf16_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(to_encoding)
+ ASSERT_ARGS(utf16_to_encoding)
#if PARROT_HAS_ICU
UErrorCode err;
int dest_len;
@@ -231,7 +182,6 @@
*/
src_len = src->strlen;
if (!src_len) {
- result->charset = Parrot_unicode_charset_ptr;
result->encoding = Parrot_ucs2_encoding_ptr;
result->strlen = result->bufused = 0;
return result;
@@ -240,8 +190,8 @@
Parrot_gc_allocate_string_storage(interp, result, sizeof (UChar) * src_len);
p = (UChar *)result->strstart;
- if (src->charset == Parrot_iso_8859_1_charset_ptr ||
- src->charset == Parrot_ascii_charset_ptr) {
+ if (src->encoding == Parrot_latin1_encoding_ptr ||
+ src->encoding == Parrot_ascii_encoding_ptr) {
for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len) {
p[dest_len] = (UChar)((unsigned char*)src->strstart)[dest_len];
}
@@ -264,7 +214,6 @@
}
}
result->bufused = dest_len * sizeof (UChar);
- result->charset = Parrot_unicode_charset_ptr;
result->encoding = Parrot_utf16_encoding_ptr;
result->strlen = src_len;
@@ -280,118 +229,76 @@
/*
-=item C<static UINTVAL get_codepoint(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
+=item C<static UINTVAL utf16_scan(PARROT_INTERP, const STRING *src)>
-Returns the codepoint in string C<src> at position C<offset>.
+Returns the number of codepoints in string C<src> by scanning the whole
+string.
=cut
*/
+PARROT_WARN_UNUSED_RESULT
static UINTVAL
-get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
+utf16_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(get_codepoint)
+ ASSERT_ARGS(utf16_scan)
#if PARROT_HAS_ICU
const UChar * const s = (UChar*) src->strstart;
- UINTVAL c, pos;
- UNUSED(interp);
-
- pos = 0;
- U16_FWD_N_UNSAFE(s, pos, offset);
- U16_GET_UNSAFE(s, pos, c);
- return c;
+ UINTVAL pos = 0, charpos = 0;
+ /*
+ * this is used to initially calculate src->strlen,
+ * therefore we must scan the whole string
+ */
+ while (pos * sizeof (UChar) < src->bufused) {
+ U16_FWD_1_UNSAFE(s, pos);
+ ++charpos;
+ }
+ return charpos;
#else
UNUSED(src);
- UNUSED(offset);
Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
"no ICU lib loaded");
#endif
}
-
/*
-=item C<static UINTVAL find_cclass(PARROT_INTERP, const STRING *s, const INTVAL
-*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
-
-Stub, the charset level handles this for unicode strings.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-find_cclass(PARROT_INTERP, ARGIN(const STRING *s), ARGIN(const INTVAL *typetable),
-INTVAL flags, UINTVAL pos, UINTVAL end)
-{
- UNUSED(s);
- UNUSED(typetable);
- UNUSED(flags);
- UNUSED(pos);
- UNUSED(end);
-
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No find_cclass support in unicode encoding plugins");
-}
-
-/*
-
-=item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
+=item C<static UINTVAL utf16_ord(PARROT_INTERP, const STRING *src, UINTVAL
offset)>
-Returns the byte in string C<src> at position C<offset>.
+Returns the codepoint in string C<src> at position C<offset>.
=cut
*/
static UINTVAL
-get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(get_byte)
- const unsigned char * const contents = (unsigned char *)src->strstart;
- if (offset >= src->bufused) {
-/* Parrot_ex_throw_from_c_args(interp, NULL, 0,
- "get_byte past the end of the buffer (%i of %i)",
- offset, src->bufused); */
- return 0;
- }
- return contents[offset];
-}
-
-/*
-
-=item C<static void set_byte(PARROT_INTERP, const STRING *src, UINTVAL offset,
-UINTVAL byte)>
-
-Sets, in string C<src> at position C<offset>, the byte C<byte>.
-
-=cut
-
-*/
-
-static void
-set_byte(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL byte)
+utf16_ord(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
{
- ASSERT_ARGS(set_byte)
- unsigned char *contents;
+ ASSERT_ARGS(utf16_ord)
+#if PARROT_HAS_ICU
+ const UChar * const s = (UChar*) src->strstart;
+ UINTVAL c, pos;
+ UNUSED(interp);
- if (offset >= src->bufused)
- Parrot_ex_throw_from_c_args(interp, NULL, 0,
- "set_byte past the end of the buffer");
+ pos = 0;
+ U16_FWD_N_UNSAFE(s, pos, offset);
+ U16_GET_UNSAFE(s, pos, c);
+ return c;
+#else
+ UNUSED(src);
+ UNUSED(offset);
- contents = (unsigned char *)src->strstart;
- contents[offset] = (unsigned char)byte;
+ Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
+ "no ICU lib loaded");
+#endif
}
/*
-=item C<static STRING * get_codepoints(PARROT_INTERP, const STRING *src, UINTVAL
+=item C<static STRING * utf16_substr(PARROT_INTERP, const STRING *src, UINTVAL
offset, UINTVAL count)>
Returns the codepoints in string C<src> at position C<offset> and length
@@ -404,9 +311,9 @@
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
-get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
+utf16_substr(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
{
- ASSERT_ARGS(get_codepoints)
+ ASSERT_ARGS(utf16_substr)
#if PARROT_HAS_ICU
UINTVAL pos = 0, start;
const UChar * const s = (UChar*) src->strstart;
@@ -430,84 +337,6 @@
#endif
}
-
-/*
-
-=item C<static STRING * get_bytes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the bytes in string C<src> at position C<offset> and length C<count>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_bytes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(get_bytes)
- UNUSED(interp);
- UNUSED(src);
- UNUSED(offset)
- UNUSED(count);
- UNIMPL;
-}
-
-/*
-
-=item C<static UINTVAL codepoints(PARROT_INTERP, const STRING *src)>
-
-Returns the number of codepoints in string C<src>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-codepoints(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(codepoints)
-#if PARROT_HAS_ICU
- const UChar * const s = (UChar*) src->strstart;
- UINTVAL pos = 0, charpos = 0;
- /*
- * this is used to initially calculate src->strlen,
- * therefore we must scan the whole string
- */
- while (pos * sizeof (UChar) < src->bufused) {
- U16_FWD_1_UNSAFE(s, pos);
- ++charpos;
- }
- return charpos;
-#else
- UNUSED(src);
-
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
-}
-
-/*
-
-=item C<static UINTVAL bytes(PARROT_INTERP, const STRING *src)>
-
-Returns the number of bytes in string C<src>.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-bytes(SHIM_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(bytes)
- return src->bufused;
-}
-
/*
=item C<static UINTVAL utf16_iter_get(PARROT_INTERP, const STRING *str, const
@@ -691,46 +520,51 @@
#endif
}
-/*
-
-=item C<void Parrot_encoding_utf16_init(PARROT_INTERP)>
-Initializes the UTF-16 encoding.
+static STR_VTABLE Parrot_utf16_encoding = {
+ 0,
+ "utf16",
+ NULL,
+ 4, /* Max bytes per codepoint */
+
+ utf16_to_encoding,
+ unicode_chr,
+
+ encoding_equal,
+ encoding_compare,
+ encoding_index,
+ encoding_rindex,
+ encoding_hash,
+ unicode_validate,
+
+ utf16_scan,
+ utf16_ord,
+ utf16_substr,
+
+ encoding_is_cclass,
+ encoding_find_cclass,
+ encoding_find_not_cclass,
+
+ encoding_get_graphemes,
+ unicode_compose,
+ encoding_decompose,
+
+ unicode_upcase,
+ unicode_downcase,
+ unicode_titlecase,
+ unicode_upcase_first,
+ unicode_downcase_first,
+ unicode_titlecase_first,
+
+ utf16_iter_get,
+ utf16_iter_skip,
+ utf16_iter_get_and_advance,
+ utf16_iter_set_and_advance,
+ utf16_iter_set_position
+};
-=cut
-
-*/
+STR_VTABLE *Parrot_utf16_encoding_ptr = &Parrot_utf16_encoding;
-void
-Parrot_encoding_utf16_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_encoding_utf16_init)
- ENCODING * const return_encoding = Parrot_new_encoding(interp);
-
- static const ENCODING base_encoding = {
- "utf16",
- 4, /* Max bytes per codepoint 0 .. 0x10ffff */
- to_encoding,
- get_codepoint,
- get_byte,
- set_byte,
- get_codepoints,
- get_bytes,
- codepoints,
- bytes,
- find_cclass,
- NULL,
- utf16_iter_get,
- utf16_iter_skip,
- utf16_iter_get_and_advance,
- utf16_iter_set_and_advance,
- utf16_iter_set_position
- };
- STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
- Parrot_register_encoding(interp, "utf16", return_encoding);
-
- return;
-}
/*
Modified: trunk/src/string/encoding/utf16.h
==============================================================================
--- trunk/src/string/encoding/utf16.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/utf16.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -16,11 +16,7 @@
/* HEADERIZER BEGIN: src/string/encoding/utf16.c */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-void Parrot_encoding_utf16_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-#define ASSERT_ARGS_Parrot_encoding_utf16_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding/utf16.c */
Modified: trunk/src/string/encoding/utf8.c
==============================================================================
--- trunk/src/string/encoding/utf8.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/utf8.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -21,74 +21,13 @@
#include "parrot/parrot.h"
#include "../unicode.h"
#include "utf8.h"
+#include "shared.h"
/* HEADERIZER HFILE: src/string/encoding/utf8.h */
/* HEADERIZER BEGIN: static */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-PARROT_PURE_FUNCTION
-static UINTVAL bytes(SHIM_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(2);
-
-static UINTVAL codepoints(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL find_cclass(PARROT_INTERP,
- SHIM(const STRING *s),
- SHIM(const INTVAL *typetable),
- SHIM(INTVAL flags),
- SHIM(UINTVAL pos),
- SHIM(UINTVAL end))
- __attribute__nonnull__(1);
-
-static UINTVAL get_byte(SHIM_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_bytes(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static UINTVAL get_codepoint(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CANNOT_RETURN_NULL
-static STRING * get_codepoints(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL count)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static void set_byte(PARROT_INTERP,
- ARGIN(const STRING *src),
- UINTVAL offset,
- UINTVAL byte)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-PARROT_CAN_RETURN_NULL
-static STRING * to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
-static UINTVAL utf8_characters(PARROT_INTERP,
- ARGIN(const utf8_t *ptr),
- UINTVAL byte_len)
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-
static UINTVAL utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
__attribute__nonnull__(1)
__attribute__nonnull__(2);
@@ -140,6 +79,20 @@
__attribute__nonnull__(3)
FUNC_MODIFIES(*i);
+static UINTVAL utf8_ord(PARROT_INTERP,
+ ARGIN(const STRING *src),
+ UINTVAL offset)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+static UINTVAL utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
+static UINTVAL utf8_scan2(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static const void * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n)
@@ -149,33 +102,11 @@
static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n)
__attribute__nonnull__(1);
-#define ASSERT_ARGS_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_find_cclass __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_get_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoint __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_get_codepoints __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src))
-#define ASSERT_ARGS_utf8_characters __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(ptr))
+PARROT_CAN_RETURN_NULL
+static STRING * utf8_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+
#define ASSERT_ARGS_utf8_decode __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(ptr))
@@ -200,10 +131,22 @@
#define ASSERT_ARGS_utf8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(str) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_utf8_ord __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf8_scan __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
+#define ASSERT_ARGS_utf8_scan2 __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_utf8_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(ptr))
#define ASSERT_ARGS_utf8_skip_forward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(ptr))
+#define ASSERT_ARGS_utf8_to_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(src))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: static */
@@ -229,25 +172,96 @@
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6 /* cjk etc. */
};
+
/*
-=item C<static UINTVAL utf8_characters(PARROT_INTERP, const utf8_t *ptr, UINTVAL
-byte_len)>
+=item C<static STRING * utf8_to_encoding(PARROT_INTERP, const STRING *src)>
-Returns the number of characters in the C<byte_len> bytes from C<*ptr>.
+Converts the string C<src> to this particular encoding and returns the
+result. If C<src> is already UTF-8 encoded, a clone of it is returned.
+C<src> itself is not modified.
-XXX This function is unused.
+=cut
+
+*/
+
+PARROT_CAN_RETURN_NULL
+static STRING *
+utf8_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(utf8_to_encoding)
+ STRING *result;
+ const STR_VTABLE *src_encoding = src->encoding;
+ UINTVAL dest_len, dest_pos, src_len;
+ unsigned char *p;
+
+ if (src_encoding == Parrot_utf8_encoding_ptr)
+ return Parrot_str_clone(interp, src);
+
+ src_len = src->strlen;
+ result = Parrot_gc_new_string_header(interp, 0);
+ result->encoding = Parrot_utf8_encoding_ptr;
+ result->strlen = src_len;
+
+ if (!src_len)
+ return result;
+
+ Parrot_gc_allocate_string_storage(interp, result, src_len);
+ p = (unsigned char *)result->strstart;
+
+ if (src_encoding == Parrot_ascii_encoding_ptr) {
+ for (dest_len = 0; dest_len < src_len; ++dest_len) {
+ p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
+ }
+ result->bufused = dest_len;
+ }
+ else {
+ String_iter src_iter;
+ STRING_ITER_INIT(interp, &src_iter);
+ dest_len = src_len;
+ dest_pos = 0;
+ while (src_iter.charpos < src_len) {
+ const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter);
+ unsigned char *new_pos;
+ unsigned char *pos;
+
+ if (dest_len - dest_pos < 6) {
+ UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5);
+ if (need < 16)
+ need = 16;
+ dest_len += need;
+ result->bufused = dest_pos;
+ Parrot_gc_reallocate_string_storage(interp, result, dest_len);
+ p = (unsigned char *)result->strstart;
+ }
+
+ pos = p + dest_pos;
+ new_pos = (unsigned char *)utf8_encode(interp, pos, c);
+ dest_pos += (new_pos - pos);
+ }
+ result->bufused = dest_pos;
+ }
+
+ return result;
+}
+
+
+/*
+
+=item C<static UINTVAL utf8_scan(PARROT_INTERP, const STRING *src)>
+
+Returns the number of characters in string C<src> by scanning the string.
=cut
*/
static UINTVAL
-utf8_characters(PARROT_INTERP, ARGIN(const utf8_t *ptr), UINTVAL byte_len)
+utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
- ASSERT_ARGS(utf8_characters)
- const utf8_t *u8ptr = ptr;
- const utf8_t *u8end = u8ptr + byte_len;
+ ASSERT_ARGS(utf8_scan)
+ const utf8_t *u8ptr = (const utf8_t *)src->strstart;
+ const utf8_t *u8end = (const utf8_t *)(src->strstart + src->bufused);
UINTVAL characters = 0;
while (u8ptr < u8end) {
@@ -262,6 +276,53 @@
return characters;
}
+
+/*
+
+=item C<static UINTVAL utf8_scan2(PARROT_INTERP, const STRING *src)>
+
+Returns the number of codepoints in string C<src>.
+
+=cut
+
+*/
+
+static UINTVAL
+utf8_scan2(PARROT_INTERP, ARGIN(const STRING *src))
+{
+ ASSERT_ARGS(utf8_scan2)
+ String_iter iter;
+ /*
+ * this is used to initially calculate src->strlen,
+ * therefore we must scan the whole string
+ */
+ STRING_ITER_INIT(interp, &iter);
+ while (iter.bytepos < src->bufused)
+ utf8_iter_get_and_advance(interp, src, &iter);
+ return iter.charpos;
+}
+
+
+/*
+
+=item C<static UINTVAL utf8_ord(PARROT_INTERP, const STRING *src, UINTVAL
+offset)>
+
+Returns the codepoint in string C<src> at position C<offset>.
+
+=cut
+
+*/
+
+static UINTVAL
+utf8_ord(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
+{
+ ASSERT_ARGS(utf8_ord)
+ const utf8_t * const start = (const utf8_t *)utf8_skip_forward(src->strstart, offset);
+ return utf8_decode(interp, start);
+}
+
+
/*
=item C<static UINTVAL utf8_decode(PARROT_INTERP, const utf8_t *ptr)>
@@ -306,6 +367,7 @@
return c;
}
+
/*
=item C<static void * utf8_encode(PARROT_INTERP, void *ptr, UINTVAL c)>
@@ -343,6 +405,7 @@
return (utf8_t *)ptr + len;
}
+
/*
=item C<static const void * utf8_skip_forward(const void *ptr, UINTVAL n)>
@@ -367,6 +430,7 @@
return u8ptr;
}
+
/*
=item C<static const void * utf8_skip_backward(const void *ptr, UINTVAL n)>
@@ -396,17 +460,6 @@
return u8ptr;
}
-/*
-
-=back
-
-=head2 Iterator Functions
-
-=over 4
-
-=cut
-
-*/
/*
@@ -436,6 +489,7 @@
return utf8_decode(interp, u8ptr);
}
+
/*
=item C<static void utf8_iter_skip(PARROT_INTERP, const STRING *str, String_iter
@@ -465,6 +519,7 @@
i->bytepos = (const char *)u8ptr - (const char *)str->strstart;
}
+
/*
=item C<static UINTVAL utf8_iter_get_and_advance(PARROT_INTERP, const STRING
@@ -515,6 +570,7 @@
return c;
}
+
/*
=item C<static void utf8_iter_set_and_advance(PARROT_INTERP, STRING *str,
@@ -541,6 +597,7 @@
i->charpos++;
}
+
/*
=item C<static void utf8_iter_set_position(PARROT_INTERP, const STRING *str,
@@ -597,326 +654,50 @@
}
-/*
-
-=item C<static STRING * to_encoding(PARROT_INTERP, const STRING *src)>
-
-Converts the string C<src> to this particular encoding. If C<dest> is
-provided, it will contain the result. Otherwise this function operates in
-place.
-
-=cut
-
-*/
-
-PARROT_CAN_RETURN_NULL
-static STRING *
-to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(to_encoding)
- STRING *result;
- const ENCODING *src_encoding;
- UINTVAL dest_len, dest_pos, src_len;
- unsigned char *p;
-
- if (src->encoding == Parrot_utf8_encoding_ptr)
- return Parrot_str_clone(interp, src);
-
- result = Parrot_gc_new_string_header(interp, 0);
- src_len = src->strlen;
-
- /* save source encoding before possibly changing it */
- src_encoding = src->encoding;
- result->charset = Parrot_unicode_charset_ptr;
- result->encoding = Parrot_utf8_encoding_ptr;
- result->strlen = src_len;
-
- if (!src->strlen)
- return result;
-
- Parrot_gc_allocate_string_storage(interp, result, src_len);
- p = (unsigned char *)result->strstart;
-
- if (src->charset == Parrot_ascii_charset_ptr) {
- for (dest_len = 0; dest_len < src_len; ++dest_len) {
- p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
- }
- result->bufused = dest_len;
- }
- else {
- String_iter src_iter;
- STRING_ITER_INIT(interp, &src_iter);
- dest_len = src_len;
- dest_pos = 0;
- while (src_iter.charpos < src_len) {
- const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter);
- unsigned char *new_pos;
- unsigned char *pos;
-
- if (dest_len - dest_pos < 6) {
- UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5);
- if (need < 16)
- need = 16;
- dest_len += need;
- result->bufused = dest_pos;
- Parrot_gc_reallocate_string_storage(interp, result, dest_len);
- p = (unsigned char *)result->strstart;
- }
-
- pos = p + dest_pos;
- new_pos = (unsigned char *)utf8_encode(interp, pos, c);
- dest_pos += (new_pos - pos);
- }
- result->bufused = dest_pos;
- }
-
- return result;
-}
-
-/*
-
-=item C<static UINTVAL get_codepoint(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
-
-Returns the codepoint in string C<src> at position C<offset>.
-
-=cut
-
-*/
-
-static UINTVAL
-get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(get_codepoint)
- const utf8_t * const start = (const utf8_t *)utf8_skip_forward(src->strstart, offset);
- return utf8_decode(interp, start);
-}
-
-
-/*
-
-=item C<static UINTVAL find_cclass(PARROT_INTERP, const STRING *s, const INTVAL
-*typetable, INTVAL flags, UINTVAL pos, UINTVAL end)>
-
-Stub, the charset level handles this for unicode strings.
-
-=cut
-
-*/
-
-PARROT_WARN_UNUSED_RESULT
-static UINTVAL
-find_cclass(PARROT_INTERP, SHIM(const STRING *s), SHIM(const INTVAL *typetable),
-SHIM(INTVAL flags), SHIM(UINTVAL pos), SHIM(UINTVAL end))
-{
- Parrot_ex_throw_from_c_args(interp, NULL,
- EXCEPTION_UNIMPLEMENTED,
- "No find_cclass support in unicode encoding plugins");
-}
-
-/*
-
-=item C<static UINTVAL get_byte(PARROT_INTERP, const STRING *src, UINTVAL
-offset)>
-
-Returns the byte in string C<src> at position C<offset>.
-
-=cut
-
-*/
-
-static UINTVAL
-get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset)
-{
- ASSERT_ARGS(get_byte)
- unsigned char *contents = (unsigned char *)src->strstart;
- if (offset >= src->bufused) {
-/* Parrot_ex_throw_from_c_args(interp, NULL, 0,
- "get_byte past the end of the buffer (%i of %i)",
- offset, src->bufused); */
- return 0;
- }
- return contents[offset];
-}
-
-/*
-
-=item C<static void set_byte(PARROT_INTERP, const STRING *src, UINTVAL offset,
-UINTVAL byte)>
-
-Sets, in string C<src> at position C<offset>, the byte C<byte>.
-
-=cut
-
-*/
-
-static void
-set_byte(PARROT_INTERP, ARGIN(const STRING *src),
- UINTVAL offset, UINTVAL byte)
-{
- ASSERT_ARGS(set_byte)
- unsigned char *contents;
-
- if (offset >= src->bufused)
- Parrot_ex_throw_from_c_args(interp, NULL, 0,
- "set_byte past the end of the buffer");
-
- contents = (unsigned char *)src->strstart;
- contents[offset] = (unsigned char)byte;
-}
-
-/*
-
-=item C<static STRING * get_codepoints(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the codepoints in string C<src> at position C<offset> and length
-C<count>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(get_codepoints)
-
- STRING * const return_string = Parrot_str_copy(interp, src);
- String_iter iter;
- UINTVAL start;
-
- STRING_ITER_INIT(interp, &iter);
-
- if (offset)
- utf8_iter_set_position(interp, src, &iter, offset);
-
- start = iter.bytepos;
- return_string->strstart = (char *)return_string->strstart + start;
-
- if (count)
- utf8_iter_set_position(interp, src, &iter, offset + count);
-
- return_string->bufused = iter.bytepos - start;
- return_string->strlen = count;
- return_string->hashval = 0;
-
- return return_string;
-}
-
-/*
-
-=item C<static STRING * get_bytes(PARROT_INTERP, const STRING *src, UINTVAL
-offset, UINTVAL count)>
-
-Returns the bytes in string C<src> at position C<offset> and length C<count>.
-
-=cut
-
-*/
-
-PARROT_CANNOT_RETURN_NULL
-static STRING *
-get_bytes(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count)
-{
- ASSERT_ARGS(get_bytes)
- STRING * const return_string = Parrot_str_copy(interp, src);
-
- return_string->strstart = (char *)return_string->strstart + offset ;
- return_string->bufused = count;
-
- return_string->strlen = count;
- return_string->hashval = 0;
-
- return return_string;
-}
-
-
-
-/*
-
-=item C<static UINTVAL codepoints(PARROT_INTERP, const STRING *src)>
-
-Returns the number of codepoints in string C<src>.
-
-=cut
-
-*/
-
-static UINTVAL
-codepoints(PARROT_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(codepoints)
- String_iter iter;
- /*
- * this is used to initially calculate src->strlen,
- * therefore we must scan the whole string
- */
- STRING_ITER_INIT(interp, &iter);
- while (iter.bytepos < src->bufused)
- utf8_iter_get_and_advance(interp, src, &iter);
- return iter.charpos;
-}
-
-/*
-
-=item C<static UINTVAL bytes(PARROT_INTERP, const STRING *src)>
-
-Returns the number of bytes in string C<src>.
-
-=cut
-
-*/
-
-PARROT_PURE_FUNCTION
-static UINTVAL
-bytes(SHIM_INTERP, ARGIN(const STRING *src))
-{
- ASSERT_ARGS(bytes)
- return src->bufused;
-}
-
-/*
-
-=item C<void Parrot_encoding_utf8_init(PARROT_INTERP)>
-
-Initializes the UTF-8 encoding.
-
-=cut
-
-*/
-
-void
-Parrot_encoding_utf8_init(PARROT_INTERP)
-{
- ASSERT_ARGS(Parrot_encoding_utf8_init)
- ENCODING * const return_encoding = Parrot_new_encoding(interp);
+static STR_VTABLE Parrot_utf8_encoding = {
+ 0,
+ "utf8",
+ NULL,
+ 4, /* Max bytes per codepoint */
+
+ utf8_to_encoding,
+ unicode_chr,
+
+ encoding_equal,
+ encoding_compare,
+ encoding_index,
+ encoding_rindex,
+ encoding_hash,
+ unicode_validate,
+
+ utf8_scan2,
+ utf8_ord,
+ encoding_substr,
+
+ encoding_is_cclass,
+ encoding_find_cclass,
+ encoding_find_not_cclass,
+
+ encoding_get_graphemes,
+ unicode_compose,
+ encoding_decompose,
+
+ unicode_upcase,
+ unicode_downcase,
+ unicode_titlecase,
+ unicode_upcase_first,
+ unicode_downcase_first,
+ unicode_titlecase_first,
+
+ utf8_iter_get,
+ utf8_iter_skip,
+ utf8_iter_get_and_advance,
+ utf8_iter_set_and_advance,
+ utf8_iter_set_position
+};
- static const ENCODING base_encoding = {
- "utf8",
- 4, /* Max bytes per codepoint 0 .. 0x10ffff */
- to_encoding,
- get_codepoint,
- get_byte,
- set_byte,
- get_codepoints,
- get_bytes,
- codepoints,
- bytes,
- find_cclass,
- NULL,
- utf8_iter_get,
- utf8_iter_skip,
- utf8_iter_get_and_advance,
- utf8_iter_set_and_advance,
- utf8_iter_set_position
- };
- STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
- Parrot_register_encoding(interp, "utf8", return_encoding);
+STR_VTABLE *Parrot_utf8_encoding_ptr = &Parrot_utf8_encoding;
- return;
-}
/*
Modified: trunk/src/string/encoding/utf8.h
==============================================================================
--- trunk/src/string/encoding/utf8.h Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/encoding/utf8.h Tue Sep 7 22:58:38 2010 (r48833)
@@ -16,11 +16,7 @@
/* HEADERIZER BEGIN: src/string/encoding/utf8.c */
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
-void Parrot_encoding_utf8_init(PARROT_INTERP)
- __attribute__nonnull__(1);
-#define ASSERT_ARGS_Parrot_encoding_utf8_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp))
/* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
/* HEADERIZER END: src/string/encoding/utf8.c */
Modified: trunk/src/string/primitives.c
==============================================================================
--- trunk/src/string/primitives.c Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/src/string/primitives.c Tue Sep 7 22:58:38 2010 (r48833)
@@ -89,14 +89,15 @@
UINTVAL workchar = 0;
UINTVAL charcount = 0;
const UINTVAL len = Parrot_str_byte_length(interp, string);
+ const unsigned char * const buf = (unsigned char *)string->strstart;
/* Well, not right now */
- UINTVAL codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ UINTVAL codepoint = buf[*offset];
++*offset;
switch (codepoint) {
case 'x':
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= '0' && codepoint <= '9') {
workchar = codepoint - '0';
}
@@ -111,7 +112,7 @@
++*offset;
workchar = 0;
for (i = 0; i < 8 && *offset < len; ++i, ++*offset) {
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint == '}') {
++*offset;
return workchar;
@@ -145,7 +146,7 @@
++*offset;
if (*offset < len) {
workchar *= 16;
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= '0' && codepoint <= '9') {
workchar += codepoint - '0';
}
@@ -165,7 +166,7 @@
++*offset;
return workchar;
case 'c':
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= 'A' && codepoint <= 'Z') {
workchar = codepoint - 'A' + 1;
}
@@ -181,7 +182,7 @@
for (charcount = 0; charcount < 4; charcount++) {
if (*offset < len) {
workchar *= 16;
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= '0' && codepoint <= '9') {
workchar += codepoint - '0';
}
@@ -211,7 +212,7 @@
for (charcount = 0; charcount < 8; charcount++) {
if (*offset < len) {
workchar *= 16;
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= '0' && codepoint <= '9') {
workchar += codepoint - '0';
}
@@ -247,7 +248,7 @@
workchar = codepoint - '0';
if (*offset < len) {
workchar *= 8;
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= '0' && codepoint <= '7') {
workchar += codepoint - '0';
}
@@ -261,7 +262,7 @@
++*offset;
if (*offset < len) {
workchar *= 8;
- codepoint = CHARSET_GET_BYTE(interp, string, *offset);
+ codepoint = buf[*offset];
if (codepoint >= '0' && codepoint <= '7') {
workchar += codepoint - '0';
}
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/t/op/string_cs.t Tue Sep 7 22:58:38 2010 (r48833)
@@ -337,7 +337,7 @@
end
CODE
abc_\xc3\xa4_
-unicode
+utf8
6
OUTPUT
Modified: trunk/t/pmc/bytebuffer.t
==============================================================================
--- trunk/t/pmc/bytebuffer.t Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/t/pmc/bytebuffer.t Tue Sep 7 22:58:38 2010 (r48833)
@@ -135,7 +135,7 @@
bb = new ['ByteBuffer']
bb = binary:"abcd"
- s = bb.'get_string'('ascii', 'fixed_8')
+ s = bb.'get_string'('ascii')
n = length s
is(n, 4, "getting ascii from buffer gives correct length")
is(s, "abcd", "getting ascii from buffer gives correct content")
@@ -161,7 +161,7 @@
bb[0] = 0x00
bb[1] = 0xD1
doit:
- s = bb.'get_string'('unicode', 'utf16')
+ s = bb.'get_string'('utf16')
n = length s
is(n, 1, "getting utf16 from buffer gives correct length")
n = ord s
@@ -297,7 +297,7 @@
if i < 8192 goto loopset
.local string s
- s = bb.'get_string'('unicode', 'utf16')
+ s = bb.'get_string'('utf16')
# Check string size
i = length s
@@ -350,7 +350,7 @@
bb = new ['ByteBuffer']
bb = 'something'
push_eh catch_charset
- s = bb.'get_string'('***INVALID cHARsET%%%%', 'fixed_8')
+ s = bb.'get_string'('***INVALID cHARsET%%%%')
pop_eh
ok(0, "get_string with invalid charset should throw")
goto check_encoding
Modified: trunk/t/pmc/filehandle.t
==============================================================================
--- trunk/t/pmc/filehandle.t Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/t/pmc/filehandle.t Tue Sep 7 22:58:38 2010 (r48833)
@@ -541,7 +541,7 @@
\$I1 = charset line
\$S2 = charsetname \$I1
- if \$S2 == 'unicode' goto ok_3
+ if \$S2 == 'utf8' goto ok_3
print \$S2
print 'not '
ok_3:
Modified: trunk/t/pmc/io.t
==============================================================================
--- trunk/t/pmc/io.t Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/t/pmc/io.t Tue Sep 7 22:58:38 2010 (r48833)
@@ -658,7 +658,7 @@
print $S1
.end
CODE
-unicode
+utf8
utf8
T\xf6tsch
OUTPUT
Modified: trunk/tools/dev/gen_charset_tables.pl
==============================================================================
--- trunk/tools/dev/gen_charset_tables.pl Tue Sep 7 22:20:33 2010 (r48832)
+++ trunk/tools/dev/gen_charset_tables.pl Tue Sep 7 22:58:38 2010 (r48833)
@@ -23,7 +23,7 @@
'$Id$' =~
/^\$[iI][dD]:\s(.*) \$$/;
my $fileid = '$' . 'Id $';
-my $charset_dir = File::Spec->catdir(qw/ src charset /);
+my $charset_dir = File::Spec->catdir(qw/ src string encoding /);
my $coda = <<'EOF';
/*
@@ -110,7 +110,7 @@
}
#
-# create 'src/charset/tables.c'
+# create 'src/string/encoding/tables.c'
#
###########################################################################
my $c_file = File::Spec->catfile( $charset_dir, 'tables.c' );
@@ -129,7 +129,7 @@
close STDOUT;
#
-# create 'src/charset/tables.h'
+# create 'src/string/encoding/tables.h'
#
###########################################################################
my $h_file = File::Spec->catfile( $charset_dir, 'tables.h' );
More information about the parrot-commits
mailing list