Index: configure.ac
===================================================================
--- configure.ac	(revision 1011)
+++ configure.ac	(working copy)
@@ -162,6 +162,8 @@
 fi
 fi
 
+m4_include([m4/ax_path_lib_pcre.m4]) AX_PATH_LIB_PCRE([])
+
 # check if rdtsc (read CPU cycle counter is available.
 # This is expected only on Intel CPUs
 AC_MSG_CHECKING([whether CPU has rdtsc (read CPU cycle counter) opcode])
Index: m4/ax_path_lib_pcre.m4
===================================================================
--- m4/ax_path_lib_pcre.m4	(nonexistent)
+++ m4/ax_path_lib_pcre.m4	(working copy)
@@ -0,0 +1,90 @@
+# ===========================================================================
+#     https://www.gnu.org/software/autoconf-archive/ax_path_lib_pcre.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PATH_LIB_PCRE [(A/NA)]
+#
+# DESCRIPTION
+#
+#   check for pcre lib and set PCRE_LIBS and PCRE_CFLAGS accordingly.
+#
+#   also provide --with-pcre option that may point to the $prefix of the
+#   pcre installation - the macro will check $pcre/include and $pcre/lib to
+#   contain the necessary files.
+#
+#   the usual two ACTION-IF-FOUND / ACTION-IF-NOT-FOUND are supported and
+#   they can take advantage of the LIBS/CFLAGS additions.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AC_DEFUN([AX_PATH_LIB_PCRE],[dnl
+AC_MSG_CHECKING([lib pcre])
+AC_ARG_WITH(pcre,
+[  --with-pcre[[=prefix]]    compile xmlpcre part (via libpcre check)],,
+     with_pcre="yes")
+if test ".$with_pcre" = ".no" ; then
+  AC_MSG_RESULT([disabled])
+  m4_ifval($2,$2)
+else
+  AC_MSG_RESULT([(testing)])
+  AC_CHECK_LIB(pcre2-32, pcre2_compile_32)
+  if test "$ac_cv_lib_pcre2_32_pcre2_compile_32" = "yes" ; then
+     PCRE_LIBS="-lpcre2-32"
+     AC_MSG_CHECKING([lib pcre])
+     AC_MSG_RESULT([$PCRE_LIBS])
+     m4_ifval($1,$1)
+  else
+     OLDLDFLAGS="$LDFLAGS" ; LDFLAGS="$LDFLAGS -L$with_pcre/lib"
+     OLDCPPFLAGS="$CPPFLAGS" ; CPPFLAGS="$CPPFLAGS -I$with_pcre/include"
+     AC_CHECK_LIB(pcre2-32, pcre2_match_data_create_from_pattern_32)
+     CPPFLAGS="$OLDCPPFLAGS"
+     LDFLAGS="$OLDLDFLAGS"
+     if test "$ac_cv_lib_pcre2_32_pcre2_match_data_create_from_pattern_32" = "yes" ; then
+        AC_MSG_RESULT(.setting PCRE_LIBS -L$with_pcre/lib -lpcre2-32)
+        PCRE_LIBS="-L$with_pcre/lib -lpcre2-32"
+        test -d "$with_pcre/include" && PCRE_CFLAGS="-I$with_pcre/include"
+        AC_MSG_CHECKING([lib pcre])
+        AC_MSG_RESULT([$PCRE_LIBS])
+        m4_ifval($1,$1)
+     else
+        AC_MSG_CHECKING([lib pcre])
+        AC_MSG_RESULT([no (WARNING)])
+        m4_ifval($2,$2)
+     fi
+  fi
+fi
+AC_SUBST([PCRE_LIBS])
+AC_SUBST([PCRE_CFLAGS])
+])
Index: src/Id.cc
===================================================================
--- src/Id.cc	(revision 1011)
+++ src/Id.cc	(working copy)
@@ -37,6 +37,7 @@
 #include "QuadFunction.hh"
 #include "Quad_DLX.hh"
 #include "Quad_FX.hh"
+#include "Quad_RE.hh"
 #include "Quad_SQL.hh"
 #include "Quad_SVx.hh"
 #include "Quad_TF.hh"
Index: src/Id.def
===================================================================
--- src/Id.def	(revision 1011)
+++ src/Id.def	(working copy)
@@ -201,6 +201,7 @@
 qf( SVS , "⎕SVS" , )
 qv( SYL , "⎕SYL" , )
 pp( USER_SYMBOL , --- , )
+qf( RE , "⎕RE" , )
 pp( STOP_LINE , --- , )
 qf( STOP , "⎕STOP" , )
 qf( SQL , "⎕SQL" , )
Index: src/Makefile.am
===================================================================
--- src/Makefile.am	(revision 1011)
+++ src/Makefile.am	(working copy)
@@ -86,10 +86,12 @@
 Quad_DLX.cc Quad_DLX.hh \
 Quad_FIO.cc Quad_FIO.hh \
 Quad_FX.cc Quad_FX.hh \
+Quad_RE.cc Quad_RE.hh \
 Quad_RL.cc Quad_RL.hh \
 Quad_SQL.cc Quad_SQL.hh \
 Quad_SVx.cc Quad_SVx.hh \
 Quad_TF.cc Quad_TF.hh \
+Regexp.cc Regexp.hh \
 Parallel.cc Parallel.hh \
 Performance.cc Performance.def Performance.hh \
 RealCell.cc RealCell.hh \
Index: src/QuadFunction.cc
===================================================================
--- src/QuadFunction.cc	(revision 1011)
+++ src/QuadFunction.cc	(working copy)
@@ -36,6 +36,7 @@
 #include "PrintOperator.hh"
 #include "QuadFunction.hh"
 #include "Quad_FX.hh"
+#include "Quad_RE.hh"
 #include "Quad_SQL.hh"
 #include "Quad_TF.hh"
 #include "Tokenizer.hh"
Index: src/Quad_RE.cc
===================================================================
--- src/Quad_RE.cc	(nonexistent)
+++ src/Quad_RE.cc	(working copy)
@@ -0,0 +1,235 @@
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "Quad_RE.hh"
+#include "Workspace.hh"
+#include "PointerCell.hh"
+
+#include "Regexp.hh"
+
+/// Options taken from the axis argument X: PCRE2 compile options plus two
+/// switches that select how results are reported.
+class Flags
+{
+public:
+    /// Parse flags_in one character at a time: i, s, x and m select
+    /// PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED and PCRE2_MULTILINE;
+    /// E and B set the two switches below. Any other character signals
+    /// VALUE_ERROR.
+    Flags(const UCS_string &flags_in);
+
+    /// The selected PCRE2 compile options, OR-ed together.
+    int get_compflags() const { return flags; }
+
+    /// True if E was given: a string without a match signals DOMAIN_ERROR.
+    bool get_error_on_no_match() const { return error_on_no_match; }
+
+    /// True if B was given: matches are reported as an integer bitmap.
+    bool get_result_bitmap() const { return result_bitmap; }
+
+private:
+    int flags;
+    bool error_on_no_match;
+    bool result_bitmap;
+};
+
+Flags::Flags(const UCS_string &flags_string)
+    : flags(0), error_on_no_match(false), result_bitmap(false)
+{
+    int result = 0;
+    UCS_string::iterator i = flags_string.begin();
+    while(i.more()) {
+        Unicode ch = i.next();
+        switch(static_cast<int>(ch)) {
+        case 'i':
+            result |= PCRE2_CASELESS;
+            break;
+        case 's':
+            result |= PCRE2_DOTALL;
+            break;
+        case 'x':
+            result |= PCRE2_EXTENDED;
+            break;
+        case 'm':
+            result |= PCRE2_MULTILINE;
+            break;
+        case 'E':
+            error_on_no_match = true;
+            break;
+        case 'B':
+            result_bitmap = true;
+            break;
+        default:
+            MORE_ERROR() << "Unknown regexp flag: " << ch;
+            VALUE_ERROR;
+        }
+    }
+    flags = result;
+}
+
+
+Quad_RE Quad_RE::_fun;
+Quad_RE *Quad_RE::fun = &Quad_RE::_fun;
+
+Quad_RE::Quad_RE() : QuadFunction(TOK_Quad_RE)
+{
+}
+
+/// Dyadic call without axis: eval_AXB() with Str0(LOC) as the flags.
+Token Quad_RE::eval_AB(Value_P A, Value_P B)
+{
+    return eval_AXB(A, Str0(LOC), B);
+}
+
+/// Half-open range [first, second) of character offsets covered by a match.
+typedef pair<ShapeItem, ShapeItem> MatchSpan;
+
+/// Bitmap (B flag) result: an integer vector with one item per character
+/// of matched. The characters of the N-th non-empty match are N (counted
+/// from 1); all other characters are 0.
+static Value_P
+regex_bitmap(const Regexp &regexp, const UCS_string &matched)
+{
+    vector<MatchSpan> spans;
+    ShapeItem pos = 0;
+    while(pos < matched.size()) {
+        unique_ptr<RegexpMatch> match(regexp.match(matched, static_cast<PCRE2_SIZE>(pos)));
+        if(!match->is_match()) {
+            break;
+        }
+
+        const PCRE2_SIZE *ovector = match->get_ovector();
+        const ShapeItem from = ovector[0];
+        const ShapeItem to = ovector[1];
+        if(to > from) {
+            spans.push_back(MatchSpan(from, to));
+        }
+
+        // An empty match ends where the search began, so resuming at its
+        // end would find it again forever. Always advance by at least one
+        // character.
+        const ShapeItem next = (to > from) ? to : to + 1;
+        pos = (next > pos) ? next : pos + 1;
+    }
+
+    Shape shape(matched.size());
+    Value_P result_value(shape, LOC);
+    ShapeItem w = 0;
+    int match_id = 1;
+    for(vector<MatchSpan>::iterator i = spans.begin() ; i != spans.end() ; i++) {
+        while(w < i->first) {
+            new (result_value->next_ravel()) IntCell(0);
+            w++;
+        }
+        while(w < i->second) {
+            new (result_value->next_ravel()) IntCell(match_id);
+            w++;
+        }
+        match_id++;
+    }
+    while(w < matched.size()) {
+        new (result_value->next_ravel()) IntCell(0);
+        w++;
+    }
+    result_value->check_value(LOC);
+    return result_value;
+}
+
+/// Default result for one string. Without a match: Idx0(LOC), or
+/// DOMAIN_ERROR if the E flag is set. With a match: the matched text if
+/// pcre2_match() reported the overall match only, otherwise a vector with
+/// one nested string per reported capture group.
+static Value_P
+regex_strings(const Regexp &regexp, const Flags &flags, const UCS_string &matched)
+{
+    unique_ptr<RegexpMatch> match(regexp.match(matched, 0));
+    if(!match->is_match()) {
+        if(flags.get_error_on_no_match()) {
+            MORE_ERROR() << "No match";
+            DOMAIN_ERROR;
+        }
+        return Idx0(LOC);
+    }
+
+    if(match->num_matches() == 1) {
+        return Value_P(match->matched_string(), LOC);
+    }
+
+    vector<UCS_string> strings = match->matched_string_list();
+    Shape shape(strings.size());
+    Value_P result_value(shape, LOC);
+    for(vector<UCS_string>::iterator i = strings.begin() ; i != strings.end() ; i++) {
+        Value_P field_value(*i, LOC);
+        field_value->check_value(LOC);
+        new (result_value->next_ravel()) PointerCell(field_value, result_value.getref());
+    }
+    result_value->check_value(LOC);
+    return result_value;
+}
+
+/// Match one string against regexp and build the result selected by flags.
+static Value_P
+fill_regex_results(const Regexp &regexp, const Flags &flags, const UCS_string &matched)
+{
+    if(flags.get_result_bitmap()) {
+        return regex_bitmap(regexp, matched);
+    }
+    return regex_strings(regexp, flags, matched);
+}
+
+/// A is the pattern and must be a character string; X holds the flag
+/// characters. A scalar B gives Idx0(LOC). A character string B gives the
+/// result for that string. Any other B must contain only character
+/// strings and gives an array of the same shape with one nested result
+/// per item.
+Token
+Quad_RE::eval_AXB(const Value_P A, const Value_P X, const Value_P B)
+{
+    if(!A->is_char_string()) {
+        MORE_ERROR() << "Regexp argument must be a string value";
+        VALUE_ERROR;
+    }
+
+    Flags flags(X->get_UCS_ravel());
+    Regexp regexp(A->get_UCS_ravel(), flags.get_compflags());
+
+    const Shape &shape = B->get_shape();
+    if(shape.get_rank() == 0) {
+        return Token(TOK_APL_VALUE1, Idx0(LOC));
+    }
+
+    if(B->is_char_string()) {
+        Value_P result = fill_regex_results(regexp, flags, B->get_UCS_ravel());
+        return Token(TOK_APL_VALUE1, result);
+    }
+
+    Value_P result(shape, LOC);
+    for(ShapeItem i = 0 ; i < shape.get_volume() ; i++) {
+        const Cell &cell = B->get_ravel(i);
+        Value_P value = cell.to_value(LOC);
+        if(!value->is_char_string()) {
+            MORE_ERROR() << "Cell does not contain a string";
+            DOMAIN_ERROR;
+        }
+
+        Value_P result_value = fill_regex_results(regexp, flags, value->get_UCS_ravel());
+        new (result->next_ravel()) PointerCell(result_value, result.getref());
+    }
+    result->check_value(LOC);
+    return Token(TOK_APL_VALUE1, result);
+}
+
+/// Monadic use is not supported.
+Token
+Quad_RE::eval_B(Value_P B)
+{
+    VALENCE_ERROR;
+}
+
+/// Monadic use with axis is not supported.
+Token
+Quad_RE::eval_XB(Value_P X, Value_P B)
+{
+    VALENCE_ERROR;
+}
Index: src/Quad_RE.hh
===================================================================
--- src/Quad_RE.hh	(nonexistent)
+++ src/Quad_RE.hh	(working copy)
@@ -0,0 +1,52 @@
+/*
+    This file is part of GNU APL, a free implementation of the
+    ISO/IEC Standard 13751, "Programming Language APL, Extended"
+
+    Copyright (C) 2008-2016  Dr. Jürgen Sauermann
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __Quad_RE_DEFINED__
+#define __Quad_RE_DEFINED__
+
+#include "QuadFunction.hh"
+#include "Value.hh"
+#include "Simple_string.hh"
+
+/// The system function Quad_RE (regular expressions).
+class Quad_RE : public QuadFunction
+{
+public:
+    /// Constructor.
+    Quad_RE();
+
+    static Quad_RE * fun;     ///< Built-in function.
+    static Quad_RE  _fun;     ///< Built-in function.
+
+protected:
+    /// overloaded Function::eval_AB().
+    Token eval_AB(const Value_P A, const Value_P B);
+
+    /// overloaded Function::eval_AXB().
+    Token eval_AXB(const Value_P A, const Value_P X, const Value_P B);
+
+    /// overloaded Function::eval_B().
+    Token eval_B(Value_P B);
+
+    /// overloaded Function::eval_XB().
+    Token eval_XB(Value_P X, Value_P B);
+};
+
+#endif
Index: src/Regexp.cc
===================================================================
--- src/Regexp.cc	(nonexistent)
+++ src/Regexp.cc	(working copy)
@@ -0,0 +1,130 @@
+#include "Workspace.hh"
+#include "Regexp.hh"
+
+/// Copy the characters of string into a new[] array of 32-bit code units.
+/// The array is not 0-terminated and must be delete[]d by the caller.
+static const PCRE2_UCHAR32 *ucs_to_codepoints(const UCS_string &string)
+{
+    int size = string.size();
+    PCRE2_UCHAR32 *buf = new PCRE2_UCHAR32[size];
+    PCRE2_UCHAR32 *p = buf;
+    UCS_string::iterator i = string.begin();
+    while(i.more()) {
+        *p++ = i.next();
+    }
+    return buf;
+}
+
+/// Convert the 0-terminated code unit array buf into a UCS_string.
+static UCS_string make_ucs_string(PCRE2_UCHAR32 *buf)
+{
+    UCS_string result;
+    PCRE2_UCHAR32 *p = buf;
+    while(*p != 0) {
+        result.append(static_cast<Unicode>(*p++));
+    }
+    return result;
+}
+
+RegexpMatch::RegexpMatch(pcre2_code *code, const UCS_string &matched, PCRE2_SIZE start)
+    : ovector(NULL), matched_ucs(NULL), match_data(NULL), match_result(0)
+{
+    matched_ucs = ucs_to_codepoints(matched);
+    match_data = pcre2_match_data_create_from_pattern_32(code, NULL);
+    match_result = pcre2_match_32(code, matched_ucs, matched.size(), start, 0, match_data, NULL);
+    if(match_result == 0) {
+        // If FIXME leaves the constructor by throwing, the destructor never
+        // runs, so release the buffers here. Clearing the pointers keeps
+        // the destructor harmless should it run after all.
+        pcre2_match_data_free(match_data);
+        match_data = NULL;
+        delete[] matched_ucs;
+        matched_ucs = NULL;
+        MORE_ERROR() << "Match buffer too small";
+        FIXME;
+    }
+    else if(match_result > 0) {
+        ovector = pcre2_get_ovector_pointer_32(match_data);
+    }
+}
+
+RegexpMatch::~RegexpMatch()
+{
+    delete[] matched_ucs;
+    pcre2_match_data_free(match_data);
+}
+
+bool RegexpMatch::is_match() const
+{
+    return match_result > 0;
+}
+
+int RegexpMatch::num_matches() const
+{
+    if(match_result < 0) {
+        MORE_ERROR() << "Attempt to call num_matches without matches";
+        FIXME;
+    }
+    return match_result;
+}
+
+UCS_string RegexpMatch::matched_string() const
+{
+    const PCRE2_SIZE *ovector = get_ovector();
+    UCS_string result(reinterpret_cast<const Unicode *>(matched_ucs + ovector[0]), ovector[1] - ovector[0]);
+    return result;
+}
+
+vector<UCS_string> RegexpMatch::matched_string_list() const
+{
+    const PCRE2_SIZE *ovector = get_ovector();
+    vector<UCS_string> result;
+    for(int i = 1 ; i < match_result ; i++) {
+        PCRE2_SIZE start = ovector[i * 2];
+        PCRE2_SIZE end = ovector[i * 2 + 1];
+        if(start == PCRE2_UNSET) {
+            // A group that took no part in the match has no offsets;
+            // report it as an empty string.
+            result.push_back(UCS_string());
+            continue;
+        }
+        result.push_back(UCS_string(reinterpret_cast<const Unicode *>(matched_ucs + start), end - start));
+    }
+    return result;
+}
+
+Regexp::Regexp(const UCS_string &pattern, int flags)
+{
+    const PCRE2_UCHAR32 *pattern_ucs = ucs_to_codepoints(pattern);
+
+    int error_code;
+    PCRE2_SIZE error_offset;
+
+    code = pcre2_compile_32(pattern_ucs, pattern.size(), PCRE2_NO_UTF_CHECK | flags, &error_code, &error_offset, NULL);
+    delete[] pattern_ucs;
+    if(code == NULL) {
+        PCRE2_UCHAR32 buf[256] = { 0 };
+        // The buffer length is given in code units, not in bytes.
+        pcre2_get_error_message_32(error_code, buf, sizeof(buf) / sizeof(buf[0]));
+        UCS_string error_message = make_ucs_string(buf);
+        MORE_ERROR() << "Error compiling regex at offset: " << error_offset << ": " << error_message;
+        VALUE_ERROR;
+    }
+}
+
+Regexp::~Regexp()
+{
+    pcre2_code_free(code);
+}
+
+RegexpMatch *Regexp::match(const UCS_string &match, PCRE2_SIZE size) const
+{
+    return new RegexpMatch(code, match, size);
+}
+
+int Regexp::expression_count() const
+{
+    uint32_t result = 0;
+    pcre2_pattern_info(code, PCRE2_INFO_CAPTURECOUNT, &result);
+    return result;
+}
Index: src/Regexp.hh
===================================================================
--- src/Regexp.hh	(nonexistent)
+++ src/Regexp.hh	(working copy)
@@ -0,0 +1,72 @@
+#ifndef __Regexp__DEFINED__
+#define __Regexp__DEFINED__
+
+#include "UCS_string.hh"
+#include <vector>
+
+// Select the 32-bit PCRE2 API: each Unicode character is one code unit.
+#define PCRE2_CODE_UNIT_WIDTH 32
+#include <pcre2.h>
+
+/// The result of one match attempt. Owns a copy of the subject string and
+/// the PCRE2 match data; both are released by the destructor.
+class RegexpMatch
+{
+public:
+    /// Match the given string against code, searching from offset start.
+    RegexpMatch(pcre2_code *code, const UCS_string &, PCRE2_SIZE start);
+    virtual ~RegexpMatch();
+
+    /// True if the pattern matched.
+    bool is_match() const;
+
+    /// The positive value returned by pcre2_match(): 1 for the overall
+    /// match plus the number of the highest capture group that was set.
+    int num_matches() const;
+
+    /// The text of the overall match. Only valid if is_match().
+    UCS_string matched_string() const;
+
+    /// Start/end offset pairs of the match, or NULL if there was no match.
+    const PCRE2_SIZE *get_ovector() const { return ovector; }
+
+    /// The text of capture groups 1, 2, ... (the overall match is not
+    /// included). Only valid if is_match().
+    vector<UCS_string> matched_string_list() const;
+
+private:
+    // Copying would free matched_ucs and match_data twice.
+    RegexpMatch(const RegexpMatch &);
+    RegexpMatch &operator=(const RegexpMatch &);
+
+    PCRE2_SIZE *ovector;
+    const PCRE2_UCHAR32 *matched_ucs;
+    pcre2_match_data *match_data;
+    int match_result;
+};
+
+/// A compiled regular expression.
+class Regexp
+{
+public:
+    /// Compile pattern with the PCRE2 compile options in flags; signals
+    /// VALUE_ERROR if the pattern does not compile.
+    Regexp(const UCS_string &pattern, int flags);
+    virtual ~Regexp();
+
+    /// Match the string match, starting the search at offset size. The
+    /// caller owns (and must delete) the returned object.
+    RegexpMatch *match(const UCS_string &match, PCRE2_SIZE size) const;
+
+    /// The number of capture groups in the pattern.
+    int expression_count() const;
+
+private:
+    // Copying would free code twice.
+    Regexp(const Regexp &);
+    Regexp &operator=(const Regexp &);
+
+    pcre2_code *code;
+};
+
+#endif
Index: src/SystemVariable.def
===================================================================
--- src/SystemVariable.def	(revision 1011)
+++ src/SystemVariable.def	(working copy)
@@ -73,6 +73,7 @@
 sf_def(Quad_NA, "NA", "Name Association" )
 sf_def(Quad_NC, "NC", "Name Class" )
 sf_def(Quad_NL, "NL", "Name List" )
+ sf_def(Quad_RE, "RE", "Regular expression" )
 sf_def(Quad_SI, "SI", "State Indicator" )
 sf_def(Quad_SQL, "SQL", "SQL functions" )
 sf_def(Quad_SVC, "SVC", "Shared Variable Control" )
@@ -86,6 +87,3 @@
 sf_def(Quad_UCS, "UCS", "Universal Char Set (Unicode)" )
 # undef sf_def
 #endif
-
-
-
Index: src/Token.def
===================================================================
--- src/Token.def	(revision 1011)
+++ src/Token.def	(working copy)
@@ -116,6 +116,7 @@
 TD(TOK_Quad_EC , TC_FUN1 , TV_FUN , ID::Quad_EC )
 TD(TOK_Quad_ENV , TC_FUN1 , TV_FUN , ID::Quad_ENV )
 TD(TOK_Quad_EX , TC_FUN1 , TV_FUN , ID::Quad_EX )
+TD(TOK_Quad_RE , TC_FUN2 , TV_FUN , ID::Quad_RE )
 TD(TOK_Quad_SQL , TC_FUN2 , TV_FUN , ID::Quad_SQL )
 TD(TOK_Quad_SVQ , TC_FUN1 , TV_FUN , ID::Quad_SVQ )
 TD(TOK_Quad_SVR , TC_FUN1 , TV_FUN , ID::Quad_SVR )
Index: src/Workspace.hh
===================================================================
--- src/Workspace.hh	(revision 1011)
+++ src/Workspace.hh	(working copy)
@@ -28,6 +28,7 @@
 #include "Quad_CR.hh"
 #include "Quad_DLX.hh"
 #include "Quad_FIO.hh"
+#include "Quad_RE.hh"
 #include "Quad_RL.hh"
 #include "Quad_SVx.hh"
 #include "ScalarFunction.hh"