emacs-pretest-bug
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Matching programming language identifiers, not "words"


From: Jim Blandy
Subject: Re: Matching programming language identifiers, not "words"
Date: 18 May 2004 10:07:51 -0500
User-agent: Gnus/5.09 (Gnus v5.9.0) Emacs/21.3

Tim Van Holder <address@hidden> writes:

> Eli Zaretskii wrote:
> 
> >A couple of weeks ago, Jim Blandy submitted changes to implement your
> >first suggestion, whereby "\_<" and "\_>" match an empty string at the
> >beginning and end of a symbol.  It is still not in the CVS, though.
> >
> >
> Excellent.

Here's the patch --- please test it.  Once we've got some positive
reports on it, I think it'll be ready to go in.

src/ChangeLog:
2004-04-29  Jim Blandy  <address@hidden>

        Add support for new '\_<' and '\_>' regexp operators, matching the
        beginnings and ends of symbols.
        * regex.c (enum syntaxcode): Add Ssymbol.
        (init_syntax_once): Set the syntax for '_' to Ssymbol, not Sword.
        (symbeg, symend): New opcodes.
        (print_partial_compiled_pattern): Print the new opcodes properly.
        (regex_compile): Parse the new operators.
        (analyze_first): symbeg and symend match only the empty string.
        (mutually_exclusive_p): symend is mutually exclusive with \s_ and
        \sw; symbeg is mutually exclusive with \S_ and \Sw.
        (re_match_2_internal): Add code for symbeg and symend.
        * search.c (trivial_regexp_p): \_ is no longer a trivial regexp.

man/ChangeLog:
2004-04-29  Jim Blandy  <address@hidden>

        * search.texi (Regexps): Document the \_< and \_> regexp operators.

lispref/ChangeLog:
2004-05-04  Jim Blandy  <address@hidden>

        * searching.texi (Regexp Backslash): Document new \_< and \_>
        operators.

*** src/regex.c.~2~     2004-04-29 15:56:53.000000000 -0500
--- src/regex.c 2004-04-29 17:44:24.000000000 -0500
***************
*** 219,225 ****
  /* Define the syntax stuff for \<, \>, etc.  */
  
  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
! enum syntaxcode { Swhitespace = 0, Sword = 1 };
  
  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
--- 219,225 ----
  /* Define the syntax stuff for \<, \>, etc.  */
  
  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
! enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
  
  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
***************
*** 399,405 ****
       if (ISALNUM (c))
        re_syntax_table[c] = Sword;
  
!    re_syntax_table['_'] = Sword;
  
     done = 1;
  }
--- 399,405 ----
       if (ISALNUM (c))
        re_syntax_table[c] = Sword;
  
!    re_syntax_table['_'] = Ssymbol;
  
     done = 1;
  }
***************
*** 656,661 ****
--- 656,664 ----
    wordbound,  /* Succeeds if at a word boundary.  */
    notwordbound,       /* Succeeds if not at a word boundary.  */
  
+   symbeg,       /* Succeeds if at symbol beginning.  */
+   symend,       /* Succeeds if at symbol end.  */
+ 
        /* Matches any character whose syntax is specified.  Followed by
           a byte which contains a syntax code, e.g., Sword.  */
    syntaxspec,
***************
*** 1095,1100 ****
--- 1098,1110 ----
        case wordend:
          printf ("/wordend");
  
+       case symbeg:
+         printf ("/symbeg");
+         break;
+ 
+       case symend:
+         printf ("/symend");
+ 
        case syntaxspec:
          printf ("/syntaxspec");
          mcnt = *p++;
***************
*** 3135,3140 ****
--- 3145,3163 ----
              BUF_PUSH (wordend);
              break;
  
+           case '_':
+             if (syntax & RE_NO_GNU_OPS)
+               goto normal_char;
+               laststart = b;
+               PATFETCH (c);
+               if (c == '<')
+                 BUF_PUSH (symbeg);
+               else if (c == '>')
+                 BUF_PUSH (symend);
+               else
+                 FREE_STACK_RETURN (REG_BADPAT);
+               break;
+ 
            case 'b':
              if (syntax & RE_NO_GNU_OPS)
                goto normal_char;
***************
*** 3629,3634 ****
--- 3652,3659 ----
        case notwordbound:
        case wordbeg:
        case wordend:
+       case symbeg:
+       case symend:
          continue;
  
  
***************
*** 4396,4409 ****
        break;
  
      case wordend:
!     case notsyntaxspec:
        return ((re_opcode_t) *p1 == syntaxspec
!             && p1[1] == (op2 == wordend ? Sword : p2[1]));
  
      case wordbeg:
!     case syntaxspec:
        return ((re_opcode_t) *p1 == notsyntaxspec
!             && p1[1] == (op2 == wordbeg ? Sword : p2[1]));
  
      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
--- 4421,4440 ----
        break;
  
      case wordend:
!       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
!     case symend:
        return ((re_opcode_t) *p1 == syntaxspec
!               && (p1[1] == Ssymbol || p1[1] == Sword));
!     case notsyntaxspec:
!       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
  
      case wordbeg:
!       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
!     case symbeg:
        return ((re_opcode_t) *p1 == notsyntaxspec
!               && (p1[1] == Ssymbol || p1[1] == Sword));
!     case syntaxspec:
!       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
  
      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
***************
*** 5528,5533 ****
--- 5559,5650 ----
            }
          break;
  
+       case symbeg:
+         DEBUG_PRINT1 ("EXECUTING symbeg.\n");
+ 
+         /* We FAIL in one of the following cases: */
+ 
+         /* Case 1: D is at the end of string.  */
+         if (AT_STRINGS_END (d))
+           goto fail;
+         else
+           {
+             /* C1 is the character before D, S1 is the syntax of C1, C2
+                is the character at D, and S2 is the syntax of C2.  */
+             re_wchar_t c1, c2;
+             int s1, s2;
+ #ifdef emacs
+             int offset = PTR_TO_OFFSET (d);
+             int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+             UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+             PREFETCH ();
+             c2 = RE_STRING_CHAR (d, dend - d);
+             s2 = SYNTAX (c2);
+       
+             /* Case 2: S2 is neither Sword nor Ssymbol. */
+             if (s2 != Sword && s2 != Ssymbol)
+               goto fail;
+ 
+             /* Case 3: D is not at the beginning of string ... */
+             if (!AT_STRINGS_BEG (d))
+               {
+                 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ #ifdef emacs
+                 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+ #endif
+                 s1 = SYNTAX (c1);
+ 
+                 /* ... and S1 is Sword or Ssymbol.  */
+                 if (s1 == Sword || s1 == Ssymbol)
+                   goto fail;
+               }
+           }
+         break;
+ 
+       case symend:
+         DEBUG_PRINT1 ("EXECUTING symend.\n");
+ 
+         /* We FAIL in one of the following cases: */
+ 
+         /* Case 1: D is at the beginning of string.  */
+         if (AT_STRINGS_BEG (d))
+           goto fail;
+         else
+           {
+             /* C1 is the character before D, S1 is the syntax of C1, C2
+                is the character at D, and S2 is the syntax of C2.  */
+             re_wchar_t c1, c2;
+             int s1, s2;
+ #ifdef emacs
+             int offset = PTR_TO_OFFSET (d) - 1;
+             int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+             UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+             GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+             s1 = SYNTAX (c1);
+ 
+             /* Case 2: S1 is neither Ssymbol nor Sword.  */
+             if (s1 != Sword && s1 != Ssymbol)
+               goto fail;
+ 
+             /* Case 3: D is not at the end of string ... */
+             if (!AT_STRINGS_END (d))
+               {
+                 PREFETCH_NOLIMIT ();
+                 c2 = RE_STRING_CHAR (d, dend - d);
+ #ifdef emacs
+                 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ #endif
+                 s2 = SYNTAX (c2);
+ 
+                 /* ... and S2 is Sword or Ssymbol.  */
+                 if (s2 == Sword || s2 == Ssymbol)
+                     goto fail;
+               }
+           }
+         break;
+ 
        case syntaxspec:
        case notsyntaxspec:
          not = (re_opcode_t) *(p - 1) == notsyntaxspec;
*** src/search.c.~1~    2002-05-12 19:04:16.000000000 -0500
--- src/search.c        2004-04-29 17:30:17.000000000 -0500
***************
*** 962,968 ****
            {
            case '|': case '(': case ')': case '`': case '\'': case 'b':
            case 'B': case '<': case '>': case 'w': case 'W': case 's':
!           case 'S': case '=': case '{': case '}':
            case 'c': case 'C': /* for categoryspec and notcategoryspec */
            case '1': case '2': case '3': case '4': case '5':
            case '6': case '7': case '8': case '9':
--- 962,968 ----
            {
            case '|': case '(': case ')': case '`': case '\'': case 'b':
            case 'B': case '<': case '>': case 'w': case 'W': case 's':
!           case 'S': case '=': case '{': case '}': case '_':
            case 'c': case 'C': /* for categoryspec and notcategoryspec */
            case '1': case '2': case '3': case '4': case '5':
            case '6': case '7': case '8': case '9':
*** man/search.texi.~1~ 2002-07-06 08:44:06.000000000 -0500
--- man/search.texi     2004-04-29 17:38:41.000000000 -0500
***************
*** 672,677 ****
--- 672,689 ----
  @item \W
  matches any character that is not a word-constituent.
  
+ @item \_<
+ matches the empty string, but only at the beginning of a symbol.  A
+ symbol is a sequence of one or more word or symbol constituent
+ characters.  @samp{\_<} matches at the beginning of the buffer only if
+ a symbol-constituent character follows.
+ 
+ @item \_>
+ matches the empty string, but only at the end of a symbol.  A symbol
+ is a sequence of one or more word or symbol constituent characters.
+ @samp{\_>} matches at the end of the buffer only if the contents end
+ with a symbol-constituent character.
+ 
  @item \s@var{c}
  matches any character whose syntax is @var{c}.  Here @var{c} is a
  character that designates a particular syntax class: thus, @samp{w}
*** searching.texi.~1.48.~      2004-02-16 20:09:15.000000000 -0500
--- searching.texi      2004-05-05 01:12:38.000000000 -0500
***************
*** 666,671 ****
--- 666,686 ----
  with a word-constituent character.
  @end table
  
+ @item \_<
+ @cindex @samp{\_<} in regexp
+ @cindex symbols, matching in regexp
+ matches the empty string, but only at the beginning of a symbol.  A
+ symbol is a sequence of one or more word or symbol constituent
+ characters.  @samp{\_<} matches at the beginning of the buffer (or
+ string) only if a symbol-constituent character follows.
+ 
+ @item \_>
+ @cindex @samp{\_>} in regexp
+ matches the empty string, but only at the end of a symbol.  A symbol
+ is a sequence of one or more word or symbol constituent characters.
+ @samp{\_>} matches at the end of the buffer (or string) only if the
+ contents end with a symbol-constituent character.
+ 
  @kindex invalid-regexp
    Not every string is a valid regular expression.  For example, a string
  with unbalanced square brackets is invalid (with a few exceptions, such
*** etc/NEWS.~1.950.~   2004-04-27 17:02:27.000000000 -0500
--- etc/NEWS    2004-05-04 14:15:33.000000000 -0500
***************
*** 90,95 ****
--- 90,101 ----
  
  * Changes in Emacs 21.4
  
+ +++
+ ** There are now two new regular expression operators, \_< and \_>,
+ for matching the beginning and end of a symbol.  A symbol is a
+ non-empty sequence of either word or symbol constituent characters, as
+ specified by the syntax table.
+ 
  ---
  ** The IELM prompt is now, by default, read-only.  This can be
  controlled with the new user option `ielm-prompt-read-only'.





reply via email to

[Prev in Thread] Current Thread [Next in Thread]