src/ChangeLog:
2004-04-29  Jim Blandy  <address@hidden>

	Add support for new '\_<' and '\_>' regexp operators, matching the
	beginnings and ends of symbols.
	* regex.c (enum syntaxcode): Add Ssymbol.
	(init_syntax_once): Set the syntax for '_' to Ssymbol, not Sword.
	(symbeg, symend): New opcodes.
	(print_partial_compiled_pattern): Print the new opcodes properly.
	(regex_compile): Parse the new operators.
	(analyze_first): symbeg and symend match only the empty string.
	(mutually_exclusive_p): symend is mutually exclusive with \s_ and
	\sw; symbeg is mutually exclusive with \S_ and \Sw.
	(re_match_2_internal): Add code for symbeg and symend.
	* search.c (trivial_regexp_p): \_ is no longer a trivial regexp.

man/ChangeLog:
2004-04-29  Jim Blandy  <address@hidden>

	* search.texi (Regexps): Document the \_< and \_> regexp operators.

lispref/ChangeLog:
2004-05-04  Jim Blandy  <address@hidden>

	* searching.texi (Regexp Backslash): Document new \_< and \_>
	operators.

*** src/regex.c.~2~	2004-04-29 15:56:53.000000000 -0500
--- src/regex.c	2004-04-29 17:44:24.000000000 -0500
***************
*** 219,225 ****
  /* Define the syntax stuff for \<, \>, etc.  */
  
  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
! enum syntaxcode { Swhitespace = 0, Sword = 1 };
  
  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
--- 219,225 ----
  /* Define the syntax stuff for \<, \>, etc.  */
  
  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
! enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
  
  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
***************
*** 399,405 ****
       if (ISALNUM (c))
  	re_syntax_table[c] = Sword;
  
!    re_syntax_table['_'] = Sword;
  
     done = 1;
  }
--- 399,405 ----
       if (ISALNUM (c))
  	re_syntax_table[c] = Sword;
  
!    re_syntax_table['_'] = Ssymbol;
  
     done = 1;
  }
***************
*** 656,661 ****
--- 656,664 ----
    wordbound,	/* Succeeds if at a word boundary.  */
    notwordbound,	/* Succeeds if not at a word boundary.	*/
  
+   symbeg,       /* Succeeds if at symbol beginning.  */
+   symend,       /* Succeeds if at symbol end.  */
+ 
  	/* Matches any character whose syntax is specified.  Followed by
  	   a byte which contains a syntax code, e.g., Sword.  */
    syntaxspec,
***************
*** 1095,1100 ****
--- 1098,1111 ----
  	case wordend:
  	  printf ("/wordend");
  
+ 	case symbeg:
+ 	  printf ("/symbeg");
+ 	  break;
+ 
+ 	case symend:
+ 	  printf ("/symend");
+ 	  break;
+ 
  	case syntaxspec:
  	  printf ("/syntaxspec");
  	  mcnt = *p++;
***************
*** 3135,3140 ****
--- 3145,3163 ----
  	      BUF_PUSH (wordend);
  	      break;
  
+ 	    case '_':
+ 	      if (syntax & RE_NO_GNU_OPS)
+ 		goto normal_char;
+               laststart = b;
+               PATFETCH (c);
+               if (c == '<')
+                 BUF_PUSH (symbeg);
+               else if (c == '>')
+                 BUF_PUSH (symend);
+               else
+                 FREE_STACK_RETURN (REG_BADPAT);
+               break;
+ 
  	    case 'b':
  	      if (syntax & RE_NO_GNU_OPS)
  		goto normal_char;
***************
*** 3629,3634 ****
--- 3652,3659 ----
  	case notwordbound:
  	case wordbeg:
  	case wordend:
+ 	case symbeg:
+ 	case symend:
  	  continue;
  
  
***************
*** 4396,4409 ****
        break;
  
      case wordend:
!     case notsyntaxspec:
        return ((re_opcode_t) *p1 == syntaxspec
! 	      && p1[1] == (op2 == wordend ? Sword : p2[1]));
  
      case wordbeg:
!     case syntaxspec:
        return ((re_opcode_t) *p1 == notsyntaxspec
! 	      && p1[1] == (op2 == wordbeg ? Sword : p2[1]));
  
      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
--- 4421,4440 ----
        break;
  
      case wordend:
!       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
!     case symend:
        return ((re_opcode_t) *p1 == syntaxspec
!               && (p1[1] == Ssymbol || p1[1] == Sword));
!     case notsyntaxspec:
!       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
  
      case wordbeg:
!       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
!     case symbeg:
        return ((re_opcode_t) *p1 == notsyntaxspec
!               && (p1[1] == Ssymbol || p1[1] == Sword));
!     case syntaxspec:
!       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
  
      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
***************
*** 5528,5533 ****
--- 5559,5650 ----
  	    }
  	  break;
  
+ 	case symbeg:
+ 	  DEBUG_PRINT1 ("EXECUTING symbeg.\n");
+ 
+ 	  /* We FAIL in one of the following cases: */
+ 
+ 	  /* Case 1: D is at the end of string.	 */
+ 	  if (AT_STRINGS_END (d))
+ 	    goto fail;
+ 	  else
+ 	    {
+ 	      /* C1 is the character before D, S1 is the syntax of C1, C2
+ 		 is the character at D, and S2 is the syntax of C2.  */
+ 	      re_wchar_t c1, c2;
+ 	      int s1, s2;
+ #ifdef emacs
+ 	      int offset = PTR_TO_OFFSET (d);
+ 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+ 	      UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+ 	      PREFETCH ();
+ 	      c2 = RE_STRING_CHAR (d, dend - d);
+ 	      s2 = SYNTAX (c2);
+ 	
+ 	      /* Case 2: S2 is neither Sword nor Ssymbol. */
+ 	      if (s2 != Sword && s2 != Ssymbol)
+ 		goto fail;
+ 
+ 	      /* Case 3: D is not at the beginning of string ... */
+ 	      if (!AT_STRINGS_BEG (d))
+ 		{
+ 		  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ #ifdef emacs
+ 		  UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+ #endif
+ 		  s1 = SYNTAX (c1);
+ 
+ 		  /* ... and S1 is Sword or Ssymbol.  */
+ 		  if (s1 == Sword || s1 == Ssymbol)
+ 		    goto fail;
+ 		}
+ 	    }
+ 	  break;
+ 
+ 	case symend:
+ 	  DEBUG_PRINT1 ("EXECUTING symend.\n");
+ 
+ 	  /* We FAIL in one of the following cases: */
+ 
+ 	  /* Case 1: D is at the beginning of string.  */
+ 	  if (AT_STRINGS_BEG (d))
+ 	    goto fail;
+ 	  else
+ 	    {
+ 	      /* C1 is the character before D, S1 is the syntax of C1, C2
+ 		 is the character at D, and S2 is the syntax of C2.  */
+ 	      re_wchar_t c1, c2;
+ 	      int s1, s2;
+ #ifdef emacs
+ 	      int offset = PTR_TO_OFFSET (d) - 1;
+ 	      int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+ 	      UPDATE_SYNTAX_TABLE (charpos);
+ #endif
+ 	      GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+ 	      s1 = SYNTAX (c1);
+ 
+ 	      /* Case 2: S1 is neither Ssymbol nor Sword.  */
+ 	      if (s1 != Sword && s1 != Ssymbol)
+ 		goto fail;
+ 
+ 	      /* Case 3: D is not at the end of string ... */
+ 	      if (!AT_STRINGS_END (d))
+ 		{
+ 		  PREFETCH_NOLIMIT ();
+ 		  c2 = RE_STRING_CHAR (d, dend - d);
+ #ifdef emacs
+ 		  UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ #endif
+ 		  s2 = SYNTAX (c2);
+ 
+ 		  /* ... and S2 is Sword or Ssymbol.  */
+ 		  if (s2 == Sword || s2 == Ssymbol)
+                     goto fail;
+ 		}
+ 	    }
+ 	  break;
+ 
  	case syntaxspec:
  	case notsyntaxspec:
  	  not = (re_opcode_t) *(p - 1) == notsyntaxspec;
*** src/search.c.~1~	2002-05-12 19:04:16.000000000 -0500
--- src/search.c	2004-04-29 17:30:17.000000000 -0500
***************
*** 962,968 ****
  	    {
  	    case '|': case '(': case ')': case '`': case '\'': case 'b':
  	    case 'B': case '<': case '>': case 'w': case 'W': case 's':
! 	    case 'S': case '=': case '{': case '}':
  	    case 'c': case 'C':	/* for categoryspec and notcategoryspec */
  	    case '1': case '2': case '3': case '4': case '5':
  	    case '6': case '7': case '8': case '9':
--- 962,968 ----
  	    {
  	    case '|': case '(': case ')': case '`': case '\'': case 'b':
  	    case 'B': case '<': case '>': case 'w': case 'W': case 's':
! 	    case 'S': case '=': case '{': case '}': case '_':
  	    case 'c': case 'C':	/* for categoryspec and notcategoryspec */
  	    case '1': case '2': case '3': case '4': case '5':
  	    case '6': case '7': case '8': case '9':
*** man/search.texi.~1~	2002-07-06 08:44:06.000000000 -0500
--- man/search.texi	2004-04-29 17:38:41.000000000 -0500
***************
*** 672,677 ****
--- 672,689 ----
  @item \W
  matches any character that is not a word-constituent.
  
+ @item \_<
+ matches the empty string, but only at the beginning of a symbol.  A
+ symbol is a sequence of one or more word or symbol constituent
+ characters.  @samp{\_<} matches at the beginning of the buffer only if
+ a word or symbol constituent character follows.
+ 
+ @item \_>
+ matches the empty string, but only at the end of a symbol.  A symbol
+ is a sequence of one or more word or symbol constituent characters.
+ @samp{\_>} matches at the end of the buffer only if the contents end
+ with a word or symbol constituent character.
+ 
  @item \s@var{c}
  matches any character whose syntax is @var{c}.  Here @var{c} is a
  character that designates a particular syntax class: thus, @samp{w}
*** lispref/searching.texi.~1.48.~	2004-02-16 20:09:15.000000000 -0500
--- lispref/searching.texi	2004-05-05 01:12:38.000000000 -0500
***************
*** 666,671 ****
--- 666,686 ----
  with a word-constituent character.
+ 
+ @item \_<
+ @cindex @samp{\_<} in regexp
+ @cindex symbols, matching in regexp
+ matches the empty string, but only at the beginning of a symbol.  A
+ symbol is a sequence of one or more word or symbol constituent
+ characters.  @samp{\_<} matches at the beginning of the buffer (or
+ string) only if a word or symbol constituent character follows.
+ 
+ @item \_>
+ @cindex @samp{\_>} in regexp
+ matches the empty string, but only at the end of a symbol.  A symbol
+ is a sequence of one or more word or symbol constituent characters.
+ @samp{\_>} matches at the end of the buffer (or string) only if the
+ contents end with a word or symbol constituent character.
  @end table
  
  @kindex invalid-regexp
    Not every string is a valid regular expression.  For example, a string
  with unbalanced square brackets is invalid (with a few exceptions, such
*** etc/NEWS.~1.950.~	2004-04-27 17:02:27.000000000 -0500
--- etc/NEWS	2004-05-04 14:15:33.000000000 -0500
***************
*** 90,95 ****
--- 90,101 ----
  
  * Changes in Emacs 21.4
  
+ +++
+ ** There are now two new regular expression operators, \_< and \_>,
+ for matching the beginning and end of a symbol.  A symbol is a
+ non-empty sequence of either word or symbol constituent characters, as
+ specified by the syntax table.
+ 
  ---
  ** The IELM prompt is now, by default, read-only.  This can be
  controlled with the new user option `ielm-prompt-read-only'.