Path: usenet.cise.ufl.edu!usenet.eel.ufl.edu!www.nntp.primenet.com!nntp.primenet.com!feed1.news.erols.com!howland.erols.net!newsfeed.internetmci.com!in2.uu.net!192.108.254.3!news.teleport.com!not-for-mail
From: chip@rio.atlantic.net (Chip Salzenberg)
Newsgroups: comp.lang.perl.announce,comp.lang.perl.misc
Subject: Patch to Perl 5.004 for case-insensitive patterns (REFCASE1)
Followup-To: comp.lang.perl.misc
Date: 20 May 1997 17:55:02 GMT
Organization: Internet Connect Company +1(352)375-2912, USA
Lines: 181
Sender: news-merlyn@gadget.cscaper.com
Approved: merlyn@stonehenge.com (comp.lang.perl.announce)
Message-ID: <5lsohm$m4t$1@nadine.teleport.com>
NNTP-Posting-Host: gadget.cscaper.com
X-Disclaimer: The "Approved" header verifies header information for article transmission and does not imply approval of content.
Xref: usenet.cise.ufl.edu comp.lang.perl.announce:178 comp.lang.perl.misc:27731

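In a case-insensitive pattern match (//i), back-references to previous
parenthesized subpatterns (e.g. the \1 in "/(\w+)\s+\1/i") should be
case-insensitive as well.  In Perl 5.004, they aren't: the pattern
above fails to match "Foo foo", for example.  This patch fixes that
problem by adding two regex opcodes next to the existing REF: REFF
(back-reference with case folding) and REFFL (the same, folded
according to the current locale).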

Index: patchlevel.h
***************
*** 41,42 ****
--- 41,43 ----
+ 	,"REFCASE1 - fix for case-insensitivity in regex backreferences"
  	,NULL
  };

Index: regcomp.h
***************
*** 73,95 ****
  #define NBOUND	22	/* no	Match "" at any word non-boundary */
  #define NBOUNDL	23	/* no	Match "" at any word non-boundary */
! #define REF	24	/* num	Match some already matched string */
! #define	OPEN	25	/* num	Mark this point in input as start of #n. */
! #define	CLOSE	26	/* num	Analogous to OPEN. */
! #define MINMOD	27	/* no	Next operator is not greedy. */
! #define GPOS	28	/* no	Matches where last m//g left off. */
! #define IFMATCH	29	/* no	Succeeds if the following matches. */
! #define UNLESSM	30	/* no	Fails if the following matches. */
! #define SUCCEED	31	/* no	Return from a subroutine, basically. */
! #define WHILEM	32	/* no	Do curly processing and see if rest matches. */
! #define ALNUM	33	/* no	Match any alphanumeric character */
! #define ALNUML	34 	/* no	Match any alphanumeric char in locale */
! #define NALNUM	35	/* no	Match any non-alphanumeric character */
! #define NALNUML	36	/* no	Match any non-alphanumeric char in locale */
! #define SPACE	37	/* no	Match any whitespace character */
! #define SPACEL	38	/* no	Match any whitespace char in locale */
! #define NSPACE	39	/* no	Match any non-whitespace character */
! #define NSPACEL	40	/* no	Match any non-whitespace char in locale */
! #define DIGIT	41	/* no	Match any numeric character */
! #define NDIGIT	42	/* no	Match any non-numeric character */
  
  /*
--- 73,97 ----
  #define NBOUND	22	/* no	Match "" at any word non-boundary */
  #define NBOUNDL	23	/* no	Match "" at any word non-boundary */
! #define REF	24	/* num	Match already matched string */
! #define REFF	25	/* num	Match already matched string, folded */
! #define REFFL	26	/* num	Match already matched string, folded in loc. */
! #define	OPEN	27	/* num	Mark this point in input as start of #n. */
! #define	CLOSE	28	/* num	Analogous to OPEN. */
! #define MINMOD	29	/* no	Next operator is not greedy. */
! #define GPOS	30	/* no	Matches where last m//g left off. */
! #define IFMATCH	31	/* no	Succeeds if the following matches. */
! #define UNLESSM	32	/* no	Fails if the following matches. */
! #define SUCCEED	33	/* no	Return from a subroutine, basically. */
! #define WHILEM	34	/* no	Do curly processing and see if rest matches. */
! #define ALNUM	35	/* no	Match any alphanumeric character */
! #define ALNUML	36 	/* no	Match any alphanumeric char in locale */
! #define NALNUM	37	/* no	Match any non-alphanumeric character */
! #define NALNUML	38	/* no	Match any non-alphanumeric char in locale */
! #define SPACE	39	/* no	Match any whitespace character */
! #define SPACEL	40	/* no	Match any whitespace char in locale */
! #define NSPACE	41	/* no	Match any non-whitespace character */
! #define NSPACEL	42	/* no	Match any non-whitespace char in locale */
! #define DIGIT	43	/* no	Match any numeric character */
! #define NDIGIT	44	/* no	Match any non-numeric character */
  
  /*
***************
*** 122,126 ****
      /*CURLY*/ 4, /*CURLYX*/ 4,
      0,0,0,0,0,0,0,0,0,0,0,0,
!     /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  };
--- 124,128 ----
      /*CURLY*/ 4, /*CURLYX*/ 4,
      0,0,0,0,0,0,0,0,0,0,0,0,
!     /*REF*/ 2, 2, 2, /*OPEN*/ 2, /*CLOSE*/ 2,
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  };
***************
*** 156,159 ****
--- 158,163 ----
  	NBOUND,
  	REF,
+ 	REF,
+ 	REF,
  	OPEN,
  	CLOSE,
***************
*** 182,186 ****
  #else
  EXT char varies[] = {
!     BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, WHILEM, 0
  };
  #endif
--- 186,190 ----
  #else
  EXT char varies[] = {
!     BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, WHILEM, 0
  };
  #endif

Index: regcomp.c
*************** tryagain:
*** 904,908 ****
  		else {
  		    regsawback = 1;
! 		    ret = reganode(REF, num);
  		    *flagp |= HASWIDTH;
  		    while (isDIGIT(*regparse))
--- 904,910 ----
  		else {
  		    regsawback = 1;
! 		    ret = reganode((regflags & PMf_FOLD)
! 				   ? ((regflags & PMf_LOCALE) ? REFFL : REFF)
! 				   : REF, num);
  		    *flagp |= HASWIDTH;
  		    while (isDIGIT(*regparse))
*************** char *op;
*** 1667,1670 ****
--- 1669,1678 ----
      case REF:
  	sv_catpvf(sv, "REF%d", ARG1(op));
+ 	break;
+     case REFF:
+ 	sv_catpvf(sv, "REFF%d", ARG1(op));
+ 	break;
+     case REFFL:
+ 	sv_catpvf(sv, "REFFL%d", ARG1(op));
  	break;
      case OPEN:

Index: regexec.c
*************** char *prog;
*** 826,830 ****
--- 826,834 ----
  	    nextchar = UCHARAT(++locinput);
  	    break;
+ 	case REFFL:
+ 	    regtainted = TRUE;
+ 	    /* FALL THROUGH */
  	case REF:
+ 	case REFF:
  	    n = ARG1(scan);  /* which paren pair */
  	    s = regstartp[n];
*************** char *prog;
*** 836,845 ****
  		break;
  	    /* Inline the first character, for speed. */
! 	    if (UCHARAT(s) != nextchar)
  		sayNO;
  	    ln = regendp[n] - s;
  	    if (locinput + ln > regeol)
  		sayNO;
! 	    if (ln > 1 && memNE(s, locinput, ln))
  		sayNO;
  	    locinput += ln;
--- 840,856 ----
  		break;
  	    /* Inline the first character, for speed. */
! 	    if (UCHARAT(s) != nextchar &&
! 		(OP(scan) == REF ||
! 		 (UCHARAT(s) != ((OP(scan) == REFF
! 				 ? fold : fold_locale)[nextchar]))))
  		sayNO;
  	    ln = regendp[n] - s;
  	    if (locinput + ln > regeol)
  		sayNO;
! 	    if (ln > 1 && (OP(scan) == REF
! 			   ? memNE(s, locinput, ln)
! 			   : (OP(scan) == REFF
! 			      ? ibcmp(s, locinput, ln)
! 			      : ibcmp_locale(s, locinput, ln))))
  		sayNO;
  	    locinput += ln;

-- 
Chip Salzenberg          - a.k.a. -            <chip@pobox.com>
    "Most organizations reward individuals and groups that
     choose to re-invent the wheel."  -- Bjarne Stroustrup


