1 #ParseMaster (July 25 2005)
\r
2 # Based on "ParseMaster.js" by Dean Edwards <http://dean.edwards.name/>
\r
3 # Ported to Perl by Rob Seiler, ELR Software Pty Ltd <http://www.elr.com.au>
\r
4 # Copyright 2005. License <http://creativecommons.org/licenses/LGPL/2.1/>
\r
10 # Package wide variable declarations
\r
11 use vars qw/$VERSION
\r
12 @_X_escaped @_X_patterns
\r
18 my $X_EXPRESSION = 0;
\r
19 my $X_REPLACEMENT = 1;
\r
22 # re's used to determine nesting levels
\r
23 my $X_GROUPS = qr/\(/o; # NB: Requires g modifier!
\r
24 my $X_SUB_REPLACE = qr/\$\d/o;
\r
25 my $X_INDEXED = qr/^\$\d+$/o;
\r
26 my $XX_ESCAPE = qr/\\./o; # NB: Requires g modifier!
\r
27 my $XX_DELETED = qr/\001[^\001]*\001/o; # NB: Requires g modifier!
\r
28 my $DIGIT = qr/[^\D]/o; # Yep - this is a digit - contains no non-digits
\r
34 @_X_escaped = (); # Re-initialize global for each instance
\r
35 @_X_patterns = (); # Re-initialize global for each instance
\r
36 # Instance variables - access by similarly named set/get functions
\r
37 $self->{_ignoreCase_} = 0;
\r
38 $self->{_escapeChar_} = '';
\r
39 bless ($self, $class);
\r
44 my ($self, $value) = @_;
\r
45 if (defined($value)) {
\r
46 $self->{_ignoreCase_} = $value;
\r
48 return $self->{_ignoreCase_};
\r
52 my ($self, $value) = @_;
\r
53 if (defined($value)) {
\r
54 $self->{_escapeChar_} = $value;
\r
56 return $self->{_escapeChar_};
\r
59 #######################
\r
60 # Public Parsemaster functions
\r
62 my $X_DELETE = sub(@$) {
\r
65 return (chr(001) . $X_match[$X_offset] . chr(001));
\r
66 }; # NB semicolon required for closure!
\r
68 # create and add a new pattern to the patterns collection
\r
70 my ($self, $expression, $X_replacement) = @_;
\r
71 if (!$X_replacement) {$X_replacement = $X_DELETE};
\r
73 # count the number of sub-expressions
\r
74 my $temp = &_X_internalEscape($expression);
\r
75 my $length = 1; # Always at least one because each pattern is itself a sub-expression
\r
76 $length += $temp =~ s/$X_GROUPS//g; # One way to count the left capturing parentheses in the regexp string
\r
78 # does the pattern deal with sub-expressions?
\r
79 if ((ref($X_replacement) ne "CODE") && ($X_replacement =~ m/$X_SUB_REPLACE/)) {
\r
80 if ($X_replacement =~ m/$X_INDEXED/) { # a simple lookup? (eg "$2")
\r
81 # store the index (used for fast retrieval of matched strings)
\r
82 $X_replacement = substr($X_replacement,1) - 1;
\r
84 else { # a complicated lookup (eg "Hello $2 $1")
\r
86 while ($i) { # Had difficulty getting Perl to do Dean's splitting and joining of strings containing $'s
\r
87 my $str = '$a[$o+' . ($i-1) . ']'; # eg $a[$o+1]
\r
88 $X_replacement =~ s/\$$i/$str/; # eg $2 $3 -> $a[$o+1] $a[$o+2]
\r
91 # build a function to do the lookup - returns interpolated string of array lookups
\r
92 $X_replacement = eval('sub {my $o=pop; my @a=@_; return "' . $X_replacement . '"};');
\r
96 # pass the modified arguments
\r
97 &_X_add($expression || q/^$/, $X_replacement, $length);
\r
100 # execute the global replacement
\r
102 #print Dumper(@_X_patterns);
\r
103 my ($self, $X_string) = @_;
\r
104 my $escChar = $self->escapeChar();
\r
105 my $ignoreCase = $self->ignoreCase();
\r
106 my ($regexp,$captures) = &_getPatterns(); # Concatenated and parenthesized regexp eg '(regex1)|(regex2)|(regex3)' etc
\r
107 $X_string = &_X_escape($X_string, $escChar);
\r
108 if ($ignoreCase) {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/gie} # Pass $X_String as a
\r
109 else {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/ge} # reference for speed
\r
111 $X_string = &_X_unescape($X_string, $escChar);
\r
112 $X_string =~ s/$XX_DELETED//g;
\r
117 push (@_X_patterns, [@_]); # Save each argument set as is into an array of arrays
\r
120 # this is the global replace function (it's quite complicated)
\r
121 sub _X_replacement {
\r
122 my (@arguments) = @_;
\r
123 #print Dumper (@arguments);
\r
124 if ($arguments[0] le '') {return ''}
\r
125 # Dereference last index (source String) here - faster than in _matchVars (maybe not needed at all?)
\r
126 $arguments[$#arguments] = ${$arguments[$#arguments]};
\r
128 # loop through the patterns
\r
129 for (my $j=0; $j<scalar(@_X_patterns); $j++) { # Loop through global all @_X_patterns
\r
130 my @X_pattern = @{$_X_patterns[$j]};
\r
131 # do we have a result? NB: "if ($arguments[$i])" as in Dean's Javascript is false for the value 0!!!
\r
132 if ((defined $arguments[$i]) && ($arguments[$i] gt '')) {
\r
133 my $X_replacement = $X_pattern[$X_REPLACEMENT];
\r
134 # switch on type of $replacement
\r
135 if (ref($X_replacement) eq "CODE") { # function
\r
136 return &$X_replacement(@arguments,$i);
\r
138 elsif ($X_replacement =~ m/$DIGIT/) { # number (contains no non-digits)
\r
139 return $arguments[$X_replacement + $i];
\r
142 return $X_replacement; # default
\r
144 } # skip over references to sub-expressions
\r
145 else {$i += $X_pattern[$X_LENGTH]}
\r
149 #######################
\r
150 # Private functions
\r
151 #######################
\r
153 # encode escaped characters
\r
155 my ($X_string, $X_escapeChar) = @_;
\r
156 if ($X_escapeChar) {
\r
157 my $re = '\\'.$X_escapeChar.'(.)';
\r
158 $X_string =~ s/$re/{push(@_X_escaped,$1); $X_escapeChar}/ge;
\r
163 # decode escaped characters
\r
165 my ($X_string, $X_escapeChar) = @_;
\r
166 if ($X_escapeChar) { # We'll only do this if there is an $X_escapeChar!
\r
167 my $re = '\\'.$X_escapeChar;
\r
168 $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped))}/ge; # Don't use Dean Edwards as below 'or' here - because zero will return ''!
\r
169 # $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped) || '')}/ge;
\r
174 sub _X_internalEscape {
\r
175 my ($string) = shift;
\r
176 $string =~ s/$XX_ESCAPE//g;
\r
180 # Builds an array of match variables to (approximately) emulate that available in Javascript String.replace()
\r
182 my ($m,$sref) = @_;
\r
183 my @args = (1..$m); # establish the number potential memory variables
\r
184 my @mv = map {eval("\$$_")} @args; # matchvarv[1..m] = the memory variables $1 .. $m
\r
185 unshift (@mv, $&); # matchvar[0] = the substring that matched
\r
186 push (@mv, length($`)); # matchvar[m+1] = offset within the source string where the match occurred (= length of prematch string)
\r
187 push (@mv, $sref); # matchvar[m+2] = reference to full source string (dereference in caller if/when needed)
\r
188 #print Dumper (@mv);
\r
195 for (my $i=0; $i<scalar(@_X_patterns); $i++) { # Loop through global all @_patterns
\r
196 push (@Patterns, $_X_patterns[$i][$X_EXPRESSION]); # accumulate the expressions
\r
197 $lcp += $_X_patterns[$i][$X_LENGTH]; # sum the left capturing parenthesis counts
\r
199 my $str = "(" . join(')|(',@Patterns). ")"; # enclose each pattern in () separated by "|"
\r
200 return ($str, $lcp);
\r