1N/A# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 1N/A# FOR FULL DOCUMENTATION SEE Balanced.pod 1N/A# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 1N/A #REARRANGE HERE DOC AND FILLET IF POSSIBLE 1N/A eval {
substr($$
textref,$_[
4],$_[
1]+$_[
5])=$
extra} ;
#CHOP OUT PREFIX & MATCH, IF POSSIBLE 1N/A# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 1N/A push @
pat,
"$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
1N/A push @
pat,
"$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
1N/A# THE EXTRACTION FUNCTIONS 1N/A my $
del =
defined $_[
1] ? $_[
1] :
qq{\
'\"\`}; 1N/A my $
pre =
defined $_[
2] ? $_[
2] :
'\s*';
1N/A my $
esc =
defined $_[
3] ? $_[
3] :
qq{\\};
1N/A my $
ldel =
defined $_[
1] ? $_[
1] :
'{([<';
1N/A my $
pre =
defined $_[
2] ? $_[
2] :
'\s*';
1N/A "Did not find a suitable bracket in delimiter: \"$_[1]\"",
1N/A $
ldel =
join(
'|',
map {
quotemeta $_ }
split(
'', $
ldel));
1N/A $
rdel =
join(
'|',
map {
quotemeta $_ }
split(
'', $
rdel));
1N/A _failmsg "Did not find opening bracket after prefix: \"$pre\"",
1N/A last if $
#nesting < 0; 1N/A my $
pre =
defined $_[
3] ? $_[
3] :
'\s*';
1N/A @
match[
8..
9,
0..
1,
2..
7];
# REM, PRE, BITS 1N/A _failmsg "Unable to construct closing tag to match: $rdel",
1N/A for (
qw,~ ! ^ & * ) _ + - = } ] :
" ; ' > . ? / | ',) 1N/A croak "Can't interpolate right delimiter $rdel" 1N/A eval "qq$del$rdel$del";
1N/A pos($$
textref) -=
length($
1);
# CUT OFF WHATEVER CAUSED THE SHORTNESS 1N/A my $
pre =
defined $_[
1] ? $_[
1] :
'\s*';
1N/A @
match[
2..
3,
4..
5,
0..
1];
# MATCH, REMAINDER, PREFIX 1N/A unless ($$
textref =~ m{\G\$\s*(?!::)(\d+|[][&`
'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 1N/A unless ($$
textref =~ m/\G((\$
#?|[*\@\%]|\\&)+)/gc) 1N/A qr/[({[]/,
qr/[)}\]]/,
1N/A qr/[({[]/,
qr/[)}\]]/,
0);
1N/A qr/\s*/,
qr/[{[]/,
qr/[}\]]/,
1N/A qr/[{[]/,
qr/[}\]]/,
0);
1N/A my $
pre =
defined $_[
2] ? $_[
2] :
'\s*';
1N/A $_ =
'('.
join(
'|',
map {
quotemeta $_ }
split(
'',$_)).
')' 1N/A @
match[
2..
3,
4..
5,
0..
1]
# MATCH, REMAINDER, PREFIX 1N/A next if $
1 eq '>';
# MIGHT BE A "LESS THAN" 1N/A qq{...
". Expected '$closing'}, 1N/A # NEED TO COVER MANY MORE CASES HERE!!! 1N/A | (\*\*|&&|\|\||<<|>>)=?
1N/A |
split|
grep|
map|
return 1N/A continue { $@ =
undef }
1N/A 'none' =>
'[cgimsox]*',
1N/A 'm' =>
'[cgimsox]*',
1N/A 's' =>
'[cegimsox]*',
1N/A my $
pre =
defined $_[
1] ? $_[
1] :
'\s*';
1N/A unless ($$
textref =~ m{\G(\b(?:m|s|
qq|
qx|
qw|q|
qr|
tr|y)\b(?=\s*\S)|<<)}
gc)
1N/A elsif ($$
textref =~ m{ \G
' ([^'\\]* (?:\\.[^
'\\]*)*) ' 1N/A | \G
" ([^"\\]* (?:\\.[^
"\\]*)*) " 1N/A | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
1N/A _failmsg "No block delimiter found after quotelike $op",
1N/A my $
max =
defined $_[
2] && $_[
2]>
0 ? $_[
2] :
1_000_000_000;
1N/A carp "extract_multiple reset maximal count to 1 in scalar context" 1N/A if $^W &&
defined($_[
2]) && $
max >
1;
1N/A foreach my $i (
0..$
#func ) 1N/A # print "[$field|$rem]" if $field; 1N/A elsif (
ref($
func)
eq 'Text::Balanced::Extractor')
1N/A my $
pre =
defined $_[
2] ? $_[
2] :
'\s*';
1N/A @
match[
8..
9,
0..
1,
2..
7];
# REM, PRE, BITS 1N/Ause overload '""' =>
sub {
"$_[0]->{error}, detected at offset $_[0]->{pos}" };
1N/AText::Balanced - Extract delimited text sequences from strings. 1N/A use Text::Balanced qw ( 1N/A # Extract the initial substring of $text that is delimited by 1N/A # two (unescaped) instances of the first character in $delim. 1N/A ($extracted, $remainder) = extract_delimited($text,$delim); 1N/A # Extract the initial substring of $text that is bracketed 1N/A # with a delimiter(s) specified by $delim (where the string 1N/A # in $delim contains one or more of '(){}[]<>'). 1N/A ($extracted, $remainder) = extract_bracketed($text,$delim); 1N/A # Extract the initial substring of $text that is bounded by 1N/A ($extracted, $remainder) = extract_tagged($text); 1N/A # Extract the initial substring of $text that is bounded by 1N/A # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1N/A ($extracted, $remainder) = 1N/A extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1N/A # Extract the initial substring of $text that represents a 1N/A # Perl "quote or quote-like operation" 1N/A ($extracted, $remainder) = extract_quotelike($text); 1N/A # Extract the initial substring of $text that represents a block 1N/A # of Perl code, bracketed by any of character(s) specified by $delim 1N/A # (where the string $delim contains one or more of '(){}[]<>'). 1N/A ($extracted, $remainder) = extract_codeblock($text,$delim); 1N/A # Extract the initial substrings of $text that would be extracted by 1N/A # one or more sequential applications of the specified functions 1N/A # or regular expressions 1N/A @extracted = extract_multiple($text, 1N/A [ \&extract_bracketed, 1N/A \&extract_quotelike, 1N/A \&some_other_extractor_sub, 1N/A# Create a string representing an optimized pattern (a la Friedl) 1N/A# that matches a substring delimited by any of the specified characters 1N/A# (in this case: any type of quote or a slash) 1N/A $patstring = gen_delimited_pat(q{'"`/}); 1N/A# Generate a reference to an anonymous sub that is just like extract_tagged 1N/A# but pre-compiled and optimized for a specific pair of tags, and consequently 1N/A# much faster (i.e. 3 times faster). It uses qr// for better performance on 1N/A# repeated calls, so it only works under Perl 5.005 or later. 1N/A $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1N/A ($extracted, $remainder) = $extract_head->($text); 1N/AThe various C<extract_...> subroutines may be used to 1N/Aextract a delimited substring, possibly after skipping a 1N/Aspecified prefix string. By default, that prefix is 1N/Aoptional whitespace (C</\s*/>), but you can change it to whatever 1N/Ayou wish (see below). 1N/AThe substring to be extracted must appear at the 1N/Acurrent C<pos> location of the string's variable 1N/A(or at index zero, if no C<pos> position is defined). 1N/AIn other words, the C<extract_...> subroutines I<don't> 1N/Aextract the first occurance of a substring anywhere 1N/Ain a string (like an unanchored regex would). Rather, 1N/Athey extract an occurance of the substring appearing 1N/Aimmediately at the current matching position in the 1N/Astring (like a C<\G>-anchored regex would). 1N/A=head2 General behaviour in list contexts 1N/AIn a list context, all the subroutines return a list, the first three 1N/Aelements of which are always: 1N/AThe extracted string, including the specified delimiters. 1N/AIf the extraction fails an empty string is returned. 1N/AThe remainder of the input string (i.e. the characters after the 1N/Aextracted string). On failure, the entire string is returned. 1N/AThe skipped prefix (i.e. the characters before the extracted string). 1N/AOn failure, the empty string is returned. 1N/ANote that in a list context, the contents of the original input text (the first 1N/Aargument) are not modified in any way. 1N/AHowever, if the input text was passed in a variable, that variable's 1N/AC<pos> value is updated to point at the first character after the 1N/Aextracted text. That means that in a list context the various 1N/Asubroutines can be used much like regular expressions. For example: 1N/A while ( $next = (extract_quotelike($text))[0] ) 1N/A # process next quote-like (in $next) 1N/A=head2 General behaviour in scalar and void contexts 1N/AIn a scalar context, the extracted string is returned, having first been 1N/Aremoved from the input text. Thus, the following code also processes 1N/Aeach quote-like operation, but actually removes them from $text: 1N/A while ( $next = extract_quotelike($text) ) 1N/A # process next quote-like (in $next) 1N/ANote that if the input text is a read-only string (i.e. a literal), 1N/Ano attempt is made to remove the extracted text. 1N/AIn a void context the behaviour of the extraction subroutines is 1N/Aexactly the same as in a scalar context, except (of course) that the 1N/Aextracted substring is not returned. 1N/A=head2 A note about prefixes 1N/APrefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1N/AThis can bite you if you're expecting a prefix specification like 1N/A'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1N/Apattern will only succeed if the <H1> tag is on the current line, since 1N/A. normally doesn't match newlines. 1N/ATo overcome this limitation, you need to turn on /s matching within 1N/Athe prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1N/A=head2 C<extract_delimited> 1N/AThe C<extract_delimited> function formalizes the common idiom 1N/Aof extracting a single-character-delimited substring from the start of 1N/Aa string. For example, to extract a single-quote delimited string, the 1N/Afollowing code is typically used: 1N/A ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1N/Abut with C<extract_delimited> it can be simplified to: 1N/A ($extracted,$remainder) = extract_delimited($text, "'"); 1N/AC<extract_delimited> takes up to four scalars (the input text, the 1N/Adelimiters, a prefix pattern to be skipped, and any escape characters) 1N/Aand extracts the initial substring of the text that 1N/Ais appropriately delimited. If the delimiter string has multiple 1N/Acharacters, the first one encountered in the text is taken to delimit 1N/AThe third argument specifies a prefix pattern that is to be skipped 1N/A(but must be present!) before the substring is extracted. 1N/AThe final argument specifies the escape character to be used for each 1N/AAll arguments are optional. If the escape characters are not specified, 1N/Aevery delimiter is escaped with a backslash (C<\>). 1N/AIf the prefix is not specified, the 1N/Apattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1N/Ais also not specified, the set C</["'`]/> is used. If the text to be processed 1N/Ais not specified either, C<$_> is used. 1N/AIn list context, C<extract_delimited> returns a array of three 1N/Aelements, the extracted substring (I<including the surrounding 1N/Adelimiters>), the remainder of the text, and the skipped prefix (if 1N/Aany). If a suitable delimited substring is not found, the first 1N/Aelement of the array is the empty string, the second is the complete 1N/Aoriginal text, and the prefix returned in the third element is an 1N/AIn a scalar context, just the extracted substring is returned. In 1N/Aa void context, the extracted substring (and any prefix) are simply 1N/Aremoved from the beginning of the first argument. 1N/A # Remove a single-quoted substring from the very beginning of $text: 1N/A $substring = extract_delimited($text, "'", ''); 1N/A # Remove a single-quoted Pascalish substring (i.e. one in which 1N/A # doubling the quote character escapes it) from the very 1N/A # beginning of $text: 1N/A $substring = extract_delimited($text, "'", '', "'"); 1N/A # Extract a single- or double- quoted substring from the 1N/A # beginning of $text, optionally after some whitespace 1N/A # (note the list context to protect $text from modification): 1N/A ($substring) = extract_delimited $text, q{"'}; 1N/A # Delete the substring delimited by the first '/' in $text: 1N/A $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1N/ANote that this last example is I<not> the same as deleting the first 1N/Aquote-like pattern. For instance, if C<$text> contained the string: 1N/A "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1N/Athen after the deletion it would contain: 1N/A "if ('.$UNIXCMD/s) { $cmd = $1; }" 1N/A "if ('./cmd' =~ ms) { $cmd = $1; }" 1N/ASee L<"extract_quotelike"> for a (partial) solution to this problem. 1N/A=head2 C<extract_bracketed> 1N/ALike C<"extract_delimited">, the C<extract_bracketed> function takes 1N/Aup to three optional scalar arguments: a string to extract from, a delimiter 1N/Aspecifier, and a prefix pattern. As before, a missing prefix defaults to 1N/Aoptional whitespace and a missing text defaults to C<$_>. However, a missing 1N/Adelimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1N/AC<extract_bracketed> extracts a balanced-bracket-delimited 1N/Asubstring (using any one (or more) of the user-specified delimiter 1N/Abrackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1N/Arespect quoted unbalanced brackets (see below). 1N/AA "delimiter bracket" is a bracket in list of delimiters passed as 1N/AC<extract_bracketed>'s second argument. Delimiter brackets are 1N/Aspecified by giving either the left or right (or both!) versions 1N/Aof the required bracket(s). Note that the order in which 1N/Atwo or more delimiter brackets are specified is not significant. 1N/AA "balanced-bracket-delimited substring" is a substring bounded by 1N/Amatched brackets, such that any other (left or right) delimiter 1N/Abracket I<within> the substring is also matched by an opposite 1N/A(right or left) delimiter bracket I<at the same level of nesting>. Any 1N/Atype of bracket not in the delimiter list is treated as an ordinary 1N/AIn other words, each type of bracket specified as a delimiter must be 1N/Abalanced and correctly nested within the substring, and any other kind of 1N/A("non-delimiter") bracket in the substring is ignored. 1N/AFor example, given the string: 1N/A $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1N/Athen a call to C<extract_bracketed> in a list context: 1N/A @result = extract_bracketed( $text, '{}' ); 1N/A ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1N/Asince both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1N/A(In a scalar context just the first element of the array would be returned. In 1N/Aa void context, C<$text> would be replaced by an empty string.) 1N/ALikewise the call in: 1N/A @result = extract_bracketed( $text, '{[' ); 1N/Awould return the same result, since all sets of both types of specified 1N/Adelimiter brackets are correctly nested and balanced. 1N/AHowever, the call in: 1N/A @result = extract_bracketed( $text, '{([<' ); 1N/Awould fail, returning: 1N/A ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1N/Abecause the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1N/Athe embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1N/Areturn an empty string. In a void context, C<$text> would be unchanged.) 1N/ANote that the embedded single-quotes in the string don't help in this 1N/Acase, since they have not been specified as acceptable delimiters and are 1N/Atherefore treated as non-delimiter characters (and ignored). 1N/AHowever, if a particular species of quote character is included in the 1N/Adelimiter specification, then that type of quote will be correctly handled. 1N/Afor example, if C<$text> is: 1N/A $text = '<A HREF=">>>>">link</A>'; 1N/A @result = extract_bracketed( $text, '<">' ); 1N/A ( '<A HREF=">>>>">', 'link</A>', "" ) 1N/Aas expected. Without the specification of C<"> as an embedded quoter: 1N/A @result = extract_bracketed( $text, '<>' ); 1N/A ( '<A HREF=">', '>>>">link</A>', "" ) 1N/AIn addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1N/Aquoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1N/Aletter 'q' as a delimiter. Hence: 1N/A @result = extract_bracketed( $text, '<q>' ); 1N/Awould correctly match something like this: 1N/A $text = '<leftop: conj /and/ conj>'; 1N/ASee also: C<"extract_quotelike"> and C<"extract_codeblock">. 1N/A=head2 C<extract_variable> 1N/AC<extract_variable> extracts any valid Perl variable or 1N/Avariable-involved expression, including scalars, arrays, hashes, array 1N/Aaccesses, hash look-ups, method calls through objects, subroutine calles 1N/Athrough subroutine references, etc. 1N/AThe subroutine takes up to two optional arguments: 1N/AA string to be processed (C<$_> if the string is omitted or C<undef>) 1N/AA string specifying a pattern to be matched as a prefix (which is to be 1N/Askipped). If omitted, optional whitespace is skipped. 1N/AOn success in a list context, an array of 3 elements is returned. The 1N/Athe extracted variable, or variablish expression 1N/Athe remainder of the input text, 1N/Athe prefix substring (if any), 1N/AOn failure, all of these values (except the remaining text) are C<undef>. 1N/AIn a scalar context, C<extract_variable> returns just the complete 1N/Asubstring that matched a variablish expression. C<undef> is returned on 1N/Afailure. In addition, the original input text has the returned substring 1N/A(and any prefix) removed from it. 1N/AIn a void context, the input text just has the matched substring (and 1N/Aany specified prefix) removed. 1N/A=head2 C<extract_tagged> 1N/AC<extract_tagged> extracts and segments text between (balanced) 1N/AThe subroutine takes up to five optional arguments: 1N/AA string to be processed (C<$_> if the string is omitted or C<undef>) 1N/AA string specifying a pattern to be matched as the opening tag. 1N/AIf the pattern string is omitted (or C<undef>) then a pattern 1N/Athat matches any standard XML tag is used. 1N/AA string specifying a pattern to be matched at the closing tag. 1N/AIf the pattern string is omitted (or C<undef>) then the closing 1N/Atag is constructed by inserting a C</> after any leading bracket 1N/Acharacters in the actual opening tag that was matched (I<not> the pattern 1N/Athat matched the tag). For example, if the opening tag pattern 1N/Ais specified as C<'{{\w+}}'> and actually matched the opening tag 1N/AC<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1N/AA string specifying a pattern to be matched as a prefix (which is to be 1N/Askipped). If omitted, optional whitespace is skipped. 1N/AA hash reference containing various parsing options (see below) 1N/AThe various options that can be specified are: 1N/A=item C<reject =E<gt> $listref> 1N/AThe list reference contains one or more strings specifying patterns 1N/Athat must I<not> appear within the tagged text. 1N/AFor example, to extract 1N/Aan HTML link (which should not contain nested links) use: 1N/A extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1N/A=item C<ignore =E<gt> $listref> 1N/AThe list reference contains one or more strings specifying patterns 1N/Athat are I<not> be be treated as nested tags within the tagged text 1N/A(even if they would match the start tag pattern). 1N/AFor example, to extract an arbitrary XML tag, but ignore "empty" elements: 1N/A extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1N/A(also see L<"gen_delimited_pat"> below). 1N/A=item C<fail =E<gt> $str> 1N/AThe C<fail> option indicates the action to be taken if a matching end 1N/Atag is not encountered (i.e. before the end of the string or some 1N/AC<reject> pattern matches). By default, a failure to match a closing 1N/Atag causes C<extract_tagged> to immediately fail. 1N/AHowever, if the string value associated with <reject> is "MAX", then 1N/AC<extract_tagged> returns the complete text up to the point of failure. 1N/AIf the string is "PARA", C<extract_tagged> returns only the first paragraph 1N/Aafter the tag (up to the first line that is either empty or contains 1N/Aonly whitespace characters). 1N/AIf the string is "", the the default behaviour (i.e. failure) is reinstated. 1N/AFor example, suppose the start tag "/para" introduces a paragraph, which then 1N/Acontinues until the next "/endpara" tag or until another "/para" tag is 1N/A $text = "/para line 1\n\nline 3\n/para line 4"; 1N/A extract_tagged($text, '/para', '/endpara', undef, 1N/A {reject => '/para', fail => MAX ); 1N/A # EXTRACTED: "/para line 1\n\nline 3\n" 1N/ASuppose instead, that if no matching "/endpara" tag is found, the "/para" 1N/Atag refers only to the immediately following paragraph: 1N/A $text = "/para line 1\n\nline 3\n/para line 4"; 1N/A extract_tagged($text, '/para', '/endpara', undef, 1N/A {reject => '/para', fail => MAX ); 1N/A # EXTRACTED: "/para line 1\n" 1N/ANote that the specified C<fail> behaviour applies to nested tags as well. 1N/AOn success in a list context, an array of 6 elements is returned. The elements are: 1N/Athe extracted tagged substring (including the outermost tags), 1N/Athe remainder of the input text, 1N/Athe prefix substring (if any), 1N/Athe text between the opening and closing tags 1N/Athe closing tag (or "" if no closing tag was found) 1N/AOn failure, all of these values (except the remaining text) are C<undef>. 1N/AIn a scalar context, C<extract_tagged> returns just the complete 1N/Asubstring that matched a tagged text (including the start and end 1N/Atags). C<undef> is returned on failure. In addition, the original input 1N/Atext has the returned substring (and any prefix) removed from it. 1N/AIn a void context, the input text just has the matched substring (and 1N/Aany specified prefix) removed. 1N/A=head2 C<gen_extract_tagged> 1N/A(Note: This subroutine is only available under Perl5.005) 1N/AC<gen_extract_tagged> generates a new anonymous subroutine which 1N/Aextracts text between (balanced) specified tags. In other words, 1N/Ait generates a function identical in function to C<extract_tagged>. 1N/AThe difference between C<extract_tagged> and the anonymous 1N/Asubroutines generated by 1N/AC<gen_extract_tagged>, is that those generated subroutines: 1N/Ado not have to reparse tag specification or parsing options every time 1N/Athey are called (whereas C<extract_tagged> has to effectively rebuild 1N/Aits tag parser on every call); 1N/Amake use of the new qr// construct to pre-compile the regexes they use 1N/A(whereas C<extract_tagged> uses standard string variable interpolation 1N/Ato create tag-matching patterns). 1N/AThe subroutine takes up to four optional arguments (the same set as 1N/AC<extract_tagged> except for the string to be processed). It returns 1N/Aa reference to a subroutine which in turn takes a single argument (the text to 1N/AIn other words, the implementation of C<extract_tagged> is exactly 1N/A $extractor = gen_extract_tagged(@_); 1N/A return $extractor->($text); 1N/A(although C<extract_tagged> is not currently implemented that way, in order 1N/Ato preserve pre-5.005 compatibility). 1N/AUsing C<gen_extract_tagged> to create extraction functions for specific tags 1N/Ais a good idea if those functions are going to be called more than once, since 1N/Atheir performance is typically twice as good as the more general-purpose 1N/A=head2 C<extract_quotelike> 1N/AC<extract_quotelike> attempts to recognize, extract, and segment any 1N/Aone of the various Perl quotes and quotelike operators (see 1N/AL<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1N/Adelimiters (for the quotelike operators), and trailing modifiers are 1N/Aall caught. For example, in: 1N/A extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1N/A extract_quotelike ' "You said, \"Use sed\"." ' 1N/A extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1N/A extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1N/Athe full Perl quotelike operations are all extracted correctly. 1N/ANote too that, when using the /x modifier on a regex, any comment 1N/Acontaining the current pattern delimiter will cause the regex to be 1N/Aimmediately terminated. In other words: 1N/A (?i) # CASE INSENSITIVE 1N/A [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1N/Awill be extracted as if it were: 1N/A (?i) # CASE INSENSITIVE 1N/A [a-z_] # LEADING ALPHABETIC/' 1N/AThis behaviour is identical to that of the actual compiler. 1N/AC<extract_quotelike> takes two arguments: the text to be processed and 1N/Aa prefix to be matched at the very beginning of the text. If no prefix 1N/Ais specified, optional whitespace is the default. If no text is given, 1N/AIn a list context, an array of 11 elements is returned. The elements are: 1N/Athe extracted quotelike substring (including trailing modifiers), 1N/Athe remainder of the input text, 1N/Athe prefix substring (if any), 1N/Athe name of the quotelike operator (if any), 1N/Athe left delimiter of the first block of the operation, 1N/Athe text of the first block of the operation 1N/A(that is, the contents of 1N/Aa quote, the regex of a match or substitution or the target list of a 1N/Athe right delimiter of the first block of the operation, 1N/Athe left delimiter of the second block of the operation 1N/A(that is, if it is a C<s>, C<tr>, or C<y>), 1N/Athe text of the second block of the operation 1N/A(that is, the replacement of a substitution or the translation list 1N/Athe right delimiter of the second block of the operation (if any), 1N/Athe trailing modifiers on the operation (if any). 1N/AFor each of the fields marked "(if any)" the default value on success is 1N/AOn failure, all of these values (except the remaining text) are C<undef>. 1N/AIn a scalar context, C<extract_quotelike> returns just the complete substring 1N/Athat matched a quotelike operation (or C<undef> on failure). In a scalar or 1N/Avoid context, the input text has the same substring (and any specified 1N/A # Remove the first quotelike literal that appears in text 1N/A $quotelike = extract_quotelike($text,'.*?'); 1N/A # Replace one or more leading whitespace-separated quotelike 1N/A # literals in $_ with "<QLL>" 1N/A do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1N/A # Isolate the search pattern in a quotelike operation from $text 1N/A ($op,$pat) = (extract_quotelike $text)[3,5]; 1N/A print "search pattern: $pat\n"; 1N/A print "$op is not a pattern matching operation\n"; 1N/A=head2 C<extract_quotelike> and "here documents" 1N/AC<extract_quotelike> can successfully extract "here documents" from an input 1N/Astring, but with an important caveat in list contexts. 1N/AUnlike other types of quote-like literals, a here document is rarely 1N/Aa contiguous substring. For example, a typical piece of code using 1N/Ahere document might look like this: 1N/A This is the message. 1N/AGiven this as an input string in a scalar context, C<extract_quotelike> 1N/Awould correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1N/Aleaving the string " || die;\nexit;" in the original variable. In other words, 1N/Athe two separate pieces of the here document are successfully extracted and 1N/AIn a list context, C<extract_quotelike> would return the list 1N/A"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1N/Aincluding fore and aft delimiters), 1N/A" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1N/A"" (i.e. the prefix substring -- trivial in this case), 1N/A"<<" (i.e. the "name" of the quotelike operator) 1N/A"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1N/A"This is the message.\n" (i.e. the text of the here document), 1N/A"EOMSG" (i.e. the right delimiter of the here document), 1N/A"" (a here document has no second left delimiter, second text, second right 1N/Adelimiter, or trailing modifiers). 1N/AHowever, the matching position of the input variable would be set to 1N/A"exit;" (i.e. I<after> the closing delimiter of the here document), 1N/Awhich would cause the earlier " || die;\nexit;" to be skipped in any 1N/Asequence of code fragment extractions. 1N/ATo avoid this problem, when it encounters a here document whilst 1N/Aextracting from a modifiable string, C<extract_quotelike> silently 1N/Arearranges the string to an equivalent piece of Perl: 1N/A This is the message. 1N/Ain which the here document I<is> contiguous. It still leaves the 1N/Amatching position after the here document, but now the rest of the line 1N/Aon which the here document starts is not skipped. 1N/ATo prevent <extract_quotelike> from mucking about with the input in this way 1N/A(this is the only case where a list-context C<extract_quotelike> does so), 1N/Ayou can pass the input variable as an interpolated literal: 1N/A $quotelike = extract_quotelike("$var"); 1N/A=head2 C<extract_codeblock> 1N/AC<extract_codeblock> attempts to recognize and extract a balanced 1N/Abracket delimited substring that may contain unbalanced brackets 1N/Ainside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1N/Ais like a combination of C<"extract_bracketed"> and 1N/AC<"extract_quotelike">. 1N/AC<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1N/Aa text to process, a set of delimiter brackets to look for, and a prefix to 1N/Amatch first. It also takes an optional fourth parameter, which allows the 1N/Aoutermost delimiter brackets to be specified separately (see below). 1N/AOmitting the first argument (input text) means process C<$_> instead. 1N/AOmitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1N/AOmitting the third argument (prefix argument) implies optional whitespace at the start. 1N/AOmitting the fourth argument (outermost delimiter brackets) indicates that the 1N/Avalue of the second argument is to be used for the outermost delimiters. 1N/AOnce the prefix an dthe outermost opening delimiter bracket have been 1N/Arecognized, code blocks are extracted by stepping through the input text and 1N/Atrying the following alternatives in sequence: 1N/ATry and match a closing delimiter bracket. If the bracket was the same 1N/Aspecies as the last opening bracket, return the substring to that 1N/Apoint. If the bracket was mismatched, return an error. 1N/ATry to match a quote or quotelike operator. If found, call 1N/AC<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1N/Athe error it returned. Otherwise go back to step 1. 1N/ATry to match an opening delimiter bracket. If found, call 1N/AC<extract_codeblock> recursively to eat the embedded block. If the 1N/Arecursive call fails, return an error. Otherwise, go back to step 1. 1N/AUnconditionally match a bareword or any other single character, and 1N/Athen go back to step 1. 1N/A # Find a while loop in the text 1N/A if ($text =~ s/.*?while\s*\{/{/) 1N/A $loop = "while " . extract_codeblock($text); 1N/A # Remove the first round-bracketed list (which may include 1N/A # round- or curly-bracketed code blocks or quotelike operators) 1N/A extract_codeblock $text, "(){}", '[^(]*'; 1N/AThe ability to specify a different outermost delimiter bracket is useful 1N/Ain some circumstances. For example, in the Parse::RecDescent module, 1N/Aparser actions which are to be performed only on a successful parse 1N/Aare specified using a C<E<lt>defer:...E<gt>> directive. For example: 1N/A sentence: subject verb object 1N/A <defer: {$::theVerb = $item{verb}} > 1N/AParse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1N/Awithin the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1N/AA deferred action like this: 1N/A <defer: {if ($count>10) {$count--}} > 1N/Awill be incorrectly parsed as: 1N/A <defer: {if ($count> 1N/Abecause the "less than" operator is interpreted as a closing delimiter. 1N/ABut, by extracting the directive using 1N/AS<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1N/Athe '>' character is only treated as a delimited at the outermost 1N/Alevel of the code block, so the directive is parsed correctly. 1N/A=head2 C<extract_multiple> 1N/AThe C<extract_multiple> subroutine takes a string to be processed and a 1N/Alist of extractors (subroutines or regular expressions) to apply to that string. 1N/AIn an array context C<extract_multiple> returns an array of substrings 1N/Aof the original string, as extracted by the specified extractors. 1N/AIn a scalar context, C<extract_multiple> returns the first 1N/Asubstring successfully extracted from the original string. In both 1N/Ascalar and void contexts the original string has the first successfully 1N/Aextracted substring removed from it. In all contexts 1N/AC<extract_multiple> starts at the current C<pos> of the string, and 1N/Asets that C<pos> appropriately after it matches. 1N/AHence, the aim of of a call to C<extract_multiple> in a list context 1N/Ais to split the processed string into as many non-overlapping fields as 1N/Apossible, by repeatedly applying each of the specified extractors 1N/Ato the remainder of the string. Thus C<extract_multiple> is 1N/Aa generalized form of Perl's C<split> subroutine. 1N/AThe subroutine takes up to four optional arguments: 1N/AA string to be processed (C<$_> if the string is omitted or C<undef>) 1N/AA reference to a list of subroutine references and/or qr// objects and/or 1N/Aliteral strings and/or hash references, specifying the extractors 1N/Ato be used to split the string. If this argument is omitted (or 1N/A sub { extract_variable($_[0], '') }, 1N/A sub { extract_quotelike($_[0],'') }, 1N/A sub { extract_codeblock($_[0],'{}','') }, 1N/AAn number specifying the maximum number of fields to return. If this 1N/Aargument is omitted (or C<undef>), split continues as long as possible. 1N/AIf the third argument is I<N>, then extraction continues until I<N> fields 1N/Ahave been successfully extracted, or until the string has been completely 1N/ANote that in scalar and void contexts the value of this argument is 1N/Aautomatically reset to 1 (under C<-w>, a warning is issued if the argument 1N/AA value indicating whether unmatched substrings (see below) within the 1N/Atext should be skipped or returned as fields. If the value is true, 1N/Asuch substrings are skipped. Otherwise, they are returned. 1N/AThe extraction process works by applying each extractor in 1N/Asequence to the text string. 1N/AIf the extractor is a subroutine it is called in a list context and is 1N/Aexpected to return a list of a single element, namely the extracted 1N/Atext. It may optionally also return two further arguments: a string 1N/Arepresenting the text left after extraction (like $' for a pattern 1N/Amatch), and a string representing any prefix skipped before the 1N/Aextraction (like $` in a pattern match). Note that this is designed 1N/Ato facilitate the use of other Text::Balanced subroutines with 1N/AC<extract_multiple>. Note too that the value returned by an extractor 1N/Asubroutine need not bear any relationship to the corresponding substring 1N/Aof the original text (see examples below). 1N/AIf the extractor is a precompiled regular expression or a string, 1N/Ait is matched against the text in a scalar context with a leading 1N/A'\G' and the gc modifiers enabled. The extracted value is either 1N/A$1 if that variable is defined after the match, or else the 1N/Acomplete match (i.e. $&). 1N/AIf the extractor is a hash reference, it must contain exactly one element. 1N/AThe value of that element is one of the 1N/Aabove extractor types (subroutine reference, regular expression, or string). 1N/AThe key of that element is the name of a class into which the successful 1N/Areturn value of the extractor will be blessed. 1N/AIf an extractor returns a defined value, that value is immediately 1N/Atreated as the next extracted field and pushed onto the list of fields. 1N/AIf the extractor was specified in a hash reference, the field is also 1N/Ablessed into the appropriate class, 1N/AIf the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 1N/Aassumed to have failed to extract. 1N/AIf none of the extractor subroutines succeeds, then one 1N/Acharacter is extracted from the start of the text and the extraction 1N/Asubroutines reapplied. Characters which are thus removed are accumulated and 1N/Aeventually become the next field (unless the fourth argument is true, in which 1N/Acase they are disgarded). 1N/AFor example, the following extracts substrings that are valid Perl variables: 1N/A @fields = extract_multiple($text, 1N/A [ sub { extract_variable($_[0]) } ], 1N/AThis example separates a text into fields which are quote delimited, 1N/Acurly bracketed, and anything else. The delimited and bracketed 1N/Aparts are also blessed to identify them (the "anything else" is unblessed): 1N/A @fields = extract_multiple($text, 1N/A { Delim => sub { extract_delimited($_[0],q{'"}) } }, 1N/A { Brack => sub { extract_bracketed($_[0],'{}') } }, 1N/AThis call extracts the next single substring that is a valid Perl quotelike 1N/Aoperator (and removes it from $text): 1N/A $quotelike = extract_multiple($text, 1N/A sub { extract_quotelike($_[0]) }, 1N/AFinally, here is yet another way to do comma-separated value parsing: 1N/A @fields = extract_multiple($csv_text, 1N/A sub { extract_delimited($_[0],q{'"}) }, 1N/AThe list in the second argument means: 1N/AI<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 1N/AThe undef third argument means: 1N/AI<"...as many times as possible...">, 1N/Aand the true value in the fourth argument means 1N/AI<"...discarding anything else that appears (i.e. the commas)">. 1N/AIf you wanted the commas preserved as separate fields (i.e. like split 1N/Adoes if your split pattern has capturing parentheses), you would 1N/Ajust make the last parameter undefined (or remove it). 1N/A=head2 C<gen_delimited_pat> 1N/AThe C<gen_delimited_pat> subroutine takes a single (string) argument and 1N/A > builds a Friedl-style optimized regex that matches a string delimited 1N/Aby any one of the characters in the single argument. For example: 1N/A gen_delimited_pat(q{'"}) 1N/A (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 1N/ANote that the specified delimiters are automatically quotemeta'd. 1N/AA typical use of C<gen_delimited_pat> would be to build special purpose tags 1N/Afor C<extract_tagged>. For example, to properly ignore "empty" XML elements 1N/A(which might contain quoted strings): 1N/A my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 1N/A extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 1N/AC<gen_delimited_pat> may also be called with an optional second argument, 1N/Awhich specifies the "escape" character(s) to be used for each delimiter. 1N/AFor example to match a Pascal-style string (where ' is the delimiter 1N/Aand '' is a literal ' within the string): 1N/A gen_delimited_pat(q{'},q{'}); 1N/ADifferent escape characters can be specified for different delimiters. 1N/AFor example, to specify that '/' is the escape for single quotes 1N/Aand '%' is the escape for double quotes: 1N/A gen_delimited_pat(q{'"},q{/%}); 1N/AIf more delimiters than escape chars are specified, the last escape char 1N/Ais used for the remaining delimiters. 1N/AIf no escape char is specified for a given specified delimiter, '\' is used. 1N/AC<gen_delimited_pat> was previously called 1N/AC<delimited_pat>. That name may still be used, but is now deprecated. 1N/AIn a list context, all the functions return C<(undef,$original_text)> 1N/Aon failure. In a scalar context, failure is indicated by returning C<undef> 1N/A(in this case the input text is not modified in any way). 1N/AIn addition, on failure in I<any> context, the C<$@> variable is set. 1N/AAccessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 1N/AAccessing C<$@-E<gt>{pos}> returns the offset into the original string at 1N/Awhich the error was detected (although not necessarily where it occurred!) 1N/APrinting C<$@> directly produces the error message, with the offset appended. 1N/AOn success, the C<$@> variable is guaranteed to be C<undef>. 1N/AThe available diagnostics are: 1N/A=item C<Did not find a suitable bracket: "%s"> 1N/AThe delimiter provided to C<extract_bracketed> was not one of 1N/AC<'()[]E<lt>E<gt>{}'>. 1N/A=item C<Did not find prefix: /%s/> 1N/AA non-optional prefix was specified but wasn't found at the start of the text. 1N/A=item C<Did not find opening bracket after prefix: "%s"> 1N/AC<extract_bracketed> or C<extract_codeblock> was expecting a 1N/Aparticular kind of bracket at the start of the text, and didn't find it. 1N/A=item C<No quotelike operator found after prefix: "%s"> 1N/AC<extract_quotelike> didn't find one of the quotelike operators C<q>, 1N/AC<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 1N/A=item C<Unmatched closing bracket: "%c"> 1N/AC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 1N/Aa closing bracket where none was expected. 1N/A=item C<Unmatched opening bracket(s): "%s"> 1N/AC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 1N/Aout of characters in the text before closing one or more levels of nested 1N/A=item C<Unmatched embedded quote (%s)> 1N/AC<extract_bracketed> attempted to match an embedded quoted substring, but 1N/Afailed to find a closing quote to match it. 1N/A=item C<Did not find closing delimiter to match '%s'> 1N/AC<extract_quotelike> was unable to find a closing delimiter to match the 1N/Aone that opened the quote-like operation. 1N/A=item C<Mismatched closing bracket: expected "%c" but found "%s"> 1N/AC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 1N/Aa valid bracket delimiter, but it was the wrong species. This usually 1N/Aindicates a nesting error, but may indicate incorrect quoting or escaping. 1N/A=item C<No block delimiter found after quotelike "%s"> 1N/AC<extract_quotelike> or C<extract_codeblock> found one of the 1N/Aquotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 1N/Awithout a suitable block after it. 1N/A=item C<Did not find leading dereferencer> 1N/AC<extract_variable> was expecting one of '$', '@', or '%' at the start of 1N/Aa variable, but didn't find any of them. 1N/A=item C<Bad identifier after dereferencer> 1N/AC<extract_variable> found a '$', '@', or '%' indicating a variable, but that 1N/Acharacter was not followed by a legal Perl identifier. 1N/A=item C<Did not find expected opening bracket at %s> 1N/AC<extract_codeblock> failed to find any of the outermost opening brackets 1N/A=item C<Improperly nested codeblock at %s> 1N/AA nested code block was found that started with a delimiter that was specified 1N/Aas being only to be used as an outermost bracket. 1N/A=item C<Missing second block for quotelike "%s"> 1N/AC<extract_codeblock> or C<extract_quotelike> found one of the 1N/Aquotelike operators C<s>, C<tr> or C<y> followed by only one block. 1N/A=item C<No match found for opening bracket> 1N/AC<extract_codeblock> failed to find a closing bracket to match the outermost 1N/A=item C<Did not find opening tag: /%s/> 1N/AC<extract_tagged> did not find a suitable opening tag (after any specified 1N/A=item C<Unable to construct closing tag to match: /%s/> 1N/AC<extract_tagged> matched the specified opening tag and tried to 1N/Amodify the matched text to produce a matching closing tag (because 1N/Anone was specified). It failed to generate the closing tag, almost 1N/Acertainly because the opening tag did not start with a 1N/Abracket of some kind. 1N/A=item C<Found invalid nested tag: %s> 1N/AC<extract_tagged> found a nested tag that appeared in the "reject" list 1N/A(and the failure mode was not "MAX" or "PARA"). 1N/A=item C<Found unbalanced nested tag: %s> 1N/AC<extract_tagged> found a nested opening tag that was not matched by a 1N/Acorresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 1N/A=item C<Did not find closing tag> 1N/AC<extract_tagged> reached the end of the text without finding a closing tag 1N/Ato match the original opening tag (and the failure mode was not 1N/ADamian Conway (damian@conway.org) 1N/A=head1 BUGS AND IRRITATIONS 1N/AThere are undoubtedly serious bugs lurking somewhere in this code, if 1N/Aonly because parts of it give the impression of understanding a great deal 1N/Amore about Perl than they really do. 1N/ABug reports and other feedback are most welcome. 1N/A Copyright (c) 1997-2001, Damian Conway. All Rights Reserved. 1N/A This module is free software. It may be used, redistributed 1N/A and/or modified under the same terms as Perl itself.