File Coverage

File:blib/lib/Regexp/Wildcards.pm
Coverage:100.0%

linestmtbrancondsubpodtimecode
1package Regexp::Wildcards;
2
3
10
10
10
101
44
150
use strict;
4
10
10
10
99
42
100
use warnings;
5
6
10
10
10
119
42
192
use Carp qw/croak/;
7
10
10
10
210
47
169
use Text::Balanced qw/extract_bracketed/;
8
9 - 17
=head1 NAME

Regexp::Wildcards - Converts wildcard expressions to Perl regular expressions.

=head1 VERSION

Version 1.03

=cut
18
19
10
10
10
149
42
94
use vars qw/$VERSION/;
20BEGIN {
21
10
77
 $VERSION = '1.03';
22}
23
24 - 60
=head1 SYNOPSIS

    use Regexp::Wildcards;

    my $rw = Regexp::Wildcards->new(type => 'unix');

    my $re;
    $re = $rw->convert('a{b?,c}*');          # Do it Unix shell style.
    $re = $rw->convert('a?,b*',   'win32');  # Do it Windows shell style.
    $re = $rw->convert('*{x,y}?', 'jokers'); # Process the jokers and escape the rest.
    $re = $rw->convert('%a_c%',   'sql');    # Turn SQL wildcards into regexps.

    $rw = Regexp::Wildcards->new(
     do      => [ qw/jokers brackets/ ], # Do jokers and brackets.
     capture => [ qw/any greedy/ ],      # Capture *'s greedily.
    );

    $rw->do(add => 'groups');            # Don't escape groups.
    $rw->capture(rem => [ qw/greedy/ ]); # Actually we want non-greedy matches.
    $re = $rw->convert('*a{,(b)?}?c*');  # '(.*?)a(?:|(b).).c(.*?)'
    $rw->capture();                      # No more captures.

=head1 DESCRIPTION

In many situations, users may want to specify patterns to match but don't need the full power of regexps.
Wildcards make one of those sets of simplified rules.
This module converts wildcard expressions to Perl regular expressions, so that you can use them for matching.

It handles the C<*> and C<?> jokers, as well as Unix bracketed alternatives C<{,}>, but also C<%> and C<_> SQL wildcards.
If required, it can also keep original C<(...)> groups or C<^> and C<$> anchors.
Backspace (C<\>) is used as an escape character.

Typesets that mimic the behaviour of Windows and Unix shells are also provided.

=head1 METHODS

=cut
61
62sub _check_self {
63
468
8995
 croak 'First argument isn\'t a valid ' . __PACKAGE__ . ' object'
64  unless ref $_[0] and $_[0]->isa(__PACKAGE__);
65}
66
67my %types = (
68 jokers => [ qw/jokers/ ],
69 sql => [ qw/sql/ ],
70 commas => [ qw/commas/ ],
71 brackets => [ qw/brackets/ ],
72 unix => [ qw/jokers brackets/ ],
73 win32 => [ qw/jokers commas/ ],
74);
75$types{$_} = $types{win32} for qw/dos os2 MSWin32 cygwin/;
76$types{$_} = $types{unix} for qw/linux
77                                  darwin machten next
78                                  aix irix hpux dgux dynixptx
79                                  bsdos freebsd openbsd
80                                  svr4 solaris sunos dec_osf
81                                  sco_sv unicos unicosmk/;
82
83my %escapes = (
84 jokers => '?*',
85 sql => '_%',
86 commas => ',',
87 brackets => '{},',
88 groups => '()',
89 anchors => '^$',
90);
91
92my %captures = (
93 single => sub { $_[1] ? '(.)' : '.' },
94 any => sub { $_[1] ? ($_[0]->{greedy} ? '(.*)'
95                                            : '(.*?)')
96                         : '.*' },
97 brackets => sub { $_[1] ? '(' : '(?:'; },
98 greedy => undef
99);
100
101sub _validate {
102
66
259
 my $self = shift;
103
66
296
 _check_self $self;
104
66
281
 my $valid = shift;
105
66
238
 my $old = shift;
106
66
336
 $old = { } unless defined $old;
107
108
66
202
 my %opts;
109
66
409
 if (@_ <= 1) {
110
47
325
  $opts{set} = defined $_[0] ? $_[0] : { };
111 } elsif (@_ % 2) {
112
2
17
  croak 'Arguments must be passed as an unique scalar or as key => value pairs';
113 } else {
114
17
138
  %opts = @_;
115 }
116
117
64
200
 my %checked;
118
64
292
 for (qw/set add rem/) {
119
184
620
  my $opt = $opts{$_};
120
184
981
  next unless defined $opt;
121  my $cb = {
122
15
175
   '' => sub { +{ ($_[0] => 1) x (exists $valid->{$_[0]}) } },
123
25
47
25
71
428
132
   'ARRAY' => sub { +{ map { ($_ => 1) x (exists $valid->{$_}) } @{$_[0]} } },
124
6
20
58
194
   'HASH' => sub { +{ map { ($_ => $_[0]->{$_}) x (exists $valid->{$_}) }
125
20
63
                        keys %{$_[0]} } }
126
64
1350
  }->{ ref $opt };
127
64
833
  croak 'Wrong option set' unless $cb;
128
60
252
  $checked{$_} = $cb->($opt);
129 }
130
131
60
316
 my $config = (exists $checked{set}) ? $checked{set} : $old;
132
60
60
179
731
 $config->{$_} = $checked{add}->{$_} for grep $checked{add}->{$_},
133
60
162
                                          keys %{$checked{add} || {}};
134
60
60
213
556
 delete $config->{$_} for grep $checked{rem}->{$_},
135
60
209
                                          keys %{$checked{rem} || {}};
136
137
60
473
 $config;
138}
139
140sub _do {
141
32
119
 my $self = shift;
142
32
99
 my $config;
143
32
269
 $config->{do} = $self->_validate(\%escapes, $self->{do}, @_);
144
29
140
 $config->{escape} = '';
145
29
29
29
85
89
275
 $config->{escape} .= $escapes{$_} for keys %{$config->{do}};
146
29
155
 $config->{escape} = quotemeta $config->{escape};
147
29
109
 $config;
148}
149
150sub do {
151
18
1
192
 my $self = shift;
152
18
99
 _check_self $self;
153
16
98
 my $config = $self->_do(@_);
154
13
13
40
117
 $self->{$_} = $config->{$_} for keys %$config;
155
13
67
 $self;
156}
157
158sub _capture {
159
34
132
 my $self = shift;
160
34
124
 my $config;
161
34
282
 $config->{capture} = $self->_validate(\%captures, $self->{capture}, @_);
162
31
214
 $config->{greedy} = delete $config->{capture}->{greedy};
163
31
225
 for (keys %captures) {
164
124
925
  $config->{'c_' . $_} = $captures{$_}->($config, $config->{capture}->{$_})
165                                               if $captures{$_}; # Skip 'greedy'
166 }
167
31
166
 $config;
168}
169
170sub capture {
171
36
1
220
 my $self = shift;
172
36
193
 _check_self $self;
173
34
192
 my $config = $self->_capture(@_);
174
31
31
95
441
 $self->{$_} = $config->{$_} for keys %$config;
175
31
207
 $self;
176}
177
178sub _type {
179
18
133
 my ($self, $type) = @_;
180
18
120
 $type = 'unix' unless defined $type;
181
18
117
 croak 'Wrong type' unless exists $types{$type};
182
16
118
 my $config = $self->_do($types{$type});
183
16
63
 $config->{type} = $type;
184
16
57
 $config;
185}
186
187sub type {
188
16
1
91
 my $self = shift;
189
16
92
 _check_self $self;
190
14
94
 my $config = $self->_type(@_);
191
13
13
34
234
 $self->{$_} = $config->{$_} for keys %$config;
192
13
77
 $self;
193}
194
195sub new {
196
14
1
199
 my $class = shift;
197
14
201
 $class = ref($class) || $class || __PACKAGE__;
198
14
104
 croak 'Optional arguments must be passed as key => value pairs' if @_ % 2;
199
13
76
 my %args = @_;
200
13
59
 my $self = { };
201
13
177
 bless $self, $class;
202
13
81
 if (defined $args{do}) {
203
4
27
  $self->do($args{do});
204 } else {
205
9
72
  $self->type($args{type});
206 }
207
13
105
 $self->capture($args{capture});
208}
209
210 - 429
=head2 C<< new [ do => $what E<verbar> type => $type ], capture => $captures >>

Constructs a new L<Regexp::Wildcard> object.

C<do> lists all features that should be enabled when converting wildcards to regexps.
Refer to L</do> for details on what can be passed in C<$what>.

The C<type> specifies a predefined set of C<do> features to use.
See L</type> for details on which types are valid.
The C<do> option overrides C<type>.

C<capture> lists which atoms should be capturing.
Refer to L</capture> for more details.

=head2 C<< do [ $what E<verbar> set => $c1, add => $c2, rem => $c3 ] >>

Specifies the list of metacharacters to convert or to prevent for escaping.
They fit into six classes :

=over 4

=item *

C<'jokers'>

Converts C<?> to C<.> and C<*> to C<.*>.

    'a**\\*b??\\?c' ==> 'a.*\\*b..\\?c'

=item *

C<'sql'>

Converts C<_> to C<.> and C<%> to C<.*>.

    'a%%\\%b__\\_c' ==> 'a.*\\%b..\\_c'

=item *

C<'commas'>

Converts all C<,> to C<|> and puts the complete resulting regular expression inside C<(?: ... )>.

    'a,b{c,d},e' ==> '(?:a|b\\{c|d\\}|e)'

=item *

C<'brackets'>

Converts all matching C<{ ... ,  ... }> brackets to C<(?: ... | ... )> alternations.
If some brackets are unbalanced, it tries to substitute as many of them as possible, and then escape the remaining unmatched C<{> and C<}>.
Commas outside of any bracket-delimited block are also escaped.

    'a,b{c,d},e'    ==> 'a\\,b(?:c|d)\\,e'
    '{a\\{b,c}d,e}' ==> '(?:a\\{b|c)d\\,e\\}'
    '{a{b,c\\}d,e}' ==> '\\{a\\{b\\,c\\}d\\,e\\}'

=item *

C<'groups'>

Keeps the parenthesis C<( ... )> of the original string without escaping them.
Currently, no check is done to ensure that the parenthesis are matching.

    'a(b(c))d\\(\\)' ==> (no change)

=item *

C<'anchors'>

Prevents the I<beginning-of-line> C<^> and I<end-of-line> C<$> anchors to be escaped.
Since C<[...]> character class are currently escaped, a C<^> will always be interpreted as I<beginning-of-line>.

    'a^b$c' ==> (no change)

=back

Each C<$c> can be any of :

=over 4

=item *

A hash reference, with wanted metacharacter group names (described above) as keys and booleans as values ;

=item *

An array reference containing the list of wanted metacharacter classes ;

=item *

A plain scalar, when only one group is required.

=back

When C<set> is present, the classes given as its value replace the current object options.
Then the C<add> classes are added, and the C<rem> classes removed.

Passing a sole scalar C<$what> is equivalent as passing C<< set => $what >>.
No argument means C<< set => [ ] >>.

    $rw->do(set => 'jokers');           # Only translate jokers.
    $rw->do('jokers');                  # Same.
    $rw->do(add => [ qw/sql commas/ ]); # Translate also SQL and commas.
    $rw->do(rem => 'jokers');           # Specifying both 'sql' and 'jokers' is useless.
    $rw->do();                          # Translate nothing.

The C<do> method returns the L<Regexp::Wildcards> object.

=head2 C<type $type>

Notifies to convert the metacharacters that corresponds to the predefined type C<$type>.
C<$type> can be any of :

=over 4

=item *

C<'jokers'>, C<'sql'>, C<'commas'>, C<'brackets'>

Singleton types that enable the corresponding C<do> classes.

=item *

C<'unix'>

Covers typical Unix shell globbing features (effectively C<'jokers'> and C<'brackets'>).

=item *

C<$^O> values for common Unix systems

Wrap to C<'unix'> (see L<perlport> for the list).

=item *

C<undef>

Defaults to C<'unix'>.

=item *

C<'win32'>

Covers typical Windows shell globbing features (effectively C<'jokers'> and C<'commas'>).

=item *

C<'dos'>, C<'os2'>, C<'MSWin32'>, C<'cygwin'>

Wrap to C<'win32'>.

=back

In particular, you can usually pass C<$^O> as the C<$type> and get the corresponding shell behaviour.

    $rw->type('win32'); # Set type to win32.
    $rw->type($^O);     # Set type to unix on Unices and win32 on Windows
    $rw->type();        # Set type to unix.

The C<type> method returns the L<Regexp::Wildcards> object.

=head2 C<< capture [ $captures E<verbar> set => $c1, add => $c2, rem => $c3 ] >>

Specifies the list of atoms to capture.
This method works like L</do>, except that the classes are different :

=over 4

=item *

C<'single'>

Captures all unescaped I<"exactly one"> metacharacters, i.e. C<?> for wildcards or C<_> for SQL.

    'a???b\\??' ==> 'a(.)(.)(.)b\\?(.)'
    'a___b\\__' ==> 'a(.)(.)(.)b\\_(.)'

=item *

C<'any'>

Captures all unescaped I<"any"> metacharacters, i.e. C<*> for wildcards or C<%> for SQL.

    'a***b\\**' ==> 'a(.*)b\\*(.*)'
    'a%%%b\\%%' ==> 'a(.*)b\\%(.*)'

=item *

C<'greedy'>

When used in conjunction with C<'any'>, it makes the C<'any'> captures greedy (by default they are not).

    'a***b\\**' ==> 'a(.*?)b\\*(.*?)'
    'a%%%b\\%%' ==> 'a(.*?)b\\%(.*?)'

=item *

C<'brackets'>

Capture matching C<{ ... , ... }> alternations.

    'a{b\\},\\{c}' ==> 'a(b\\}|\\{c)'

=back

    $rw->capture(set => 'single');           # Only capture "exactly one" metacharacters.
    $rw->capture('single');                  # Same.
    $rw->capture(add => [ qw/any greedy/ ]); # Also greedily capture "any" metacharacters.
    $rw->capture(rem => 'greedy');           # No more greed please.
    $rw->capture();                          # Capture nothing.

The C<capture> method returns the L<Regexp::Wildcards> object.

=head2 C<convert $wc [ , $type ]>

Converts the wildcard expression C<$wc> into a regular expression according to the options stored into the L<Regexp::Wildcards> object, or to C<$type> if it's supplied.
It successively escapes all unprotected regexp special characters that doesn't hold any meaning for wildcards, then replace C<'jokers'>, C<'sql'> and C<'commas'> or C<'brackets'> (depending on the L</do> or L</type> options), all of this by applying the C<'capture'> rules specified in the constructor or by L</capture>.

=cut
430
431sub convert {
432
332
1
2746
 my ($self, $wc, $type) = @_;
433
332
1965
 _check_self $self;
434
330
1660
 my $config = (defined $type) ? $self->_type($type) : $self;
435
329
1612
 return unless defined $wc;
436
437
328
1605
 my $e = $config->{escape};
438 # Escape :
439 # - an even number of \ that doesn't protect a regexp/wildcard metachar
440 # - an odd number of \ that doesn't protect a wildcard metachar
441
328
8224
 $wc =~ s/
442  (?<!\\)(
443   (?:\\\\)*
444   (?:
445     [^\w\s\\$e]
446    |
447     \\
448     (?: [^\W$e] | \s | $ )
449   )
450  )
451 /\\$1/gx;
452
453
328
1530
 my $do = $config->{do};
454
328
2492
 $wc = $self->_jokers($wc) if $do->{jokers};
455
328
2451
 $wc = $self->_sql($wc) if $do->{sql};
456
328
2506
 if ($do->{brackets}) {
457
40
225
  $wc = $self->_bracketed($wc);
458 } elsif ($do->{commas} and $wc =~ /(?<!\\)(?:\\\\)*,/) {
459
7
57
  $wc = $self->{'c_brackets'} . $self->_commas($wc) . ')';
460 }
461
462
328
3968
 return $wc;
463}
464
465 - 507
=head1 EXPORT

An object module shouldn't export any function, and so does this one.

=head1 DEPENDENCIES

L<Carp> (core module since perl 5), L<Text::Balanced> (since 5.7.3).

=head1 CAVEATS

This module does not implement the strange behaviours of Windows shell that result from the special handling of the three last characters (for the file extension).
For example, Windows XP shell matches C<*a> like C<.*a>, C<*a?> like C<.*a.?>, C<*a??> like C<.*a.{0,2}> and so on.

=head1 SEE ALSO

L<Text::Glob>.

=head1 AUTHOR

Vincent Pit, C<< <perl at profvince.com> >>, L<http://www.profvince.com>.

You can contact me by mail or on C<irc.perl.org> (vincent).

=head1 BUGS

Please report any bugs or feature requests to C<bug-regexp-wildcards at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Regexp-Wildcards>. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Regexp::Wildcards

Tests code coverage report is available at L<http://www.profvince.com/perl/cover/Regexp-Wildcards>.

=head1 COPYRIGHT & LICENSE

Copyright 2007-2009 Vincent Pit, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut
508
509
134
1386
sub _extract ($) { extract_bracketed $_[0], '{', qr/.*?(?<!\\)(?:\\\\)*(?={)/ }
510
511sub _jokers {
512
201
735
 my $self = shift;
513
201
1105
 local $_ = $_[0];
514 # substitute ? preceded by an even number of \
515
201
791
 my $s = $self->{c_single};
516
201
1845
 s/(?<!\\)((?:\\\\)*)\?/$1$s/g;
517 # substitute * preceded by an even number of \
518
201
728
 $s = $self->{c_any};
519
201
2372
 s/(?<!\\)((?:\\\\)*)\*+/$1$s/g;
520
201
1315
 return $_;
521}
522
523sub _sql {
524
175
671
 my $self = shift;
525
175
811
 local $_ = $_[0];
526 # substitute _ preceded by an even number of \
527
175
764
 my $s = $self->{c_single};
528
175
2005
 s/(?<!\\)((?:\\\\)*)_/$1$s/g;
529 # substitute % preceded by an even number of \
530
175
1106
 $s = $self->{c_any};
531
175
1343
 s/(?<!\\)((?:\\\\)*)%+/$1$s/g;
532
175
1037
 return $_;
533}
534
535sub _commas {
536
67
291
 local $_ = $_[1];
537 # substitute , preceded by an even number of \
538
67
882
 s/(?<!\\)((?:\\\\)*),/$1|/g;
539
67
390
 return $_;
540}
541
542sub _brackets {
543
47
244
 my ($self, $rest) = @_;
544
47
216
 substr $rest, 0, 1, '';
545
47
147
 chop $rest;
546
47
173
 my ($re, $bracket, $prefix) = ('');
547
47
60
60
130
228
13573
 while (do { ($bracket, $rest, $prefix) = _extract $rest; $bracket }) {
548
13
82
  $re .= $self->_commas($prefix) . $self->_brackets($bracket);
549 }
550
47
262
 $re .= $self->_commas($rest);
551
47
357
 return $self->{c_brackets} . $re . ')';
552}
553
554sub _bracketed {
555
40
301
 my ($self, $rest) = @_;
556
40
201
 my ($re, $bracket, $prefix) = ('');
557
40
74
74
149
312
24756
 while (do { ($bracket, $rest, $prefix) = _extract $rest; $bracket }) {
558
34
238
  $re .= $prefix . $self->_brackets($bracket);
559 }
560
40
164
 $re .= $rest;
561
40
425
 $re =~ s/(?<!\\)((?:\\\\)*[\{\},])/\\$1/g;
562
40
240
 return $re;
563}
564
5651; # End of Regexp::Wildcards