#!/usr/bin/perl
# Benchmark three ways of visiting the plain files of a directory, repeated
# for each of several test directories.
use strict;
use warnings;
use Benchmark qw(:all);
# Directory name => the number printed as its file count in the report.
# NOTE(review): presumably each directory was pre-populated with that many
# files; nothing in this script creates or verifies them.
my %folders = (
Temp1 => 10,
Temp2 => 100,
Temp3 => 1000,
Temp4 => 5000,
Temp5 => 10000,
# Temp6 => 20000,
);
foreach my $folder (sort keys %folders) {
print "Folder: $folder, Files: ", $folders{$folder}, "\n";
# Time 5 runs of every candidate against the current directory.
timethese(5, {
# Way 1: read entries one at a time with readdir, build the path and
# test it with -f.
'Way 1' => sub {
opendir(my $DIR, $folder) or die "Error: couldn't open dir '$folder': $!\n";
while(my $file = readdir $DIR) {
my $filename = "$folder/$file";
# The print is commented out, so the loop performs only the -f test.
if(-f $filename) {
# print STDERR "$filename\n";
}
}
closedir($DIR);
},
# Way 2: let glob expand "$folder/*" into a list of paths, then test each
# one with -f.
'Way 2' => sub {
foreach my $file (glob "$folder/*") {
my $filename = $file;
if(-f $filename) {
# print STDERR "$filename\n";
}
}
},
# Way 3: read all entries at once, keep those that pass -f, prefix them with
# the directory name, then walk the resulting list.
'Way 3' => sub {
opendir(my $DIR, $folder) or die "Error: couldn't open dir '$folder': $!\n";
# The path string is built once per entry inside grep and again inside map
# for every entry that was kept.
my @files = map { "$folder/$_" }
grep { -f "$folder/$_" }
readdir($DIR);
closedir($DIR);
# Only the copy into $filename happens here; the print is commented out.
foreach my $file (@files) {
my $filename = $file;
# print STDERR "$filename\n";
}
},
});
}
The result does not make sense to me, but maybe you can tell me why:
Folder: Temp1, Files: 10
Benchmark: timing 5 iterations of Way 1, Way 2, Way 3...
Way 1: 0 wallclock secs ( 0.00 usr + 0.00 sys = 0.00 CPU)
(warning: too few iterations for a reliable count)
Way 2: 0 wallclock secs ( 0.00 usr + 0.00 sys = 0.00 CPU)
(warning: too few iterations for a reliable count)
Way 3: 0 wallclock secs ( 0.00 usr + 0.00 sys = 0.00 CPU)
(warning: too few iterations for a reliable count)
Folder: Temp2, Files: 100
Benchmark: timing 5 iterations of Way 1, Way 2, Way 3...
Way 1: 0 wallclock secs ( 0.00 usr + 0.05 sys = 0.05 CPU) @ 108.70/s (n=5)
(warning: too few iterations for a reliable count)
Way 2: 0 wallclock secs ( 0.00 usr + 0.08 sys = 0.08 CPU) @ 63.29/s (n=5)
(warning: too few iterations for a reliable count)
Way 3: 0 wallclock secs ( 0.03 usr + 0.00 sys = 0.03 CPU) @ 161.29/s (n=5)
(warning: too few iterations for a reliable count)
Folder: Temp3, Files: 1000
Benchmark: timing 5 iterations of Way 1, Way 2, Way 3...
Way 1: 1 wallclock secs ( 0.19 usr + 0.63 sys = 0.81 CPU) @ 6.15/s (n=5)
Way 2: 1 wallclock secs ( 0.41 usr + 1.11 sys = 1.52 CPU) @ 3.30/s (n=5)
Way 3: 1 wallclock secs ( 0.16 usr + 0.67 sys = 0.83 CPU) @ 6.04/s (n=5)
Folder: Temp4, Files: 5000
Benchmark: timing 5 iterations of Way 1, Way 2, Way 3...
Way 1: 9 wallclock secs ( 1.20 usr + 7.86 sys = 9.06 CPU) @ 0.55/s (n=5)
Way 2: 19 wallclock secs ( 1.73 usr + 16.59 sys = 18.33 CPU) @ 0.27/s (n=5)
Way 3: 10 wallclock secs ( 0.92 usr + 7.91 sys = 8.83 CPU) @ 0.57/s (n=5)
Folder: Temp5, Files: 10000
Benchmark: timing 5 iterations of Way 1, Way 2, Way 3...
Way 1: 43 wallclock secs ( 3.17 usr + 39.39 sys = 42.56 CPU) @ 0.12/s (n=5)
Way 2: 85 wallclock secs ( 6.53 usr + 76.41 sys = 82.94 CPU) @ 0.06/s (n=5)
Way 3: 42 wallclock secs ( 3.19 usr + 38.06 sys = 41.25 CPU) @ 0.12/s (n=5)
TMTOWTDI! More than just three ways, that is. But basically they all boil down to doing an opendir followed by a (sequence of) readdir(s). So if you're really that concerned about speed and -alleged- efficiency, then you should go the way of doing them explicitly.
If you're actually doing globbing, though, chances are you'd better use Perl's glob, a.k.a. File::Glob, rather than reinventing the wheel and risking errors in doing so. That said...
the result does not make sense to me, but maybe you can tell me why
Well, they make sense enough to me. In the first place, the times for "Way 1" and "Way 3" are comparable, which is fine and supports my claims above. Other than that, "Way 3" seems to be slightly faster than "Way 1", but that may well be within the experimental error. As far as the former is concerned, there's still room for improvement, as you interpolate "$folder/$_" twice, once in the map and once in the grep: I'd just switch them:
# Revised 'Way 3': build each path once with map, then keep only the paths
# that pass the -f file test.
'Way 3' => sub {
opendir my $DIR, $folder
or die "Error: couldn't open dir '$folder': $!\n";
# map runs before grep so that the bare -f (which tests $_) is applied to the
# already-prefixed path instead of rebuilding the string a second time.
my @files = grep -f, map "$folder/$_", readdir $DIR;
# The loop body is entirely commented out, so iterating does no work.
for my $file (@files) {
# my $filename = $file; # no more need for this!
# print STDERR "$filename\n";
}
# Note: this version has no explicit closedir call.
}
You could also avoid assigning to @files altogether.
And as far as glob goes, there are too many unknown parameters involved, since it must do regexp matching. I don't know if it also does special optimizations for particular patterns, but chances are that it may depend on the actual filenames: as I said, far too many parameters!
In any case, your benchmarks are bound to be imprecise. I don't know about Windows, but certainly under Linux, and I believe under most Unices, the kernel keeps data in memory to avoid doing relatively slow disk reads and writes, which is why sync exists. One more reason, IMHO, not to bother at all!
I think your results are so surprising because your benchmark code adds a lot of unnecessary cruft to the different approaches. For example, in method 1 you're instantiating my $file at every pass through the loop.
FWIW, here's my benchmark of approach 1 (iterative) vs. approach 3 (slurpy):
use strict;
use warnings;
use Benchmark qw/:all/;

# Directory scanned by both candidates.
my $folder = "C:\\Perl";

# iterative: walk the directory one entry at a time, and stat every entry
# that is a plain file.
#
# The handle is opened on $folder (not '.') so the names returned by readdir
# really belong to the directory they are later prefixed with.
sub iterative {
    opendir my $dh, $folder
        or die "Error: couldn't open dir '$folder': $!\n";
    # Declared once outside the loop, as in the original comparison.
    my $file;
    while (defined($file = readdir $dh)) {
        $file = "$folder\\$file";
        next unless -f $file;
        my @s = stat $file;
    }
    closedir $dh;    # I like being explicit;
    return;
}

# slurpy: read every entry up front, keep the plain files, then stat them.
#
# The -f test is applied to the full path, exactly as in iterative(), so both
# candidates perform the same file tests on the same paths.
sub slurpy {
    opendir my $dh, $folder
        or die "Error: couldn't open dir '$folder': $!\n";
    my @files = grep { -f $_ } map { "$folder\\$_" } readdir $dh;
    foreach my $path (@files) {
        my @s = stat $path;
    }
    closedir $dh;
    return;
}

# Run each candidate 200 times and print the comparison table.
cmpthese ( 200, {
    'iterative' => \&iterative,
    'slurpy'    => \&slurpy,
});
__END__
Rate slurpy iterative
slurpy 74.4/s -- -44%
iterative 133/s 79% --
As you can see, the iterative approach is *much* faster in my benchmark. (The other variable is that I'm running WinXP).
C:\>perl -v This is perl, v5.8.6 built for MSWin32-x86-multi-thread (with 3 registered patches, see perl -V for more detail)
As I recall, the default glob is unnecessarily slow on Win32 because it does stat a lot more than it needs to. And stat is unnecessarily slow on Win32 because it does some expensive things in order to fill in bits that are rarely used. If doing glob to a remote network share (at least), the slow-down can be quite dramatic (orders of magnitude).
I had an idea for using something like an environment variable to allow skipping the expensive parts of stat but I don't have it "swapped in" at the moment. But the overuse of stat by the default glob should just be fixed.
- tye
perlmonks.org content © perlmonks.org and blazar, esskar, radiantmatrix, tye
prlmnks.org © 2006 edmund von der burg (eccles & toad)
v 0.03