From: Russ Allbery Date: Mon, 4 Nov 2013 21:16:54 +0000 (-0800) Subject: Add wordlist filter mode to cdbmake-wordlist X-Git-Tag: release/2.2~25 X-Git-Url: https://git.eyrie.org/?a=commitdiff_plain;h=df0c85456e4e51e0ab35560a62a87dea046e5fe8;p=kerberos%2Fkrb5-strength.git Add wordlist filter mode to cdbmake-wordlist Add a new -o (--output) option that applies any configured filtering and writes out a new wordlist file instead of creating a CDB file. Refactor the script to avoid adding too much complexity with this feature. --- diff --git a/tests/tools/cdbmake-wordlist-t b/tests/tools/cdbmake-wordlist-t index ca4b53d..08d8d91 100755 --- a/tests/tools/cdbmake-wordlist-t +++ b/tests/tools/cdbmake-wordlist-t @@ -17,7 +17,7 @@ if ! command -v cdb >/dev/null 2>&1 ; then fi # Output the test plan. -plan 18 +plan 20 # Create a temporary directory and wordlist and ensure it's writable. tmpdir=`test_tmpdir` @@ -79,6 +79,14 @@ ok_program 'Database still contains happenstance' 0 '1' \ ok_program 'Database does not contain password' 100 '' \ cdb -q "$tmpdir/wordlist.cdb" password +# Try filtering the wordlist into a new wordlist. +ok_program 'Wordlist filtering' 0 '' \ + "$cdbmake" -a -x '.*d' -l 8 -o "$tmpdir/wordlist.new" "$tmpdir/wordlist" +( echo 'bitterbane'; echo 'happenstance' ) > "$tmpdir/wordlist.expected" +ok_program 'Filtered wordlist is correct' 0 '' \ + cmp "$tmpdir/wordlist.expected" "$tmpdir/wordlist.new" +rm -f "$tmpdir/wordlist.expected" "$tmpdir/wordlist.new" + # Clean up. rm -f "$tmpdir/wordlist.cdb" rm -f "$tmpdir/wordlist" diff --git a/tools/cdbmake-wordlist b/tools/cdbmake-wordlist index 8cc86c5..74621a7 100755 --- a/tools/cdbmake-wordlist +++ b/tools/cdbmake-wordlist @@ -34,6 +34,74 @@ sub print_fh { return; } +# Filter the given input file and write it to a CDB data file, and then use +# cdb to turn that into a database. 
+# +# $in_fh - Input file handle for the source wordlist +# $input - Name of the input file, from which the CDB file name is derived +# $filter - Reference to sub that returns true to keep a word, false otherwise +# +# Returns: undef +# Throws: Text exception on output failure or pre-existing temporary file +sub write_cdb { + my ($in_fh, $input, $filter) = @_; + + # Create a temporary file to write the CDB input into. + my $tmp = $input . '.data'; + if (-f $tmp) { + die "$0: temporary output file $tmp already exists\n"; + } + open(my $tmp_fh, '>', $tmp) + or die "$0: cannot create output file $tmp: $!\n"; + + # Walk through the input word list and write each word that passes the + # filter to the output file handle as CDB data. + while (defined(my $word = <$in_fh>)) { + chomp($word); + next if !$filter->($word); + my $length = length($word); + print_fh($tmp_fh, "+$length,1:$word->1\n"); + } + + # Add a trailing newline, required by the CDB data format, and close. + print_fh($tmp_fh, "\n"); + close($tmp_fh) or die "$0: cannot write to temporary file $tmp: $!\n"; + + # Run cdb to turn the result into a CDB database. Ignore duplicate keys. + system($CDB, '-c', '-u', "$input.cdb", $tmp) == 0 + or die "$0: cdb -c failed\n"; + + # Remove the temporary file and return. + unlink($tmp) or die "$0: cannot remove temporary file $tmp: $!\n"; + return; +} + +# Filter the given input file and write the results to a new wordlist. +# +# $in_fh - Input file handle for the source wordlist +# $output - Output file name to which to write the resulting wordlist +# $filter - Reference to sub that returns true to keep a word, false otherwise +# +# Returns: undef +# Throws: Text exception on output failure +sub write_wordlist { + my ($in_fh, $output, $filter) = @_; + open(my $out_fh, '>', $output) + or die "$0: cannot create output file $output: $!\n"; + + # Walk through the input word list and write each word that passes the + # filter to the output file handle. 
+ while (defined(my $word = <$in_fh>)) { + chomp($word); + next if !$filter->($word); + print_fh($out_fh, "$word\n"); + } + + # All done. + close($out_fh) or die "$0: cannot write to output file $output: $!\n"; + return; +} + # Always flush output. STDOUT->autoflush; @@ -42,13 +110,14 @@ my $fullpath = $0; local $0 = basename($0); # Parse the argument list. -my ($ascii, @exclude, $max_length, $min_length, $manual); +my ($ascii, @exclude, $max_length, $min_length, $manual, $output); Getopt::Long::config('bundling', 'no_ignore_case'); GetOptions( 'ascii|a' => \$ascii, 'max-length|L=i' => \$max_length, 'min-length|l=i' => \$min_length, 'manual|man|m' => \$manual, + 'output|o=s' => \$output, 'exclude|x=s' => \@exclude, ); if ($manual) { @@ -60,43 +129,40 @@ if (@ARGV != 1) { } my $input = $ARGV[0]; -# The output file goes in the same directory and is named the same as the -# input but with .data appended. -my $output = $input . '.data'; -if (-f $output) { - die "$0: temporary output file $output already exists\n"; -} - -# Process the input file into the output file, converting it to cdb input -# format. -open(my $in, '<', $input) - or die "$0: cannot open input file $input: $!\n"; -open(my $out, '>', $output) - or die "$0: cannot create output file $output: $!\n"; -WORD: while (defined(my $word = <$in>)) { - chomp($word); +# Build a filter from our command-line parameters. This is an anonymous sub +# that returns true to keep a word and false otherwise. +my $filter = sub { + my ($word) = @_; my $length = length($word); - next if (defined($min_length) && $length < $min_length); - next if (defined($max_length) && $length > $max_length); + + # Check length. + return if (defined($min_length) && $length < $min_length); + return if (defined($max_length) && $length > $max_length); + + # Check character classes. 
 if ($ascii) { - next if $word =~ m{ [^[:ascii:]] }xms; - next if $word =~ m{ [[:cntrl:]] }xms; + return if $word =~ m{ [^[:ascii:]] }xms; + return if $word =~ m{ [[:cntrl:]] }xms; } + + # Check regex exclusions. for my $pattern (@exclude) { - next WORD if $word =~ m{ $pattern }xms; + return if $word =~ m{ $pattern }xms; } - print_fh($out, "+$length,1:$word->1\n"); -} -print_fh($out, "\n"); -close($in) or die "$0: cannot read all of input file $input: $!\n"; -close($out) or die "$0: cannot write to output file $output: $!\n"; -# Run cdb to turn the result into a constant database. Ignore duplicate keys. -system($CDB, '-c', '-u', "$input.cdb", $output) == 0 - or die "$0: cdb -c failed\n"; + # Word passes. Return success. + return 1; +}; -# Remove the temporary file. -unlink($output) or die "$0: cannot remove temporary file $output: $!\n"; +# Process the input file into either wordlist output or a CDB file. +open(my $in_fh, '<', $input) + or die "$0: cannot open input file $input: $!\n"; +if (defined($output)) { + write_wordlist($in_fh, $output, $filter); +} else { + write_cdb($in_fh, $input, $filter); +} +close($in_fh) or die "$0: cannot read all of input file $input: $!\n"; # All done. exit(0); @@ -104,7 +170,8 @@ __END__ =for stopwords cdbmake-wordlist cdb whitespace wordlist lookups lookup sublicense -MERCHANTABILITY NONINFRINGEMENT krb5-strength --ascii Allbery +MERCHANTABILITY NONINFRINGEMENT krb5-strength --ascii Allbery regexes +output-wordlist =head1 NAME @@ -113,7 +180,7 @@ cdbmake-wordlist - Create a cdb database from a wordlist =head1 SYNOPSIS B<cdbmake-wordlist> [B<-am>] [B<-l> I<min-length>] [B<-L> I<max-length>] - [B<-x> I<exclude> ...] I<wordlist> + [B<-o> I<output-wordlist>] [B<-x> I<exclude> ...] I<wordlist> =head1 DESCRIPTION @@ -130,6 +197,10 @@ B<cdbmake-wordlist> takes one argument, the input wordlist file. The output cdb database will have the same name as I<wordlist> but with C<.cdb> appended. The input wordlist file does not have to be sorted. 
+B<cdbmake-wordlist> can, instead of building a CDB file, filter a wordlist +against the criteria given on the command line and generate a new +wordlist. See the B<-o> option for more details. + =head1 OPTIONS =over 4 @@ -168,6 +239,17 @@ The default is not to filter out any words for minimum length. Print out this documentation (which is done simply by feeding the script to C<perldoc -t>). +=item B<-o> I<output-wordlist>, B<--output>=I<output-wordlist> + +Rather than creating a CDB database, apply the filter rules given by the +other command-line arguments and generate a new wordlist in the file name +given by the I<output-wordlist> option. This can be used to reduce the size of +a raw wordlist file (such as one taken from Internet sources) by removing +the words that will be filtered out of the CDB file anyway, thus reducing +the size of the source required to regenerate the CDB database. + +If this option is given, no CDB database will be created. + =item B<-x> I<exclude>, B<--exclude>=I<exclude> Filter all words matching the regular expression I<exclude> from the