Add wordlist filter mode to cdbmake-wordlist

author Russ Allbery <eagle@eyrie.org>

Mon, 4 Nov 2013 21:16:54 +0000 (13:16 -0800)

committer Russ Allbery <eagle@eyrie.org>

Mon, 4 Nov 2013 21:16:54 +0000 (13:16 -0800)
author Russ Allbery <eagle@eyrie.org>
Mon, 4 Nov 2013 21:16:54 +0000 (13:16 -0800)
committer Russ Allbery <eagle@eyrie.org>
Mon, 4 Nov 2013 21:16:54 +0000 (13:16 -0800)
diff --git a/tests/tools/cdbmake-wordlist-t b/tests/tools/cdbmake-wordlist-t

index ca4b53df9a5e142625e8a4f3e96f49969056b5eb..08d8d91eaddb62aa63f3826f13f48e016544ebfa 100755 (executable)
--- a/tests/tools/cdbmake-wordlist-t
+++ b/tests/tools/cdbmake-wordlist-t
@@ -17,7 +17,7 @@ if ! command -v cdb >/dev/null 2>&1 ; then
  fi
  
  # Output the test plan.
-plan 18
+plan 20
  
  # Create a temporary directory and wordlist and ensure it's writable.
  tmpdir=`test_tmpdir`
@@ -79,6 +79,14 @@ ok_program 'Database still contains happenstance' 0 '1' \
  ok_program 'Database does not contain password' 100 '' \
      cdb -q "$tmpdir/wordlist.cdb" password
  
+# Try filtering the wordlist into a new wordlist.
+ok_program 'Wordlist filtering' 0 '' \
+    "$cdbmake" -a -x '.*d' -l 8 -o "$tmpdir/wordlist.new" "$tmpdir/wordlist"
+( echo 'bitterbane'; echo 'happenstance' ) > "$tmpdir/wordlist.expected"
+ok_program 'Filtered wordlist is correct' 0 '' \
+    cmp "$tmpdir/wordlist.expected" "$tmpdir/wordlist.new"
+rm -f "$tmpdir/wordlist.expected" "$tmpdir/wordlist.new"
+
  # Clean up.
  rm -f "$tmpdir/wordlist.cdb"
  rm -f "$tmpdir/wordlist"
diff --git a/tools/cdbmake-wordlist b/tools/cdbmake-wordlist

index 8cc86c57522551c788f195f2b8e6fed836629df5..74621a7ef7b1ad93cf005af286a4036a5203a2ec 100755 (executable)
--- a/tools/cdbmake-wordlist
+++ b/tools/cdbmake-wordlist
@@ -34,6 +34,74 @@ sub print_fh {
      return;
  }
  
+# Filter the given input file and write it to a CDB data file, and then use
+# cdb to turn that into a database.
+#
+# $in_fh  - Input file handle for the source wordlist
+# $input  - Name of the input file, from which the CDB file name is derived
+# $filter - Reference to sub that returns true to keep a word, false otherwise
+#
+# Returns: undef
+#  Throws: Text exception on I/O or cdb failure or pre-existing temporary file
+sub write_cdb {
+    my ($in_fh, $input, $filter) = @_;
+
+    # Create a temporary file to write the CDB input into.
+    my $tmp = $input . '.data';
+    if (-f $tmp) {
+        die "$0: temporary output file $tmp already exists\n";
+    }
+    open(my $tmp_fh, '>', $tmp)
+      or die "$0: cannot create output file $tmp: $!\n";
+
+    # Walk through the input word list and write each word that passes the
+    # filter to the output file handle as CDB data.
+    while (defined(my $word = <$in_fh>)) {
+        chomp($word);
+        next if !$filter->($word);
+        my $length = length($word);
+        print_fh($tmp_fh, "+$length,1:$word->1\n");
+    }
+
+    # Add a trailing newline, required by the CDB data format, and close.
+    print_fh($tmp_fh, "\n");
+    close($tmp_fh) or die "$0: cannot write to temporary file $tmp: $!\n";
+
+    # Run cdb to turn the result into a CDB database.  Ignore duplicate keys.
+    system($CDB, '-c', '-u', "$input.cdb", $tmp) == 0
+      or die "$0: cdb -c failed\n";
+
+    # Remove the temporary file and return.
+    unlink($tmp) or die "$0: cannot remove temporary file $tmp: $!\n";
+    return;
+}
+
+# Filter the given input file and write the results to a new wordlist.
+#
+# $in_fh  - Input file handle for the source wordlist
+# $output - Output file name to which to write the resulting wordlist
+# $filter - Reference to sub that returns true to keep a word, false otherwise
+#
+# Returns: undef
+#  Throws: Text exception on output failure
+sub write_wordlist {
+    my ($in_fh, $output, $filter) = @_;
+    open(my $out_fh, '>', $output)
+      or die "$0: cannot create output file $output: $!\n";
+
+    # Walk through the input word list and write each word that passes the
+    # filter to the output file handle.
+    while (defined(my $word = <$in_fh>)) {
+        chomp($word);
+        next if !$filter->($word);
+        print_fh($out_fh, "$word\n");
+    }
+
+    # All done.
+    close($out_fh) or die "$0: cannot write to output file $output: $!\n";
+    return;
+}
+
  # Always flush output.
  STDOUT->autoflush;
  
@@ -42,13 +110,14 @@ my $fullpath = $0;
  local $0 = basename($0);
  
  # Parse the argument list.
-my ($ascii, @exclude, $max_length, $min_length, $manual);
+my ($ascii, @exclude, $max_length, $min_length, $manual, $output);
  Getopt::Long::config('bundling', 'no_ignore_case');
  GetOptions(
      'ascii|a'        => \$ascii,
      'max-length|L=i' => \$max_length,
      'min-length|l=i' => \$min_length,
      'manual|man|m'   => \$manual,
+    'output|o=s'     => \$output,
      'exclude|x=s'    => \@exclude,
  );
  if ($manual) {
@@ -60,43 +129,40 @@ if (@ARGV != 1) {
  }
  my $input = $ARGV[0];
  
-# The output file goes in the same directory and is named the same as the
-# input but with .data appended.
-my $output = $input . '.data';
-if (-f $output) {
-    die "$0: temporary output file $output already exists\n";
-}
-
-# Process the input file into the output file, converting it to cdb input
-# format.
-open(my $in, '<', $input)
-  or die "$0: cannot open input file $input: $!\n";
-open(my $out, '>', $output)
-  or die "$0: cannot create output file $output: $!\n";
-WORD: while (defined(my $word = <$in>)) {
-    chomp($word);
+# Build a filter from our command-line parameters.  This is an anonymous sub
+# that returns true to keep a word and false otherwise.
+my $filter = sub {
+    my ($word) = @_;
      my $length = length($word);
-    next if (defined($min_length) && $length < $min_length);
-    next if (defined($max_length) && $length > $max_length);
+
+    # Check length.
+    return if (defined($min_length) && $length < $min_length);
+    return if (defined($max_length) && $length > $max_length);
+
+    # Check character classes.
      if ($ascii) {
-        next if $word =~ m{ [^[:ascii:]] }xms;
-        next if $word =~ m{ [[:cntrl:]] }xms;
+        return if $word =~ m{ [^[:ascii:]] }xms;
+        return if $word =~ m{ [[:cntrl:]] }xms;
      }
+
+    # Check regex exclusions.
      for my $pattern (@exclude) {
-        next WORD if $word =~ m{ $pattern }xms;
+        return if $word =~ m{ $pattern }xms;
      }
-    print_fh($out, "+$length,1:$word->1\n");
-}
-print_fh($out, "\n");
-close($in)  or die "$0: cannot read all of input file $input: $!\n";
-close($out) or die "$0: cannot write to output file $output: $!\n";
  
-# Run cdb to turn the result into a constant database.  Ignore duplicate keys.
-system($CDB, '-c', '-u', "$input.cdb", $output) == 0
-  or die "$0: cdb -c failed\n";
+    # Word passes.  Return success.
+    return 1;
+};
  
-# Remove the temporary file.
-unlink($output) or die "$0: cannot remove temporary file $output: $!\n";
+# Process the input file into either wordlist output or a CDB file.
+open(my $in_fh, '<', $input)
+  or die "$0: cannot open input file $input: $!\n";
+if (defined($output)) {
+    write_wordlist($in_fh, $output, $filter);
+} else {
+    write_cdb($in_fh, $input, $filter);
+}
+close($in_fh) or die "$0: cannot read all of input file $input: $!\n";
  
  # All done.
  exit(0);
@@ -104,7 +170,8 @@ __END__
  
  =for stopwords
  cdbmake-wordlist cdb whitespace wordlist lookups lookup sublicense
-MERCHANTABILITY NONINFRINGEMENT krb5-strength --ascii Allbery
+MERCHANTABILITY NONINFRINGEMENT krb5-strength --ascii Allbery regexes
+output-wordlist
  
  =head1 NAME
  
@@ -113,7 +180,7 @@ cdbmake-wordlist - Create a cdb database from a wordlist
  =head1 SYNOPSIS
  
  B<cdbmake-wordlist> [B<-am>] [B<-l> I<min-length>] [B<-L> I<max-length>]
-    [B<-x> I<exclude> ...] I<wordlist>
+    [B<-o> I<output-wordlist>] [B<-x> I<exclude> ...] I<wordlist>
  
  =head1 DESCRIPTION
  
@@ -130,6 +197,10 @@ B<cdbmake-wordlist> takes one argument, the input wordlist file.  The
  output cdb database will have the same name as I<wordlist> but with
  C<.cdb> appended.  The input wordlist file does not have to be sorted.
  
+B<cdbmake-wordlist> can, instead of building a CDB file, filter a wordlist
+against the criteria given on the command line and generate a new
+wordlist.  See the B<-o> option for more details.
+
  =head1 OPTIONS
  
  =over 4
@@ -168,6 +239,17 @@ The default is not to filter out any words for minimum length.
  Print out this documentation (which is done simply by feeding the script to
  C<perldoc -t>).
  
+=item B<-o> I<wordlist>, B<--output>=I<wordlist>
+
+Rather than creating a CDB database, apply the filter rules given by the
+other command-line arguments and write the resulting wordlist to the file
+named by the I<wordlist> argument.  This can be used to reduce the size of
+a raw wordlist file (such as one taken from Internet sources) by removing
+the words that will be filtered out of the CDB file anyway, thus reducing
+the size of the source required to regenerate the CDB database.
+
+If this option is given, no CDB database will be created.
+
  =item B<-x> I<exclude>, B<--exclude>=I<exclude>
  
  Filter all words matching the regular expression I<exclude> from the
author	Russ Allbery <eagle@eyrie.org>
	Mon, 4 Nov 2013 21:16:54 +0000 (13:16 -0800)
committer	Russ Allbery <eagle@eyrie.org>
	Mon, 4 Nov 2013 21:16:54 +0000 (13:16 -0800)
tests/tools/cdbmake-wordlist-t		patch \| blob \| history
tools/cdbmake-wordlist		patch \| blob \| history