3 # Turn a wordlist into a CDB or SQLite database.
5 # This program takes as input a word list (a file of words separated by
6 # newlines) and turns it into either a CDB or a SQLite database that can be
7 # used by the krb5-strength plugin or heimdal-strength program to check
8 # passwords against a password dictionary. It can also filter a word list in
9 # various ways to create a new word list.
11 ##############################################################################
12 # Declarations and configuration
13 ##############################################################################
20 use File::Basename qw(basename);
21 use Getopt::Long qw(GetOptions);
23 # The path to the cdb utility, used to create the final database. By default,
24 # the user's PATH is searched for cdb.
27 # The SQL used to create the SQLite database.
28 ## no critic (ValuesAndExpressions::ProhibitImplicitNewlines)
29 my $SQLITE_CREATE = q{
30 CREATE TABLE passwords (
31 password TEXT UNIQUE NOT NULL,
32 drowssap TEXT UNIQUE NOT NULL
36 # The SQL used to insert passwords into the database.
37 my $SQLITE_INSERT = q{
38 INSERT OR IGNORE INTO passwords (password, drowssap) values (?, ?)
42 ##############################################################################
44 ##############################################################################
46 # say with error checking and an explicit file handle.
48 # $fh - Output file handle
49 # @args - Remaining arguments to print
52 # Throws: Text exception on output failure
55 say {$fh} @args or croak("say failed: $!");
59 ##############################################################################
61 ##############################################################################
63 # Filter the given input file and write it to a CDB data file, and then use
64 # cdb to turn that into a database.
66 # $in_fh - Input file handle for the source wordlist
67 # $output - Name of the output CDB file
68 # $filter - Reference to sub that returns true to keep a word, false otherwise
71 # Throws: Text exception on output failure or existing output or .data file
73 my ($in_fh, $output, $filter) = @_;
75 # Check that the output CDB file doesn't exist.
77 die "$0: output file $output already exists\n";
80 # Create a temporary file to write the CDB input into.
81 my $tmp = $output . '.data';
83 die "$0: temporary output file $tmp already exists\n";
85 open(my $tmp_fh, '>', $tmp);
87 # Walk through the input word list and write each word that passes the
88 # filter to the output file handle as CDB data.
89 while (defined(my $word = <$in_fh>)) {
91 next if !$filter->($word);
92 my $length = length($word);
93 say_fh($tmp_fh, "+$length,1:$word->1");
96 # Add a trailing newline, required by the CDB data format, and close.
100 # Run cdb to turn the result into a CDB database. Ignore duplicate keys.
101 system($CDB, '-c', '-u', $output, $tmp) == 0
102 or die "$0: cdb -c failed\n";
104 # Remove the temporary file and return.
109 # Filter the given input file and write it to a newly-created SQLite database.
110 # Requires the DBI and DBD::SQLite modules be installed. The database will
111 # contain one table, passwords, with two columns, password and drowssap, which
112 # store the word and the word reversed for each word that passes the filter.
114 # $in_fh - Input file handle for the source wordlist
115 # $output - Name of the output SQLite database
116 # $filter - Reference to sub that returns true to keep a word, false otherwise
119 # Throws: Text exception on output failure, pre-existing output file, or
120 # missing Perl modules
122 my ($in_fh, $output, $filter) = @_;
124 # Check that the output SQLite file doesn't exist.
126 die "$0: output file $output already exists\n";
129 # Load the required modules.
133 # Open and create the database.
134 my $options = { PrintError => 0, RaiseError => 1, AutoCommit => 1 };
135 my $dbh = DBI->connect("dbi:SQLite:dbname=$output", q{}, q{}, $options);
136 $dbh->do($SQLITE_CREATE);
138 # Tune SQLite to improve the speed of bulk inserts. Use unsafe insert
139 # processing and increase the index cache to 500MB.
140 $dbh->do('PRAGMA synchronous = 0');
141 $dbh->do('PRAGMA cache_size = 500000');
143 # Start a transaction and prepare the insert statement for each word.
145 my $sth = $dbh->prepare($SQLITE_INSERT);
147 # Walk through the input word list and add each word that passes the
148 # filter to the database, both as-is and reversed.
149 while (defined(my $word = <$in_fh>)) {
151 next if !$filter->($word);
152 my $reversed = reverse($word);
153 $sth->execute($word, $reversed);
156 # Commit and close the database.
162 # Filter the given input file and write the results to a new wordlist.
164 # $in_fh - Input file handle for the source wordlist
165 # $output - Output file name to which to write the resulting wordlist
166 # $filter - Reference to sub that returns true to keep a word, false otherwise
169 # Throws: Text exception on output failure
171 my ($in_fh, $output, $filter) = @_;
172 open(my $out_fh, '>', $output);
174 # Walk through the input word list and write each word that passes the
175 # filter to the output file handle.
176 while (defined(my $word = <$in_fh>)) {
178 next if !$filter->($word);
179 say_fh($out_fh, $word);
187 ##############################################################################
189 ##############################################################################
191 # Given the parsed command-line options as a hash, construct a filter for the
192 # word list and return it. The filter will, given a word, return true if the
193 # word should be included in the dictionary and false otherwise.
195 # $config_ref - Hash of configuration options
196 # ascii - Strip non-printable or non-ASCII words
197 # exclude - Reference to array of regex patterns to exclude
198 # min-length - Minimum word length
199 # max-length - Maximum word length
201 # Returns: Filter function to check a word.
203 my ($config_ref) = @_;
205 # Build a filter from our command-line parameters. This is an anonymous
206 # sub that returns true to keep a word and false otherwise.
209 my $length = length($word);
210 my $min_length = $config_ref->{'min-length'};
211 my $max_length = $config_ref->{'max-length'};
214 return if (defined($min_length) && $length < $min_length);
215 return if (defined($max_length) && $length > $max_length);
217 # Check character classes.
218 if ($config_ref->{ascii}) {
219 return if $word =~ m{ [^[:ascii:]] }xms;
220 return if $word =~ m{ [[:cntrl:]] }xms;
223 # Check regex exclusions.
224 if ($config_ref->{exclude}) {
225 for my $pattern (@{ $config_ref->{exclude} }) {
226 return if $word =~ m{ $pattern }xms;
230 # Word passes. Return success.
236 ##############################################################################
238 ##############################################################################
240 # Always flush output.
243 # Clean up the script name for error reporting.
245 local $0 = basename($0);
247 # Parse the argument list.
250 'ascii|a', 'cdb|c=s', 'max-length|L=i', 'min-length|l=i',
251 'manual|man|m', 'output|o=s', 'sqlite|s=s', 'exclude|x=s@',
253 Getopt::Long::config('bundling', 'no_ignore_case');
254 GetOptions(\%config, @options);
255 if ($config{manual}) {
256 say_fh(\*STDOUT, 'Feeding myself to perldoc, please wait...');
257 exec('perldoc', '-t', $fullpath);
260 die "Usage: krb5-strength-wordlist <wordlist>\n";
262 if ($config{cdb} && ($config{output} || $config{sqlite})) {
263 die "$0: -c cannot be used with -o or -s\n";
264 } elsif ($config{output} && $config{sqlite}) {
265 die "$0: -o cannot be used with -c or -s\n";
267 my $input = $ARGV[0];
269 # Build the filter closure.
270 my $filter = build_filter(\%config);
272 # Write the filtered input as a wordlist, CDB file, or SQLite database.
273 open(my $in_fh, '<', $input);
274 if ($config{output}) {
275 write_wordlist($in_fh, $config{output}, $filter);
276 } elsif ($config{cdb}) {
277 write_cdb($in_fh, $config{cdb}, $filter);
278 } elsif ($config{sqlite}) {
279 write_sqlite($in_fh, $config{sqlite}, $filter);
287 ##############################################################################
289 ##############################################################################
292 krb5-strength-wordlist krb5-strength cdb whitespace lookups lookup
293 sublicense MERCHANTABILITY NONINFRINGEMENT krb5-strength --ascii Allbery
294 regexes output-wordlist heimdal-strength SQLite output-wordlist
295 output-sqlite DBI wordlist SPDX-License-Identifier MIT
299 krb5-strength-wordlist - Create a krb5-strength database from a word list
303 B<krb5-strength-wordlist> [B<-am>] [B<-c> I<output-cdb>] [B<-l> I<min-length>]
304 [B<-L> I<max-length>] [B<-o> I<output-wordlist>] [B<-s> I<output-sqlite>]
305 [B<-x> I<exclude> ...] I<wordlist>
309 B<krb5-strength-wordlist> converts a word list (a file containing one word
310 per line) into a database that can be used by the krb5-strength plugin or
311 B<heimdal-strength> command for checking passwords. Two database formats
312 are supported, with different features. CDB is more space-efficient and
313 possibly faster, but supports checking passwords only against exact
314 matches or simple transformations (removing small numbers of leading and
315 trailing characters). SQLite creates a much larger database, but supports
316 rejecting any password within edit distance one of a word in the word
319 CDB is a format invented by Dan Bernstein for fast, constant databases.
320 The database is fixed during creation and cannot be changed without
321 rebuilding it, and is optimized for very fast access. For cdb, the
322 database generated by this program will have keys for each word in the
323 word list and the constant C<1> as the value.
325 SQLite stores the word list in a single table containing both each word
326 and each word reversed. This allows the krb5-strength plugin or
327 B<heimdal-strength> command to reject passwords within edit distance one
328 of any word in the word list. (Edit distance one means that the word list
329 entry can be formed by changing a single character of the password, either
330 by adding one character, removing one character, or changing one character
331 to a different character.) However, the SQLite database will be much
332 larger and lookups may be somewhat slower.
334 B<krb5-strength-wordlist> takes one argument, the input word list file.
335 Use the B<-c> option to specify an output CDB file, B<-s> to specify an
336 output SQLite file, or B<-o> to just filter the word list against the
337 criteria given on the command line and generate a new word list.
338 The input word list file does not have to be sorted. See the individual
339 option descriptions for more information.
345 =item B<-a>, B<--ascii>
347 Filter all words that contain non-ASCII characters or control characters
348 from the resulting output, leaving only words that consist solely of
349 ASCII non-control characters.
351 =item B<-c> I<output-cdb>, B<--cdb>=I<output-cdb>
353 Create a CDB database in I<output-cdb>. A temporary file named after
354 I<output-cdb> with C<.data> appended will be created in the same directory
355 and used to stage the database contents. The actual CDB file will be
356 built using the B<cdb> command, which must be on the user's path. If
357 either file already exists, B<krb5-strength-wordlist> will abort with an
360 This option cannot be used with B<-o> or B<-s>.
362 =item B<-L> I<maximum>, B<--max-length>=I<maximum>
364 Filter all words of length greater than I<maximum> from the resulting
365 output. The length of each line (minus the separating newline) in the
366 input word list will be checked against I<maximum>, and the line will be
367 filtered out of the resulting output if it is longer. Useful for generating
368 password dictionaries from word lists that contain random noise that's
369 highly unlikely to be used as a password.
371 The default is to not filter out any words for maximum length.
373 =item B<-l> I<minimum>, B<--min-length>=I<minimum>
375 Filter all words of length less than I<minimum> from the resulting
376 output. The length of each line (minus the separating newline) in the
377 input word list will be checked against I<minimum>, and the line will be
378 filtered out of the resulting output if it is shorter. Useful for generating
379 password dictionaries where shorter passwords will be rejected by a
380 generic length check and no dictionary lookup will be done for a transform
381 of the password shorter than the specified minimum.
383 The default is not to filter out any words for minimum length.
385 =item B<-m>, B<--man>, B<--manual>
387 Print out this documentation (which is done simply by feeding the script to
390 =item B<-o> I<wordlist>, B<--output>=I<wordlist>
392 Rather than creating a database, apply the filter rules given by the other
393 command-line arguments and generate a new word list in the file name given
394 by the I<wordlist> option. This can be used to reduce the size of a raw
395 word list file (such as one taken from Internet sources) by removing the
396 words that will be filtered out of the dictionary anyway, thus reducing
397 the size of the source required to regenerate the dictionary.
399 This option cannot be used with B<-c> or B<-s>.
401 =item B<-s> I<output-sqlite>, B<--sqlite>=I<output-sqlite>
403 Create a SQLite database in I<output-sqlite>. If this file already
404 exists, B<krb5-strength-wordlist> will abort with an error. The resulting
405 SQLite database will have one table, C<passwords>, with two columns,
406 C<password> and C<drowssap>. The first holds a word from the word list,
407 and the second holds the same word reversed.
409 Using this option requires the DBI and DBD::SQLite Perl modules be
412 This option cannot be used with B<-c> or B<-o>.
414 =item B<-x> I<exclude>, B<--exclude>=I<exclude>
416 Filter all words matching the regular expression I<exclude> from the
417 resulting output. This regular expression will be matched against
418 each line of the source word list after the trailing newline is removed.
419 This option may be given repeatedly to add multiple exclusion regexes.
425 Russ Allbery <eagle@eyrie.org>
427 =head1 COPYRIGHT AND LICENSE
429 Copyright 2016, 2020, 2023 Russ Allbery <eagle@eyrie.org>
431 Copyright 2013-2014 The Board of Trustees of the Leland Stanford Junior
434 Permission is hereby granted, free of charge, to any person obtaining a
435 copy of this software and associated documentation files (the "Software"),
436 to deal in the Software without restriction, including without limitation
437 the rights to use, copy, modify, merge, publish, distribute, sublicense,
438 and/or sell copies of the Software, and to permit persons to whom the
439 Software is furnished to do so, subject to the following conditions:
441 The above copyright notice and this permission notice shall be included in
442 all copies or substantial portions of the Software.
444 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
445 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
446 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
447 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
448 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
449 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
450 DEALINGS IN THE SOFTWARE.
452 SPDX-License-Identifier: MIT
456 cdb(1), L<DBI>, L<DBD::SQLite>
458 The cdb file format is defined at L<http://cr.yp.to/cdb.html>.
460 The current version of this program is available from its web page at
461 L<https://www.eyrie.org/~eagle/software/krb5-strength/> as part of the
462 krb5-strength package.
467 # copyright-at-end-flag: t