#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

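# Takes a stream of documents from 'tokenize' and prints to STDOUT the
# n-grams (of the window size given on the command line) that reach the
# requested frequency threshold.  N-grams are printed in the order they
# are encountered; the output is not sorted.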

use strict;
use Digest::MD5 qw( md5 );
use Bloom16;

# Help text for the command line.  It is printed to STDOUT, followed by a
# successful exit, whenever any argument starting with "--" is given.
my $usage = <<"EOU";

Usage: ngrams [window] [threshold]

	window - {1, 2, 3}-gram etc.
	threshold - how many times should an ngram be seen before 
				taking note of it? (Maximum 15)

This program takes a stream of text and prints to STDOUT the ngrams 
it has seen [threshold] or more times. Usage examples:

# cat text.txt | ngrams 3 10 > trigrams_over10.txt
# tar zxOf /corpora/reuters0.tar.gz | tokenize | ngrams 2 11 > rtrbi_11.txt

EOU

# Command-line handling.
#
# Any argument starting with "--" (e.g. --help) is taken as a request for
# the usage text.  Otherwise the first two positional arguments are the
# window size and the threshold; whatever remains in @ARGV is left for the
# <> operator to open as input files (STDIN when nothing remains).
my @help = grep /^--/, @ARGV;
@ARGV = grep !/^--/, @ARGV;

if( @help ){
	print $usage;
	exit 0;
}

my $N = shift; # window size (1, 2, 3)
my $T = shift; # threshold

# Both arguments are required.  The window must be a positive integer and
# the threshold a non-negative integer no larger than 15, the maximum the
# usage text documents.  Without this check a missing or non-numeric
# argument is silently coerced and the run produces meaningless output.
my $window_ok    = defined $N && $N =~ /\A[1-9]\d*\z/;
my $threshold_ok = defined $T && $T =~ /\A\d+\z/ && $T <= 15;

unless( $window_ok && $threshold_ok ){
	print STDERR $usage;
	exit 1;
}

# From here on $N is the index of the last word in the window (size - 1).
$N--;

# The main loop prints an n-gram when $bloom->filter() returns exactly
# this value, i.e. threshold + 1.
$T++;

# Bloom16 object used to tally n-grams.  The value returned by filter()
# is compared against $T below -- presumably a per-string occurrence
# count; confirm against the Bloom16 module.
my $bloom = Bloom16->new(536_870_912);  # 2**29 ~ 67 megs

my $tot_words = 0; # digit-free tokens taken from the documents processed
my $seen = 0;      # n-grams printed so far

# Read the input one document at a time: every record ends at a closing
# </DOC> tag.  The record is not chomped, so the tag itself stays in the
# text that gets tokenized.
# NOTE(review): confirm that keeping the tag as a token is intended.
$/ = '</DOC>';

my $doc_count = 0; # documents that carried a <DOCNO> tag

DOC: while( defined( my $doc = <> ) ){

	# Whatever follows the last </DOC> (normally just a newline) arrives
	# as one more record; skip such blank records without complaint.
	next DOC unless $doc =~ /\S/;

	# Strip the <DOCNO> element so the document id is not counted as text.
	# Diagnostics go to STDERR so they never end up in the n-gram output.
	unless( $doc =~ s|<DOCNO>([^<]+)</DOCNO>|| ){
		print STDERR "John Doe document, skipping.\n";
		next DOC;
	}
	$doc_count++;

	# Tokenize on whitespace (split ' ' ignores leading whitespace, so no
	# empty first token) and drop every token that contains a digit, so
	# n-grams span across the removed tokens.
	my @tokens = grep { !/\d/ } split ' ', $doc;

	# A document with fewer than N+1 usable tokens has no complete n-gram.
	next DOC if @tokens <= $N;

	$tot_words += @tokens;

	# Slide a window of N+1 tokens over the document.
	for my $first ( 0 .. $#tokens - $N ){

		my $ngram = join(' ', @tokens[ $first .. $first + $N ]);

		# Print the n-gram only when the filter's return value equals $T
		# exactly.
		next unless $bloom->filter($ngram) == $T;

		print $ngram, "\n";

		# Progress indicator: show every 50th printed n-gram on STDERR.
		if( $seen % 50 == 0 ){
			print STDERR chr(13), $ngram;
		}
		$seen++;
	}

	# Check to see if bloom tables need purging.
	if( $bloom->full ){
		print STDERR "\nFilter out of space!\n";
		$bloom->reset;
	}
}


# Final tally on STDERR.  The leading carriage return moves the cursor
# back to the start of the line, so the tally is written over the last
# progress string.  $N holds window size - 1, hence the + 1.
my $window_size = $N + 1;
print STDERR chr(13) . "Found $seen ${window_size}grams\n\n";

exit 0;



