#!/usr/bin/perl -X
use strict;
use warnings;
use Bio::SeqIO;
use Getopt::Long;
use Bio::Phylo::Util::Logger ':levels';

# Writes the first record for each sequence ID from infile to outfile,
# skipping (and logging) any later record whose ID was already seen
#
# Usage:
# seqfilter -i infile.fasta -o outfile.fasta [-v]

# process command line arguments
#
# Recognised options (the usage line relies on Getopt::Long accepting the
# abbreviated forms -i, -o and -v):
#   --infile  <file>  FASTA file to read; when omitted, STDIN is read
#   --outfile <file>  FASTA file to write; when omitted, STDOUT is written
#   --verbose         may be repeated; each occurrence increments $verbosity
#                     by one, starting from INFO
my $verbosity = INFO;
my $infile;
my $outfile;

# GetOptions returns false when the command line could not be parsed (for
# example an unknown option); stop here instead of silently carrying on with
# whatever was understood
GetOptions(
	'infile=s'  => \$infile,
	'outfile=s' => \$outfile,
	'verbose+'  => \$verbosity,
) or die "Usage: $0 [--infile <fasta>] [--outfile <fasta>] [--verbose]\n";

# configure logging for the 'main' class at the requested level; the returned
# object is not kept -- presumably the logger stores its settings globally
# (TODO confirm against the Bio::Phylo::Util::Logger documentation)
Bio::Phylo::Util::Logger->new( 
	'-level' => $verbosity, 
	'-class' => 'main' 
);

# open the input stream: the FASTA file named by --infile, or STDIN when no
# usable file name was supplied
my $in;

# test for defined-and-non-empty rather than plain truth, so that a file that
# is literally named "0" is opened instead of being mistaken for "no file"
if ( defined $infile and length $infile ) {
	INFO "reading from infile $infile";
	$in = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-file'   => $infile,
	);
}
else {
	INFO "reading from STDIN";
	$in = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-fh'     => \*STDIN,
	);
}

# open the output stream: the FASTA file named by --outfile, or STDOUT when no
# usable file name was supplied
my $out;

# test for defined-and-non-empty rather than plain truth, so that a file that
# is literally named "0" is written instead of silently falling back to STDOUT
if ( defined $outfile and length $outfile ) {
	INFO "writing to outfile $outfile";
	$out = Bio::SeqIO->new(
		'-format' => 'fasta',
		# the leading '>' is how Bio::SeqIO is asked to open the file for writing
		'-file'   => ">$outfile",
	);
}
else {
	INFO "writing to STDOUT";
	$out = Bio::SeqIO->new(
		'-format' => 'fasta',
		'-fh'     => \*STDOUT,
	);
}

# de-duplicate by sequence ID: only the first record carrying a given ID is
# written to the output; every later record with that ID is logged and dropped
my %seen;
while ( my $record = $in->next_seq ) {
	my $id = $record->id;

	# post-increment yields the count *before* this record, so it is false
	# exactly once per ID -- on its first occurrence
	if ( $seen{$id}++ ) {
		INFO "already seen ID $id";
		next;
	}

	$out->write_seq( $record );
}

