orthograph-manager

#!/usr/bin/env perl
#--------------------------------------------------
# This file is part of Orthograph.
# Copyright 2014 Malte Petersen <mptrsen@uni-bonn.de>
# 
# Orthograph is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
# 
# Orthograph is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along with
# Orthograph. If not, see http://www.gnu.org/licenses/.
#-------------------------------------------------- 

# Pragmas
use strict;         # make me write clean code
use warnings;       # cry if something seems odd
use autodie;        # die automatically on I/O functions

# Core modules
use Carp;           # carp and croak: warn and die with call stack
use File::Temp;     # temporary files
use File::Path qw( remove_tree make_path );	# this also uses File::Spec
use FindBin;        # locate the dir of this script during compile time
use Getopt::Long;   # parse command line arguments
use Data::Dumper;   # good for debugging

# Custom modules
use lib $FindBin::RealBin;               # $RealBin is the directory of the original script
use Seqload::Fasta qw( check_if_fasta ); # object-oriented access to fasta files
use Orthograph::Functions;               # functions for all Orthograph tools
use Orthograph::Config;                  # configuration in global hashref $config

#--------------------------------------------------
# # Take over the global configuration hashref from Orthograph::Config
#--------------------------------------------------
my $config = $Orthograph::Config::config;

#--------------------------------------------------
# # The user only wants help: print the usage text and stop, provided
# # print_usage() reports success with a true value
#--------------------------------------------------
if ($config->{'help'} and Orthograph::Functions::print_usage($config)) {
	exit;
}


#--------------------------------------------------
# # These variables can be set in the config file.
# # Every group below is a hash slice of $config: the n-th variable in the
# # list receives the value stored under the n-th key.
#--------------------------------------------------

# Database settings
my (
	$database_backend,
	$db_dbname,
	$db_dbpwd,
	$db_dbserver,
	$db_dbuser,
	$db_prefix,
) = @{$config}{qw(
	database-backend
	mysql-database
	mysql-password
	mysql-server
	mysql-username
	db-prefix
)};

# Database table names
my (
	$db_table_aaseqs,
	$db_table_blast,
	$db_table_blastdbs,
	$db_table_ests,
	$db_table_hmmsearch,
	$db_table_log_evalues,
	$db_table_scores,
	$db_table_ntseqs,
	$db_table_ogs,
	$db_table_orthologs,
	$db_table_seqpairs,
	$db_table_seqtypes,
	$db_table_set_details,
	$db_table_taxa,
	$db_table_temp,
) = @{$config}{qw(
	db_table_aaseqs
	db_table_blast
	db_table_blastdbs
	db_table_ests
	db_table_hmmsearch
	db_table_log_evalues
	db_table_scores
	db_table_ntseqs
	db_table_ogs
	db_table_orthologs
	db_table_sequence_pairs
	db_table_sequence_types
	db_table_set_details
	db_table_taxa
	db_table_temp
)};

# Programs in the order of their use
my (
	$alignment_program,
	$hmmbuild_program,
	$translate_program,
	$exonerate_program,
	$hmmsearch_program,
	$blast_program,
	$makeblastdb_program,
	$swipe_program,
) = @{$config}{qw(
	alignment-program
	hmmbuild-program
	translate-program
	exonerate-program
	hmmsearch-program
	blast-program
	makeblastdb-program
	swipe-program
)};

# Actions
my (
	$create,
	$delete_all,
	$delete_ogs,
	$delete_set,
	$listtaxa,
	$listsets,
	$listogs,
	$isnucleotide,
	$ispeptide,
	$overwrite,
	$testdeps,
) = @{$config}{qw(
	create
	destroy
	delete-ogs
	delete-set
	list-taxa
	list-sets
	list-ogs
	load-ogs-nucleotide
	load-ogs-peptide
	overwrite
	test-deps
)};

# Other variables
my (
	$verbose,
	$outputdir,
	$reference_taxon_shorthand,
	$ogsversion,
	$taxon_name,
	$orthodb5_format,
) = @{$config}{qw(
	verbose
	output-directory
	reference-taxon-shorthand
	ogs-version
	ogs-taxon-name
	orthodb5-format
)};

# Substitution character for selenocysteine, which normally leads to blast freaking out
my ($u_subst) = @{$config}{qw( substitute-u-with )};

# Work out which database backend the configuration asks for. Both flags are
# derived from a case-insensitive match on the backend name.
my $use_mysql  = 0;
my $use_sqlite = 0;
if ($database_backend =~ /mysql/i)  { $use_mysql  = 1 }
if ($database_backend =~ /sqlite/i) { $use_sqlite = 1 }
my $database;

# The wrapper module is loaded at runtime so that only the backend actually in
# use has to be available. MySQL wins if both flags are set.
if ($use_mysql) {
	require Wrapper::Mysql;
	$database = $config->{'mysql-database'};
}
elsif ($use_sqlite) {
	require Wrapper::Sqlite;
	$database = $config->{'sqlite-database'};
}

# Table names, kept in the old hash structure that the wrapper functions
# are handed (see drop_tables, create_tables, ...)
my $t = {
	aaseqs      => $db_table_aaseqs,
	blastdbs    => $db_table_blastdbs,
	ntseqs      => $db_table_ntseqs,
	ogs         => $db_table_ogs,
	orthologs   => $db_table_orthologs,
	seqpairs    => $db_table_seqpairs,
	seqtypes    => $db_table_seqtypes,
	set_details => $db_table_set_details,
	taxa        => $db_table_taxa,
	temp        => $db_table_temp,
};

# File-wide working variables; the string ones all start out empty
my ($blastdir, $description, $name, $ogs_name, $setdir, $setname, $sql) = ('') x 7;
my $temptable  = $db_table_temp;
my $total_seqs = 0;
my %names      = ( );

my $infile = '';

# Both credentials are required, whatever the backend
if (!defined $db_dbuser or !defined $db_dbpwd) {
	die "Fatal: You must specify database username (--mysql-username) and password (--mysql-password).\n";
}

# name == username unless defined name
# NOTE(review): $name is initialised to '' above, so this never fires as written
if (!defined $name) {
	$name = $db_dbuser;
}

#--------------------------------------------------
# # Drop the complete table structure after confirmation
#--------------------------------------------------
if ($delete_all) {
	print "This will erase your Orthograph database structure. Are you sure (y/n)? ";
	exit unless get_yn();
	print "OK, whatever you say.\n";
	drop_tables();
	print "OK.\n";
	exit;
}
#--------------------------------------------------
# # Create a clean slate: drop whatever is there, then create the tables anew
#--------------------------------------------------
elsif ($create) {
	print "This will create a new Orthograph table structure in the database '$database' with the table prefix '$db_prefix'.\nAny existing Orthograph table structures with that prefix will be erased. Are you sure (y/n)? ";
	exit unless get_yn();
	if (!-e $outputdir) {
		die "Fatal: output directory $outputdir specified, but does not exist\n";
	}
	drop_tables();
	create_tables();
	print "Created Orthograph table structure in database '$database'.\n";
	if ($use_sqlite) {
		# an empty attached database file is removed, a non-empty one is kept
		if (!-z Wrapper::Sqlite::attached_db_file()) {
			print "Did not delete species database '" . Wrapper::Sqlite::attached_db_file() . "'  as it is not empty\n";
		}
		else {
			unlink Wrapper::Sqlite::attached_db_file();
		}
	}
	exit;
}

# Remove one ortholog set (no confirmation is asked for here)
elsif ($delete_set) {
	delete_set($delete_set);
	print "OK, removed orthology relationships for set '$delete_set' completely. No sequences were deleted from database.\n";
	exit;
}

# Remove the sequences of one official gene set after confirmation
elsif ($delete_ogs) {
	print "Deleting OGS sequences from $delete_ogs. Are you sure (y/n)? ";
	exit unless get_yn();
	my $removed = delete_ogs($delete_ogs);
	print "OK, removed $removed sequences for OGS '$delete_ogs' from the database.\n";
	exit;
}

#--------------------------------------------------
# Listing, dependency testing and OGS loading. The listing branches exit only
# if the listing function returns a true value.
#--------------------------------------------------
if ($listsets) {
	exit if list_sets();
}

# list ogs
elsif ($listogs) {
	exit if print_ogs_list();
}

# list taxa
elsif ($listtaxa) {
	exit if list_set_details();
}

# test dependencies and everything
elsif ($testdeps) {
	Orthograph::Functions::test_dependencies(
		$translate_program,
		$alignment_program,
		$hmmbuild_program,
		$makeblastdb_program,
		$hmmsearch_program,
		$blast_program,
		$exonerate_program,
		$swipe_program,
	);
	exit;
}

# load a nucleotide ogs file (sequence type 1)
elsif ($isnucleotide) {
	$infile = $isnucleotide;
	unless (upload_ogs($infile, 1)) {
		fail_and_exit("Nucleotide loading failed");
	}
	exit;
}

# load a peptide ogs file (sequence type 2)
elsif ($ispeptide) {
	$infile = $ispeptide;
	unless (upload_ogs($infile, 2)) {
		fail_and_exit("Peptide loading failed");
	}
	exit;
}

# No action flag matched, so the first remaining argument must be an orthodb file.
# Without one, the usage text is printed and the program exits with status 1.
$infile = shift @ARGV;
unless ($infile) {
	exit 1 if Orthograph::Functions::print_usage($config);
}
die("Fatal: file not found: $infile\n") unless -f $infile;

#--------------------------------------------------
# # continue and gather information
#--------------------------------------------------

# nothing can be loaded without the table structure
fail_and_exit('Orthograph database structure not present! Did you forget to run `orthograph-manager -create`?')
	unless db_structure_present();

print "Setting up core ortholog set from '$infile' in database '$database' on host '$db_dbserver'... (press CTRL+C to abort)\n";

# ask for the set name unless there already is one
$setname ||= get_setname();

# TODO rewrite this part using parametrized queries to protect from SQL injections?
# An existing set of that name is only touched with the overwrite option
# and an explicit confirmation.
if (set_exists($setname)) {
	warn "Warning: An ortholog set with this name already exists in $db_dbname on $db_dbserver.\n";
	if (!$overwrite) {
		print "Pick a different name or use -o for overwriting.\n";
		exit;
	}
	print "Overwriting existing entries (may cause data disintegration). Are you sure (y/n)? ";
	unless (get_yn()) {
		exit if print "OK, exiting\n";
	}
}

$description = get_answer("\nEnter a description for the set (optional but recommended). IMPORTANT: Do not use any commas (,)");

# NOTE(review): $setname has already been used above; this check looks unreachable
if (!defined $setname) {
	print "Fatal: You must specify the core-ortholog set name (-s).\n";
	exit;
}

#--------------------------------------------------
# # read the input file and create a CSV file with everything we know
#--------------------------------------------------

# read the input file and let the user map each taxon shorthand to an existing OGS
my $ogsid_for = get_all_ogsids($infile);

# register the new set in the database
my $new_setid = insert_new_set($setname, $description);
fail_and_exit("Failed to insert new set into database") unless $new_setid;

# write everything we need into a csv file
my $csvf = make_csv_for_set_uploading($infile, $new_setid, $ogsid_for);

# load the csv content into the temp table
unless (load_set_into_temptable($csvf)) {
	fail_and_exit("Failed to upload set file");
}

# set up the orthologous relationships in the database
unless (insert_orthologs()) {
	fail_and_exit("Failed to establish ortholog relationships in database");
}

my $n      = get_number_of_orthologs_for_set($new_setid);
my $n_cogs = get_number_of_cogs_for_set($new_setid);

# record the blast db info for the new set
insert_blastdb($new_setid);

printf "Done: %d orthologous sequence relationships (%d COGs) for set %s\n", $n, $n_cogs, $setname;

exit;


#--------------------------------------------------
# # Functions follow 
#-------------------------------------------------- 

# Sub: fill_tables_from_temp_table
# Calls fill_tables_from_temp_table of the active backend wrapper with the
# table name hash and the name of the temporary table.
# Arguments: none
# Returns: whatever the wrapper function returns
sub fill_tables_from_temp_table {
	return Wrapper::Mysql::fill_tables_from_temp_table($t, $temptable)  if $use_mysql;
	return Wrapper::Sqlite::fill_tables_from_temp_table($t, $temptable) if $use_sqlite;
}
#--------------------------------------------------
# # create a temporary table for high-speed loading of the data. we can sort it out later.
#--------------------------------------------------

# Sub: load_temp_table_with_data
# Creates the temporary table through the active backend wrapper, loads a CSV
# file into it, then asks the user for the complete name of every taxon
# shorthand found in the table and writes those names back.
# Arguments: scalar string, path of the CSV file
# Returns: the result of the final disconnect call
# Side effects: fills the file-wide %names hash and leaves the last statement
#               handle in $sql
sub load_temp_table_with_data {
	my $csvfile = shift @_;

	if ($use_mysql) {
		Wrapper::Mysql::create_temp_table($temptable);
		Wrapper::Mysql::load_csv_into_temptable($csvfile, $temptable);
	}
	elsif ($use_sqlite) {
		Wrapper::Sqlite::create_temp_table($temptable);
		Wrapper::Sqlite::load_csv_into_temptable($csvfile, $temptable);
	}

	print "Successfully loaded $total_seqs sequences from $infile into database.\n";

	# collect the shorthands from the temp table; the long names do not get
	# updated automatically, so every shorthand starts out without one
	my $dbh = get_dbh();
	$sql = $dbh->prepare("SELECT DISTINCT name, longname FROM $temptable");
	$sql->execute();
	while (my @result = $sql->fetchrow_array) {
		$names{$result[0]} = '';
	}
	$dbh->disconnect;

	# ask the user for the taxon names that aren't complete
	foreach my $shorthand (sort(keys %names)) {
		unless ($names{$shorthand}) {
			$names{$shorthand} = get_answer("Enter complete name for the shorthand '$shorthand' (no commas!)");
		}
	}

	# Update the temptable with the collected info. The values were typed in by
	# the user, so they are bound as placeholders instead of being pasted into
	# the SQL text; the statement is prepared once and reused for every row.
	# The table name cannot be bound and comes from the configuration.
	$dbh = get_dbh();
	$sql = $dbh->prepare("UPDATE $temptable SET longname = ? WHERE name = ?");
	foreach my $shorthand (keys %names) {
		$sql->execute($names{$shorthand}, $shorthand);
	}
	$dbh->disconnect;
}

# Sub: get_dbh
# Fetches a database handle from the wrapper of the active backend.
# Arguments: none
# Returns: the handle returned by the wrapper's get_dbh
# Exits via fail_and_exit if the wrapper returns a false value or if neither
# backend flag is set.
sub get_dbh {
	if ($use_mysql) {
		my $dbh = Wrapper::Mysql::get_dbh() or fail_and_exit('Could not get a database connection');
		return $dbh;
	}
	elsif ($use_sqlite) {
		my $dbh = Wrapper::Sqlite::get_dbh() or fail_and_exit('Could not get a database connection');
		return $dbh;
	}
	# without a backend there is nothing to connect to; say so instead of
	# handing a useless value back to the caller
	fail_and_exit('Could not get a database connection: no supported database backend configured');
}

# Sub: fail_and_exit
# Writes the message, prefixed with 'Fatal: ', to STDERR and terminates the
# program with exit status 1.
# Arguments: scalar string message
# Returns: never
sub fail_and_exit {
	my ($message) = @_;
	print STDERR "Fatal: $message\n";
	exit 1;
}

# Sub: set_exists
# Builds a query for the given name in the set details table and hands it to
# the check function of the active backend wrapper.
# Arguments: scalar string set name (interpolated into the SQL text as is)
# Returns: whatever the wrapper's check function returns
sub set_exists {
	my ($wanted) = @_;
	# the query text is the same for both backends
	my $query = "SELECT name FROM $t->{'set_details'} WHERE name = '$wanted'";
	return Wrapper::Mysql::check($query)  if $use_mysql;
	return Wrapper::Sqlite::check($query) if $use_sqlite;
}


# Sub: drop_tables
# Hands the table name hash to drop_tables of the active backend wrapper.
# Arguments: none
# Returns: whatever the wrapper function returns
sub drop_tables {
	return Wrapper::Mysql::drop_tables($t)  if $use_mysql;
	return Wrapper::Sqlite::drop_tables($t) if $use_sqlite;
}


# Sub: create_tables
# Hands the table name hash to create_tables of the active backend wrapper.
# Arguments: none
# Returns: whatever the wrapper function returns
sub create_tables {
	return Wrapper::Mysql::create_tables($t)  if $use_mysql;
	return Wrapper::Sqlite::create_tables($t) if $use_sqlite;
}

# Sub: make_csv_for_set_uploading
# Writes one CSV line (cog id, protein id, ogs id, set id) to a temporary file
# for every line of the tab-separated input table whose third column is a
# taxon shorthand with a nonzero OGS ID.
# Arguments: scalar string input file name
#            scalar set id
#            hashref mapping taxon shorthand => OGS ID (0 means "do not use")
# Returns: the File::Temp object of the CSV file, already closed for writing
sub make_csv_for_set_uploading {
	my $infile    = shift;
	my $setid     = shift;
	my $ogsid_for = shift;
	my $csvfh = File::Temp->new();
	my $lines = Orthograph::Functions::file2arrayref($infile);
	# iterate instead of shifting: a line that evaluates to false (e.g. an
	# empty string) must not end the loop early
	foreach my $line (@$lines) {
		next unless defined $line;
		my $l = $line;
		# convert to unix-style line breaks, as get_all_ogsids does, so that
		# the shorthand column matches the keys of $ogsid_for
		$l =~ s/\r\n?/\n/g;
		chomp $l;
		next if $l =~ /^Group\tGene\tProtein/;  # skip orthodb table header lines
		next if $l =~ /^\s*$/;	# skip empty lines
		my @f = split "\t", $l;
		# skip lines that do not even have a shorthand column
		next if scalar(@f) < 3;
		# skip those that are not mentioned at all
		next unless defined $ogsid_for->{$f[2]};
		# skip those that should not be included
		next if $ogsid_for->{$f[2]} == 0;
		printf $csvfh "%s,%s,%d,%d\n",
			$f[0],                # cog id
			$f[1],                # protein id
			$ogsid_for->{$f[2]},  # ogs id for this shorthand
			$setid,               # well... setid
		;
	}
	$csvfh->close();
	return $csvfh;
}

# sub: make_csv
# Generates a csv file from the ortholog set fasta file. Each sequence becomes
# one line with eight comma-separated columns; the header fields are picked
# according to the orthodb 5 or the more recent header layout.
# Arguments: scalar string fasta file name
# Returns: the File::Temp object of the csv file, already closed for writing
# Side effects: increments the file-wide sequence counter $total_seqs
sub make_csv {
	my ($fasta_file) = @_;
	my $csv   = File::Temp->new(UNLINK=>1);
	my $fasta = Seqload::Fasta->open($fasta_file);

	my ($taxon_shorthand, $orthodb_id, $sequence_id) = ('', '', '');

	{
		no warnings;	# keep quiet about undefined columns while printing
		while (my ($header, $sequence) = $fasta->next_seq()) {
			# gaps are not part of the sequence
			$sequence =~ s/-//g;
			# anything besides the amino acid symbols in there?
			if (my ($odd) = $sequence =~ /([^ACDEFGHIKLMNPQRSTVWYX*])/) {
				# change all Us to the substitution character if requested
				if ($u_subst and $sequence =~ s/U/$u_subst/gi) {
					warn("Warning: Selenocysteine character ('U') replaced with '$u_subst' in sequence '$header'.\n");
				}
				else {
					warn("Warning: Sequence $header contains nonstandard amino acid symbol '$odd'! Make sure your alignment program tolerates this.\n");
				}
			}
			++$total_seqs;
			# commas in the header would confuse the csv parser
			$header =~ tr/,/_/;
			my @headerfields = split(/\s+/, $header);
			if ($orthodb5_format) {
				# backwards compatibility with orthodb 5 format;
				# only the first 5 characters of the shorthand are used
				$taxon_shorthand = substr($headerfields[0], 0, 5);
				$orthodb_id      = $headerfields[2];
				$sequence_id     = $headerfields[3];
			}
			else {
				# recent, i.e., orthodb 7 format
				($sequence_id, $orthodb_id, $taxon_shorthand) = @headerfields[0, -2, -1];
			}

			print $csv join(',',
				$taxon_shorthand,               # taxon shorthand
				'',                             # this will later be the full name
				$setname,                       # set name
				$orthodb_id,                    # orthodb id, e.g. EOG500001
				File::Spec->catdir($blastdir),  # blast database directory
				$sequence_id,                   # sequence id as it occurs in the ogs
				$sequence,                      # sequence
				$description,                   # set description
			), "\n";
		}
	}
	undef $fasta;
	$csv->close();
	if ($verbose) {
		print "used temporary csv file '$csv'\n";
	}
	return $csv;
}

# sub: get_yn
# Get a response of the type 'y' or 'n' from STDIN. Prompts the user again
# until the answer is one of the two.
# Arguments: none
# Returns: 1 on 'y', 0 on 'n'. End of input (STDIN closed or exhausted) counts
#          as 'n', so that a non-interactive run cannot loop forever.
sub get_yn {
	# switch autoflush on for the duration of the dialogue so that a prompt
	# printed without a trailing newline is visible before we wait for input;
	# the previous setting is restored when the sub returns
	local $| = 1;
	while (1) {
		my $yn = readline(STDIN);
		return 0 unless defined $yn;	# end of input: treat as a 'no'
		chomp $yn;
		return 1 if $yn eq 'y';
		return 0 if $yn eq 'n';
		print "Response '$yn' not understood. Are you sure (y/n)? ";
	}
}

# Sub: upload_ogs
# Loads an official gene set from a fasta file into the database: registers
# the taxon and the OGS, converts the file to csv and uploads the sequences.
# Taxon name and OGS version are asked for on STDIN unless already set.
# Arguments: scalar string fasta file name
#            scalar sequence type (the callers pass 1 for nucleotide, 2 for amino acid)
# Returns: 1; any failing step ends the program via fail_and_exit
sub upload_ogs {
	my ($fasta_file, $seqtype) = @_;

	# taxon name: from the configuration or from the user
	if (!$taxon_name) {
		$taxon_name = get_answer("Please enter the taxon name for this data file");
	}
	else {
		print "Using taxon name '$taxon_name'\n";
	}

	# ogs version: same procedure
	if (!$ogsversion) {
		$ogsversion = get_answer("Please enter the official gene set (OGS) version for this data file");
	}
	else {
		print "Using OGS version '$ogsversion'\n";
	}

	# the taxon goes into its table first ...
	my $taxid = insert_taxon_into_database($taxon_name);
	fail_and_exit('Could not insert taxon into database') unless $taxid;

	# ... then the OGS table is updated
	my $ogsid = insert_ogs_info_into_database($seqtype, $taxid, $ogsversion);
	fail_and_exit('Could not insert OGS info into database') unless $ogsid;
	if ($verbose) {
		print "Got OGS ID $ogsid and taxon ID $taxid for $taxon_name\n";
	}

	# csv file plus the list of sequence headers
	my ($csv_file, $headers) = make_csv_for_ogs_uploading($fasta_file, $taxid, $seqtype, $ogsid);

	print "Data look OK, uploading...\n";

	# upload by way of a temporary table
	create_temptable_for_ogs_data();
	unless (upload_ogs_sequences($csv_file, $headers, $taxid, $seqtype, $ogsid)) {
		fail_and_exit('OGS sequence loading failed');
	}

	my $sequence_count = get_sequence_count_for_taxon($taxid);
	printf("\n%d sequences for '%s' in database '%s' on %s.\n", $sequence_count, $taxon_name, $database, $db_dbserver);
	return 1;
}

# Sub: make_csv_for_ogs_uploading
# Converts a fasta file into a temporary csv file with the columns taxon id,
# sequence type, ogs id, header and sequence. Headers are cut at the first
# whitespace, commas in them become underscores, gaps are removed from the
# sequences. Unexpected sequence symbols only cause a warning.
# Arguments: scalar string fasta file name
#            scalar taxon id
#            scalar sequence type (2 selects the amino acid check, anything
#            else the nucleic acid check)
#            scalar ogs id
# Returns: list of (File::Temp object of the csv file, arrayref of headers)
# Exits via fail_and_exit if the fasta file does not exist.
sub make_csv_for_ogs_uploading {
	my $inf   = shift;
	my $taxid = shift;
	my $type  = shift;
	my $ogsid = shift;
	my $hdrs  = [ ];
	# check the input before creating the temporary file
	unless (-e $inf) { fail_and_exit("No such file: $inf") }
	my $tmpfh = File::Temp->new();
	my $fafh  = Seqload::Fasta->open($inf);
	while (my ($hdr, $seq) = $fafh->next_seq()) {
		# remove the first whitespace from the header and everything after it,
		# also when nothing follows the whitespace
		$hdr =~ s/\s.*\z//s;
		$hdr =~ s/,/_/g; # remove commas from the header as they confuse the csv parser
		$seq =~ s/-//g;	# remove all gaps from the sequence
		# check if the sequence consists of valid characters
		if ($type == 2) {
			if ($seq =~ /([^ACDEFGHIKLMNPQRSTVWYX*])/) {
				# change all Us to the substitution character if requested
				if ($u_subst and $seq =~ s/U/$u_subst/gi) {
					warn("Warning: Selenocysteine character ('U') replaced with '$u_subst' in sequence '$hdr'.\n");
				}
				else {
					warn("Warning: Sequence $hdr contains nonstandard amino acid symbol '$1'! Make sure your alignment program tolerates this.\n");
				}
			}
		}
		else {
			if ($seq =~ /([^ATCGN*])/i) {
				warn("Warning: Sequence $hdr contains nonstandard nucleic acid symbol '$1'!\n");
			}
		}
		# write to csv file
		printf $tmpfh "%d,%d,%d,%s,%s\n",
			$taxid,
			$type,
			$ogsid,
			$hdr,
			$seq,
		;
		# add to list of headers
		push @$hdrs, $hdr;
	}
	$tmpfh->close();

	return ($tmpfh, $hdrs);
}


# Sub: insert_taxon_into_database
# Passes the taxon name, together with the constant 1 as second argument, to
# insert_taxon_into_database of the active backend wrapper.
# Arguments: scalar string taxon name
# Returns: whatever the wrapper function returns
sub insert_taxon_into_database {
	my ($taxon) = @_;
	return Wrapper::Mysql::insert_taxon_into_database($taxon, 1)  if $use_mysql;
	return Wrapper::Sqlite::insert_taxon_into_database($taxon, 1) if $use_sqlite;
}

# Sub: insert_ogs_info_into_database
# Forwards all arguments unchanged to insert_ogs_info_into_database of the
# active backend wrapper.
# Returns: whatever the wrapper function returns
sub insert_ogs_info_into_database {
	return Wrapper::Mysql::insert_ogs_info_into_database(@_)  if $use_mysql;
	return Wrapper::Sqlite::insert_ogs_info_into_database(@_) if $use_sqlite;
}


# Sub: upload_ogs_sequences
# Forwards all arguments unchanged to upload_ogs_sequences of the active
# backend wrapper.
# Returns: whatever the wrapper function returns
sub upload_ogs_sequences {
	return Wrapper::Mysql::upload_ogs_sequences(@_)  if $use_mysql;
	return Wrapper::Sqlite::upload_ogs_sequences(@_) if $use_sqlite;
}

# Sub: get_answer
# Prints the prompt followed by ': ' and reads one line from STDIN.
# Arguments: scalar string prompt
# Returns: the line without its trailing newline, or undef at end of input
sub get_answer {
	my $prompt = shift;
	print "$prompt: ";
	my $ans    = <STDIN>;
	# at end of input there is nothing to chomp; hand the undef on so that
	# callers can tell "no more input" from an empty answer
	return $ans unless defined $ans;
	chomp $ans;
	return $ans;
}

# Sub: get_all_ogsids
# Collects the distinct values of the third tab-separated column of the input
# file (the taxon shorthands) and lets assign_ogsids_to_taxa attach an OGS ID
# to each of them.
# Arguments: scalar string input file name
# Returns: what assign_ogsids_to_taxa returns for the collected hashref
#          (shorthand => 0 on the way in)
sub get_all_ogsids {
	my $inf = shift;
	my $seen = { };
	open my $fh, '<', $inf or die "Fatal: could not open file '$inf': $!\n";
	while (my $line = <$fh>) {
		# skip the first line if it is some orthodb file
		if ($line =~ /^Group\tGene\tProtein/) { next }
		# skip empty lines as well
		if ($line =~ /^\s*$/) { next }
		$line =~ s/\r\n?/\n/g; # convert to unix-style line breaks
		chomp $line;
		my @fields = split "\t", $line;
		# a line without a third column has no shorthand to offer
		if (scalar(@fields) < 3) {
			warn "Warning: skipping line $. of '$inf': fewer than three tab-separated columns\n";
			next;
		}
		next if exists $seen->{$fields[2]};
		$seen->{$fields[2]} = 0;
	}
	close $fh;
	$seen = assign_ogsids_to_taxa($seen);
	return $seen;
}

# Sub: assign_ogsids_to_taxa
# Prints the list of OGS of type 2 and then asks, in alphabetical order of
# the shorthands, for the OGS ID that belongs to each of them.
# Arguments: hashref with the taxon shorthands as keys
# Returns: the same hashref, values replaced by the chosen OGS IDs
sub assign_ogsids_to_taxa {
	my ($ogsid_for) = @_;
	my $available_ogs = get_list_of_ogs();
	print "\n";
	print_ogs_list(2);
	print "\n";
	for my $shorthand (sort keys %$ogsid_for) {
		$ogsid_for->{$shorthand} = get_ogsid($available_ogs, $shorthand, $ogsid_for);
	}
	return $ogsid_for;
}

# Sub: print_ogs_list
# Prints a table of the OGS obtained from get_list_of_ogs. Each entry is used
# as an array with the fields ogs id, taxon name, ogs version, ogs type
# (1 is shown as 'nt', anything else as 'aa') and number of sequences.
# Arguments: optional scalar ogs type; if true, only OGS of that type are listed
# Returns: 1
sub print_ogs_list {
	my $type = shift;
	my $list_of_ogs = get_list_of_ogs();
	# the columns are at least as wide as their titles
	my $len     = length 'Taxon name';
	my $len_ver = length 'version';
	foreach my $ogs (@$list_of_ogs) {
		if (length($ogs->[1]) > $len)     { $len     = length($ogs->[1]) }
		# the version column is sized by the version strings, not the taxon names
		if (length($ogs->[2]) > $len_ver) { $len_ver = length($ogs->[2]) }
	}
	print "This is a list of present OGS in the database:\n";
	# title and rows are both left-aligned so that the columns line up
	printf "[%s] %-${len}s  %-${len_ver}s  %s  %s\n", '#', 'Taxon name', 'version', 'type', 'sequences';
	printf "%s%-${len}s%s\n", '-' x (length(scalar @$list_of_ogs) + 2), '-' x $len,   '---------------------------';
	foreach my $ogs (@$list_of_ogs) {
		# only print if ogs is of requested type
		if ($type) { next unless $ogs->[3] == $type }
		printf "[%d] %-${len}s  %-${len_ver}s  %-4s  %d\n",
			$ogs->[0],	# ogs id
			$ogs->[1],	# taxon name
			$ogs->[2],	# ogs version
			$ogs->[3] == 1 ? 'nt' : 'aa',	# ogs type, can be nt or aa
			$ogs->[4],	# number of sequences
		;
	}
	return 1;
}

# Sub: get_ogsid
# Asks the user for the OGS ID of one taxon and repeats the question until
# the answer is 0, or an ID that occurs in the list of OGS and has not been
# given to another taxon yet.
# Arguments: arrayref list of OGS (first field of each entry is the ID)
#            scalar string taxon
#            hashref of the OGS IDs assigned so far
# Returns: scalar OGS ID, 0 meaning "do not use this OGS"
sub get_ogsid {
	my ($list_of_ogs, $taxon, $ogsid_for) = @_;
	my $prompt = "Enter the OGS ID for $taxon or 0 if you don't want to use this OGS";
	my $id = get_integer($prompt);
	while (1) {
		# 0 is always fine: this OGS is not wanted
		return $id if $id == 0;

		my $is_known = grep { $_->[0] == $id } @$list_of_ogs;
		if (!$is_known) {
			# no OGS with that ID
			print "Invalid OGS ID $id. ";
		}
		elsif (grep { $ogsid_for->{$_} == $id } keys %$ogsid_for) {
			# some other taxon has this ID already
			print "OGS ID $id already taken. ";
		}
		else {
			return $id;
		}
		$id = get_integer($prompt);
	}
}

# Sub: get_integer
# Asks with the given prompt until the answer consists of digits only.
# Arguments: scalar string prompt
# Returns: scalar string of digits
# Dies if get_answer returns undef, which would otherwise repeat the
# question forever.
sub get_integer {
	my $prompt = shift;
	my $ans = get_answer($prompt);
	while (!defined $ans or $ans !~ /^\d+$/) {
		# an undefined answer means there is no more input to wait for
		die "Fatal: unexpected end of input while waiting for a number\n" unless defined $ans;
		print "Not a valid number. ";
		$ans = get_answer($prompt);
	}
	return $ans;
}


# Sub: insert_new_set
# Passes set name and description to insert_new_set of the active backend wrapper.
# Arguments: scalar string set name, scalar string description
# Returns: whatever the wrapper function returns
sub insert_new_set {
	my ($set, $text) = @_;
	return Wrapper::Mysql::insert_new_set($set, $text)  if $use_mysql;
	return Wrapper::Sqlite::insert_new_set($set, $text) if $use_sqlite;
}


# Sub: get_reftaxon_id
# Passes the shorthand to get_reftaxon_id of the active backend wrapper.
# Arguments: scalar string taxon shorthand
# Returns: whatever the wrapper function returns
sub get_reftaxon_id {
	my ($taxon_shorthand) = @_;
	return Wrapper::Mysql::get_reftaxon_id($taxon_shorthand)  if $use_mysql;
	return Wrapper::Sqlite::get_reftaxon_id($taxon_shorthand) if $use_sqlite;
}

# Sub: get_taxon_shorthands
# Calls get_taxon_shorthands of the active backend wrapper without arguments.
# Returns: whatever the wrapper function returns
sub get_taxon_shorthands {
	return Wrapper::Mysql::get_taxon_shorthands()  if $use_mysql;
	return Wrapper::Sqlite::get_taxon_shorthands() if $use_sqlite;
}

# Sub: create_temptable_for_ogs_data
# Calls create_temptable_for_ogs_data of the active backend wrapper without arguments.
# Returns: whatever the wrapper function returns
sub create_temptable_for_ogs_data {
	return Wrapper::Mysql::create_temptable_for_ogs_data()  if $use_mysql;
	return Wrapper::Sqlite::create_temptable_for_ogs_data() if $use_sqlite;
}

# Sub: import_ogs_into_database
# Forwards all arguments unchanged to import_ogs_into_database of the active
# backend wrapper.
# Returns: whatever the wrapper function returns
sub import_ogs_into_database {
	return Wrapper::Mysql::import_ogs_into_database(@_)  if $use_mysql;
	return Wrapper::Sqlite::import_ogs_into_database(@_) if $use_sqlite;
}

# Sub: get_sequence_count_for_taxon
# Forwards all arguments unchanged to get_sequence_count_for_taxon of the
# active backend wrapper.
# Returns: whatever the wrapper function returns
sub get_sequence_count_for_taxon {
	return Wrapper::Mysql::get_sequence_count_for_taxon(@_)  if $use_mysql;
	return Wrapper::Sqlite::get_sequence_count_for_taxon(@_) if $use_sqlite;
}

# Sub: create_blastdb
# Meant to generate a BLAST database. NOTE(review): unfinished, see the TODO
# notes in the body; as written it only assembles a makeblastdb command line
# and never runs it.
# Arguments: scalar that is stored in $infiles but not used afterwards
#            optional scalar string taxon shorthand
# Returns: 0 if check_if_fasta() returns 0; otherwise the value of the last
#          statement, the assignment of the command string
sub create_blastdb {
	my $infiles = shift;
	my $taxsh = '';
	# take the shorthand from the second argument if there is one ...
	if (scalar(@_) > 0) {
		$taxsh = shift;
	}
	# ... otherwise derive it from the file-wide $infile by removing a fasta file extension
	else {
		($taxsh = $infile) =~ s/\.(fa|fas|fasta|fsta|fsa)$//;
	}
	# NOTE(review): check_if_fasta is called without a file name here - confirm what it checks then
	if (check_if_fasta() == 0) {
		return 0;
	}
	# TODO rewrite this part using parametrized queries to protect from SQL injections?
	# TODO make this work
	# NOTE(review): the WHERE clause has neither column nor value, and the MySQL
	# wrapper is called whichever backend is active
	my $blastdir = Wrapper::Mysql::mysql_get("SELECT blastdb_path FROM $t->{'blastdbs'} WHERE  = ");
	my $outfile = File::Spec->catfile($taxsh);
	# the command is put together from the file-wide $infile, but not executed
	my @makeblastdbcmd = qq(makeblastdb -in $infile -out $outfile -input_type fasta -title $taxsh);
}

# Sub: get_setname
# Asks for the set name on STDIN and stores it in the file-wide $setname.
# An empty answer ends the program; an answer with characters other than
# a-z, A-Z, 0-9 and underscore is fatal.
# Arguments: none
# Returns: scalar string setname
sub get_setname {
	$setname = get_answer("\nEnter the set name (required; ASCII only, no commas!)");
	if (!$setname) {
		exit if print "Exiting.\n";
	}
	die "Error: Must be alphanumeric (a-z, 0-9, and underscore).\n"
		if $setname =~ /[^a-zA-Z0-9_]/;
	return $setname;
}

# Sub: getblastdir
# Looks up one BLAST database path through the MySQL wrapper and stores it in
# the file-wide $blastdir; if the lookup returns a false value, the user is
# asked for the location instead.
# Arguments: none
# Returns: scalar string blast directory
sub getblastdir {
	# TODO rewrite this part using parametrized queries to protect from SQL injections?
	my $rows = Wrapper::Mysql::mysql_get("SELECT blastdb_path FROM $t->{'blastdbs'} LIMIT 1");
	$blastdir = $rows
		? $rows->[0][0]	# array of arrays, only element
		: get_answer("Enter the location where the BLAST database directories are located (required)");
	return $blastdir;
}


# Sub: list_sets
# Prints a table of the ortholog sets returned by get_ortholog_sets. Each set
# is used as an array with the fields name, description, number of orthologs
# and number of COGs.
# Arguments: none
# Returns: 1
sub list_sets {
	my $sets = get_ortholog_sets();
	# nothing returned, or an empty list: there is no table to print
	if (!$sets or !@$sets) {
		print "No sets in database.\n";
		return 1;
	}
	# the columns are as wide as their longest entry, but never narrower
	# than their titles
	my $maxlen_set  = length 'Set name';
	my $maxlen_desc = length 'description';
	foreach my $set (@$sets) {
		my $len_set  = defined $set->[0] ? length($set->[0]) : 0;
		my $len_desc = defined $set->[1] ? length($set->[1]) : 0;
		$maxlen_set  = $len_set  if $len_set  > $maxlen_set;
		$maxlen_desc = $len_desc if $len_desc > $maxlen_desc;
	}
	print "The following ortholog sets are presently installed:\n";
	printf("%-${maxlen_set}s   %-${maxlen_desc}s   %9s %5s\n", 'Set name', 'description', 'orthologs', 'COGs');
	printf("%-${maxlen_set}s---%-${maxlen_desc}s---%9s-%-5s\n", '-' x $maxlen_set, '-' x $maxlen_desc, '-' x 9, '-' x 5);
	foreach my $set (@$sets) {
		# a missing name or description is printed as an empty column
		my $setname_col = defined $set->[0] ? $set->[0] : '';
		my $desc_col    = defined $set->[1] ? $set->[1] : '';
		printf("%-${maxlen_set}s   %-${maxlen_desc}s   %9s %5s\n", $setname_col, $desc_col, $set->[2], $set->[3]);
	}
	return 1;
}


# Sub: list_set_details
# Prints, for every set returned by get_taxa_in_all_sets, the set name and
# the string of its reference taxa, sorted by set name.
# Arguments: none
# Returns: 1
sub list_set_details {
	my %reftaxa_of = get_taxa_in_all_sets();
	# the first column is as wide as the longest set name, but never narrower
	# than its title
	my $maxlen_setname = length 'Set name';
	foreach my $setname (keys %reftaxa_of) {
		$maxlen_setname = length($setname) if (length($setname) > $maxlen_setname);
	}
	printf "%-${maxlen_setname}s  %s\n", 'Set name', 'reference taxa';
	# the rule under the titles spans the real column width
	printf "%s--%s\n", '-' x $maxlen_setname, '-' x 14;
	foreach my $setname (sort keys %reftaxa_of) {
		printf("%-${maxlen_setname}s  %s\n", $setname, $reftaxa_of{$setname});
	}
	return 1;
}

# Sub: delete_set
# Delete a set from the database and, if a set directory is configured,
# the directory of that set as well.
# Arguments: scalar string setname
# Returns: 1; exits if there is no set with that name
sub delete_set {
	my $setname = shift or confess('Usage: delete_set($SETNAME)');
	# get set id
	my $setid = get_set_id($setname) or print "No set with name '$setname' in database, none deleted\n" and exit;
	# delete the set
	if ($use_mysql) {
		Wrapper::Mysql::delete_set($setid);
	}
	elsif ($use_sqlite) {
		Wrapper::Sqlite::delete_set($setid);
	}
	# Also delete the set dir completely. Without a set directory, catdir
	# would produce a path directly below the file system root, so the
	# removal is skipped in that case.
	if (defined $setdir and length $setdir) {
		my $this_set_dir = File::Spec->catdir($setdir, $setname);
		if (-e $this_set_dir) {
			# remove_tree is a plain function; called as a class method it
			# would receive the class name as a path to delete
			remove_tree($this_set_dir);
			print "Removed $this_set_dir\n";
		}
	}
	return 1;
}

# Sub: delete_ogs
# Lets the user pick a taxon from the list of taxa, deletes all sequences
# whose headers occur in the given fasta file and then deletes the taxon.
# Arguments: scalar string fasta file name of the OGS
# Returns: the value returned by delete_sequences_with_headers, or 0 if
#          there are no taxa to choose from
# Dies if the input ends before a valid number was entered.
sub delete_ogs {
	my $ogsfile = shift @_ or confess("Usage: delete_ogs(OGSFILE)");

	# get taxon shorthand
	# TODO rewrite this part using parametrized queries to protect from SQL injections?
	my $list = get_list_of_taxa();
	# without any taxa no number could ever be valid
	unless ($list and @$list) {
		print "No taxa in the database, nothing deleted.\n";
		return 0;
	}
	print "This is a list of present taxa in the database:\n";
	for my $i (0 .. $#$list) {
		printf("%2d. %s %s\n", $i+1, $$list[$i][0], $$list[$i][1]);
	}
	my $in = get_answer("Please enter a number to select one of the taxa (don't fuck with this, no error-catching has been implemented yet)");

	# repeat the question until the answer is one of the numbers in the list
	while (1) {
		# an undefined answer means the input has ended; asking again is pointless
		die "Fatal: unexpected end of input while waiting for a taxon number\n" unless defined $in;
		last if $in =~ /^\d+$/ and $in >= 1 and $in <= scalar(@$list);
		$in = get_answer("Not understood: '$in'. DO NOT FUCK WITH THIS. Enter one of the numbers to select one of the taxa");
	}

	# correlate that number to the list of taxa
	my $taxid = $$list[$in-1][0];

	# open the file and delete each sequence (slow, I know...)
	my $headers = get_headers_from_fasta($ogsfile);
	my $count = delete_sequences_with_headers($headers);

	delete_taxon($taxid);
	return $count;
}

# Sub: get_headers_from_fasta
# Reads a fasta file with Seqload::Fasta and collects the header of every
# sequence in file order.
# Arguments: scalar string fasta file name
# Returns: arrayref of header strings
sub get_headers_from_fasta {
	my ($fasta_file) = @_;
	my @headers = ( );
	my $reader = Seqload::Fasta->open($fasta_file);
	while (my ($header, $sequence) = $reader->next_seq()) {
		push @headers, $header;
	}
	undef $reader;
	return \@headers;
}

# Sub: delete_sequences_with_headers
# Forwards all arguments unchanged to delete_sequences_with_headers of the
# active backend wrapper.
# Returns: whatever the wrapper function returns
sub delete_sequences_with_headers {
	return Wrapper::Mysql::delete_sequences_with_headers(@_)  if $use_mysql;
	return Wrapper::Sqlite::delete_sequences_with_headers(@_) if $use_sqlite;
}

# Sub: delete_taxon
# Forwards all arguments unchanged to delete_taxon of the active backend wrapper.
# Returns: whatever the wrapper function returns
sub delete_taxon {
	return Wrapper::Mysql::delete_taxon(@_)  if $use_mysql;
	return Wrapper::Sqlite::delete_taxon(@_) if $use_sqlite;
}

# Sub: get_taxa_in_all_sets
# Fetches rows from get_taxa_in_all_sets of the active backend wrapper and
# groups them by their first field. The second fields of all rows with the
# same first field are joined with ', ' in the order the rows arrive.
# Arguments: passed on to the wrapper function unchanged
# Returns: hash (as a list) of first field => joined string
sub get_taxa_in_all_sets{
	my $rows = { };
	if ($use_mysql) {
		$rows = Wrapper::Mysql::get_taxa_in_all_sets(@_);
	}
	elsif ($use_sqlite) {
		$rows = Wrapper::Sqlite::get_taxa_in_all_sets(@_);
	}
	# gather the members of each set first ...
	my %members_of = ();
	foreach my $row (@$rows) {
		push @{ $members_of{$row->[0]} }, $row->[1];
	}
	# ... then turn each member list into one string
	my %setlist = ();
	foreach my $set (keys %members_of) {
		$setlist{$set} = join(', ', @{ $members_of{$set} });
	}
	return %setlist;
}

# Sub: get_ortholog_sets
# Calls get_ortholog_sets of the active backend wrapper without arguments.
# Returns: whatever the wrapper function returns
sub get_ortholog_sets {
	return Wrapper::Mysql::get_ortholog_sets()  if $use_mysql;
	return Wrapper::Sqlite::get_ortholog_sets() if $use_sqlite;
}

# Sub: get_list_of_ogs
# Calls get_list_of_ogs of the active backend wrapper without arguments.
# Returns: whatever the wrapper function returns
sub get_list_of_ogs {
	return Wrapper::Mysql::get_list_of_ogs()  if $use_mysql;
	return Wrapper::Sqlite::get_list_of_ogs() if $use_sqlite;
}

# Sub: get_list_of_taxa
# Forwards all arguments unchanged to get_list_of_taxa of the active backend wrapper.
# Returns: whatever the wrapper function returns
sub get_list_of_taxa {
	return Wrapper::Mysql::get_list_of_taxa(@_)  if $use_mysql;
	return Wrapper::Sqlite::get_list_of_taxa(@_) if $use_sqlite;
}

# Sub: db_structure_present
# Calls db_structure_present of the active backend wrapper without arguments.
# Returns: whatever the wrapper function returns
sub db_structure_present {
	return Wrapper::Mysql::db_structure_present()  if $use_mysql;
	return Wrapper::Sqlite::db_structure_present() if $use_sqlite;
}

# Sub: load_set_into_temptable
# Forwards all arguments unchanged to load_set_into_temptable of the active
# backend wrapper.
# Returns: whatever the wrapper function returns
sub load_set_into_temptable {
	return Wrapper::Mysql::load_set_into_temptable(@_)  if $use_mysql;
	return Wrapper::Sqlite::load_set_into_temptable(@_) if $use_sqlite;
}

# Sub: insert_orthologs
# Forwards all arguments unchanged to insert_orthologs of the active backend wrapper.
# Returns: whatever the wrapper function returns
sub insert_orthologs {
	return Wrapper::Mysql::insert_orthologs(@_)  if $use_mysql;
	return Wrapper::Sqlite::insert_orthologs(@_) if $use_sqlite;
}

# Sub: get_number_of_cogs_for_set
# Forwards all arguments unchanged to get_number_of_cogs_for_set of the
# active backend wrapper.
# Returns: whatever the wrapper function returns
sub get_number_of_cogs_for_set {
	return Wrapper::Mysql::get_number_of_cogs_for_set(@_)  if $use_mysql;
	return Wrapper::Sqlite::get_number_of_cogs_for_set(@_) if $use_sqlite;
}

# Sub: get_number_of_orthologs_for_set
# Forwards all arguments unchanged to get_number_of_orthologs_for_set of the
# active backend wrapper.
# Returns: whatever the wrapper function returns
sub get_number_of_orthologs_for_set {
	return Wrapper::Mysql::get_number_of_orthologs_for_set(@_)  if $use_mysql;
	return Wrapper::Sqlite::get_number_of_orthologs_for_set(@_) if $use_sqlite;
}

# Sub: get_set_id
# Forwards all arguments unchanged to get_set_id of the active backend wrapper.
# Returns: whatever the wrapper function returns
sub get_set_id {
	return Wrapper::Mysql::get_set_id(@_)  if $use_mysql;
	return Wrapper::Sqlite::get_set_id(@_) if $use_sqlite;
}

# Sub: insert_blastdb
# Forwards all arguments unchanged to insert_blastdb of the active backend wrapper.
# Returns: whatever the wrapper function returns
sub insert_blastdb {
	return Wrapper::Mysql::insert_blastdb(@_)  if $use_mysql;
	return Wrapper::Sqlite::insert_blastdb(@_) if $use_sqlite;
}