Commit c1376e4a authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Check input format

parent 53171251
......@@ -122,7 +122,8 @@ cacheManagement('old', $cachePath, 1);
################## Check external software in the PATH
checkExternalSoftware($webblast_exe, $exonerate_exe);
#TODO: remove that for production
#checkExternalSoftware($webblast_exe, $exonerate_exe);
if ( $local ){
checkExternalSoftware($blast_exe);
}
......@@ -134,51 +135,57 @@ if ( $local ){
my ($template_NT, $template_AA, $template_SEQ);
($template_NT, $template_AA, $template_SEQ) = checkTemplate($template) if ( $template );
# Open and Check the input fasta file
$local = 2 if ( $giga==1 );
# Open and Check the input fasta file
my $originalMSA = $msa;
my $converted = 0; # Accept only fasta format
#TODO: Do a better file converter and identifier is msa is not fasta
#($msa, $converted) = isFastaFile($msa, $converted);
##Open and Check the fasta file
my $originalMSA = $msa;
my $change = 0;
($msa, $change) = checkFastaFile($msa, $change);
my $fasta_checker = -1;
my (@input_order, @original_names, @original_seq); #Use 3 lists in parallel : original_names, seq, order(1->n)
open( my $MSA, '<', "$msa" ) if ( -e "$msa" );
open( my $MSA, '<', "$msa" ) or die "\n\tCannot open your MSA file\n\n";
READ_MSA:
while(<$MSA>){
next READ_MSA if ( $_ =~ /^#/ );
if ( $_ =~ /^>/ ){
$fasta_checker++;
my $name = $_;
$name =~ s/\r\n//g; #Remove return lines from windows OS '^M'
$name =~ s/^>[ \t]+/>/;
$name =~ s{\r\n}{}g; #Remove return lines from windows OS '^M'
$name =~ s{\r}{\n}g;
$name =~ s{^>[ \t]+}{>};
chomp($name);
@original_names = (@original_names, $name);
@input_order = (@input_order, $fasta_checker);
push @original_names, $name;
push @input_order, $fasta_checker;
}
elsif ( $_ !~ /^>/ ){
my $seqq = $_;
$seqq =~ s/\r\n//g;
$seqq =~ s/\./-/g; #for msa with '.' as gap
$seqq =~ s{[BJOUZ]}{X}ig; #U here for selenocystein
$seqq =~ s/[^A-Za-z\-\*\n\r]//g; #Remove all the non-gap or non-alphabetic characters from the seq
chomp($seqq);
elsif ( $_ =~ /[A-Z\-\.]/i ){
my $sequence = $_;
$sequence =~ s{\r\n}{}g;
$sequence =~ s{\r}{\n}g;
$sequence =~ s{\.}{-}g; #for msa with '.' as gap
#TODO because exonerate cannot work with selenocystein
$sequence =~ s{[BJOUZ]}{X}ig; #U here for selenocystein
$sequence =~ s{[^A-Za-z\-\*\n]}{}g; #Remove all the non-gap or non-alphabetic characters from the seq
chomp($sequence);
#fasta sequence on 1 line
if ( !exists($original_seq[$fasta_checker]) ){
@original_seq = (@original_seq, $seqq);
push @original_seq, $sequence;
}
else {
$original_seq[$fasta_checker] = $original_seq[$fasta_checker].$seqq;
$original_seq[$fasta_checker] .= $sequence;
}
}
}
close $MSA;
#Are there FASTA sequences ?
if ( $fasta_checker == -1 ){
failure();
print {*STDERR} "\tThe MSA file does NOT seem to be a protein FASTA format\n\tSee <a href='http://en.wikipedia.org/wiki/FASTA_format' target='_blank'>FASTA format</a>\n\n";
print {*STDERR} "\tThe MSA file does NOT seem to be a protein FASTA format
\tSee an example of FASTA format <a href='http://en.wikipedia.org/wiki/FASTA_format' target='_blank'>here</a>\n\n";
exit(1);
}
elsif ( $lim >0 && $fasta_checker > $lim ){
......@@ -188,24 +195,26 @@ elsif ( $lim >0 && $fasta_checker > $lim ){
}
elsif ( exists( $original_seq[0] ) && $original_seq[0] =~ /[acgtu]/i ){
my $first_seq = $original_seq[0];
#Check if sequences are amino acids and not nucleotides
my $a = ($first_seq =~ s/[aA]//g) || 0;
my $c = ($first_seq =~ s/[cC]//g) || 0;
my $g = ($first_seq =~ s/[gG]//g) || 0;
my $t = ($first_seq =~ s/[tTuU]//g) || 0;
my $non = ($first_seq =~ s/[^aAcCgGtTuUXxNn-]//g) || 0;
#Check if sequences are amino acids and not nucleotides ONLY on the 1st sequence
my $a = ($first_seq =~ s{[aA]}{}g) || 0;
my $c = ($first_seq =~ s{[cC]}{}g) || 0;
my $g = ($first_seq =~ s{[gG]}{}g) || 0;
my $t = ($first_seq =~ s{[tTuU]}{}g) || 0;
my $non = ($first_seq =~ s{[^aAcCgGtTuUXxNn-]}{}g) || 0;
if ( ($a+$c+$g+$t) >= (($a+$c+$g+$t+$non)*80/100) ){
failure();
print {*STDERR} "\tYour sequences seem already to be nucleotides\n\tthis program purpose is to turn AMINO ACID alignments into CDS nucleotide alignments\n\n";
print {*STDERR} "\tYour sequences seem already to be nucleotides
\tthe goal of this program is to turn AMINO ACID alignments into CDS nucleotide alignments\n\n";
exit(1);
}
}
undef $fasta_checker;
#Check GigaBlaster status if used
# Check GigaBlaster status if used
if ( $giga==1 ){
if ( head('http://www.igs.cnrs-mrs.fr/Giga2/~database/remoteblast.cgi') ){
$local = 2;
}
else{
$local = 0;
......@@ -213,7 +222,10 @@ if ( $giga==1 ){
}
my $date = sprintf("%d-%02d-%02d_%02d%02d", localtime->year() + 1900, localtime->mon(), localtime->mday(), localtime->hour, localtime->min)."_$$";
# Temporary file extension
my $date = sprintf("%d-%02d-%02d_%02d%02d", localtime->year() + 1900, localtime->mon(), localtime->mday(), localtime->hour, localtime->min);
#Start main program with version # of programs and list original queries
print "\n\t Protogene\t$VERSION\t$date\n\n";
......@@ -226,7 +238,7 @@ undef $VERSION;
for(my $m=0; $m<=$#original_names; $m++){
print "\t$original_names[$m]\n";
}
$date .= "_$$"; #Add PID
$date .= "_$$"; #Add PID to be more uniq
......@@ -401,7 +413,7 @@ for(my $r=0; $r<=$#input_order; $r++){
checkAndCleanStderrFiles("$cache/${date}_ExonerateError") if ( -e "$cache/${date}_ExonerateError" );
checkOutputFiles();
unlink("$msa") if ( $change==1 );
unlink("$msa") if ( $converted==1 );
my @tmpFiles = glob($cache."/${date}*");
if ( exists($tmpFiles[0]) ){
mkdir("$cache/${date}_TMP");
......@@ -639,29 +651,21 @@ sub cacheManagement{
###################### Fasta ######################
#Check input aln
sub checkFastaFile{
my ($msa, $change) = @_;
if ( -e "$msa" ){
#Convert no fasta format to fasta with seq_reformat if available
my $seqreformat = which('t_coffeeX') || ''; #desactive tant que t_coffee seq_reformat pose prob
if ( $seqreformat ne '' ){
my @formatType = `$seqreformat -other_pg seq_reformat -in $msa -print_format`;
my $format = '';
while(<@formatType>){
if ( $_ =~ /FORMAT:/ ){
$format = $_;
chomp($format);
last;
}
}
if ( $format ne '' && $format !~ /FORMAT:fasta_/ ){
system("$seqreformat -other_pg seq_reformat -in $msa -output fasta_aln -out=${msa}.007") if ($format =~ /_aln *$/);
system("$seqreformat -other_pg seq_reformat -in $msa -output fasta_seq -out=${msa}.007") if ($format =~ /_seq *$/);
$msa .= '.007';
$change = 1;
}
# Check input aln format
sub isFastaFile{
my ($msa, $converted) = @_;
if ( -s "$msa" ){ # Check is the file exists and is not empty
# Is FASTA format ?
my $firstLine = `head $msa`;
return($msa, 0) if ( $firstLine =~ /^>/ );
# Convert no fasta format to fasta with readseq
my $reformat = which('readseq') || '';
if ( $reformat ne '' ){
system("$reformat -a -f8 $msa > $msa.007")==0 or do {failure(); print {*STDERR} "\n\tEnable to convert your file to fasta format\n\n";};
$msa .= '.007';
$converted = 1;
}
}
else {
......@@ -670,7 +674,7 @@ sub checkFastaFile{
exit(1);
}
return($msa,$change);
return($msa, $converted);
}
#Prepare Fasta Header of the query name to incorporate our annotations
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment