Commit a4c6d7b8 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Prepare seq before running BLAST

parent c1376e4a
......@@ -133,6 +133,7 @@ if ( $local ){
################## Check input files
# Open and check template file
my ($template_NT, $template_AA, $template_SEQ);
#TODO: template is fasta only now
($template_NT, $template_AA, $template_SEQ) = checkTemplate($template) if ( $template );
......@@ -143,7 +144,7 @@ my $converted = 0; # Accept only fasta format
#($msa, $converted) = isFastaFile($msa, $converted);
my $fasta_checker = -1;
my (@input_order, @original_names, @original_seq); #Use 3 lists in parallel : original_names, seq, order(1->n)
my (@original_names, @original_seq); #Use 2 lists in parallel : original_names, seq
open( my $MSA, '<', "$msa" ) or die "\n\tCannot open your MSA file\n\n";
READ_MSA:
while(<$MSA>){
......@@ -158,7 +159,6 @@ while(<$MSA>){
chomp($name);
push @original_names, $name;
push @input_order, $fasta_checker;
}
elsif ( $_ =~ /[A-Z\-\.]/i ){
my $sequence = $_;
......@@ -213,30 +213,28 @@ undef $fasta_checker;
# Check GigaBlaster status if used
if ( $giga==1 ){
if ( head('http://www.igs.cnrs-mrs.fr/Giga2/~database/remoteblast.cgi') ){
$local = 2;
}
else{
$local = 0;
}
$local = 0;
$local = 2 if ( head('http://www.igs.cnrs-mrs.fr/Giga2/~database/remoteblast.cgi') );
}
# Temporary file extension
my $date = sprintf("%d-%02d-%02d_%02d%02d", localtime->year() + 1900, localtime->mon(), localtime->mday(), localtime->hour, localtime->min);
my $date = sprintf("%d-%02d-%02d_%02dh%02d", localtime->year() + 1900, localtime->mon(), localtime->mday(), localtime->hour, localtime->min);
#Start main program with version # of programs and list original queries
print "\n\t Protogene\t$VERSION\t$date\n\n";
open(my $EXONERATEISHERE, "$exonerate_exe --version |");
my $IsExonerateHere = <$EXONERATEISHERE>;
close $EXONERATEISHERE;
print "\t $IsExonerateHere\n";
undef $IsExonerateHere;
undef $VERSION;
for(my $m=0; $m<=$#original_names; $m++){
print "\t$original_names[$m]\n";
printf("\t>$m is %s\n", $m, $original_names[$m]);
}
$date .= "_$$"; #Add PID to be more uniq
......@@ -245,58 +243,55 @@ $date .= "_$$"; #Add PID to be more uniq
# run_name implementation for web server
$originalMSA = $run_name if ( $run_name ne '' );
unlink("${originalMSA}.cds");
unlink("${originalMSA}.cdsP");
unlink("${originalMSA}.cdsP.html");
unlink("${originalMSA}.out");
unlink("${originalMSA}.boj");
unlink("$originalMSA.cds", "$originalMSA.cdsP", "$originalMSA.cdsP.html",
"$originalMSA.out", "$originalMSA.boj", "$originalMSA.log");
##Build the sequences, from the alignment, to perform webblast
# Build the sequences, from the alignment, to perform blast search
my $quiet = '';
EACH_SEQ:
for(my $r=0; $r<=$#input_order; $r++){
my $right_name = $original_names[$r];
my $templ_name = $original_names[$r];
$right_name =~ s/^>//;
$templ_name =~ s/^([^\s\t]+).*$/$1/;
open(my $READY2BLAST, '>', "$cache/${date}_seq2blast.fas${r}");
for(my $r=0; $r<=$#original_names; $r++){
my $fasta_header = $original_names[$r];
my $short_name = $original_names[$r];
$fasta_header =~ s{^>}{}; # Full FASTA header
$short_name =~ s{^([^\s\t]+).*$}{$1}; # Short name without description
# Prepare sequence $r for BLAST
open(my $READY2BLAST, '>', "$cache/$date.seq2blast.fas.$r");
my $noGap = $original_seq[$r];
$noGap =~ s/[\.-]//g; #remove gaps
print {$READY2BLAST} ">$input_order[$r]\n$noGap\n";
$noGap =~ s{\-}{}g; #remove gaps
print {$READY2BLAST} ">$r\n$noGap\n";
close $READY2BLAST;
undef $noGap;
my @multipleequalblasthits;
if ( exists($template_NT->{$templ_name}) && $template_NT->{$templ_name} ne '' ){
if ( exists($template_NT->{$short_name}) && $template_NT->{$short_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, 'My_own_seq'); #To change for ?
print "\n\n$r done\n\nNucleotidic template for $r:\t$template_NT->{$templ_name}\n\n**********************************************************************\n\n";
print "\n\n$r done\n\nNucleotidic template for $r:\t$template_NT->{$short_name}\n\n**********************************************************************\n\n";
}
elsif ( exists($template_AA->{$templ_name}) && $template_AA->{$templ_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, $template_AA->{$templ_name});
print "\n\n$r done\n\nBlastP template for $r:\t$template_AA->{$templ_name}\n\n**********************************************************************\n\n";
elsif ( exists($template_AA->{$short_name}) && $template_AA->{$short_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, $template_AA->{$short_name});
print "\n\n$r done\n\nBlastP template for $r:\t$template_AA->{$short_name}\n\n**********************************************************************\n\n";
}
else{
## --> send them to webblast
$quiet = '-quiet=on ' if ( $r>0 ); #Show webblast parameters only for the first webblast run
launchWebblast("$cache/${date}_seq2blast.fas${r}", $quiet, $local, $db, $species, $blast_exe, "$cache/${date}.blastp${r}");
launchWebblast("$cache/$date.seq2blast.fas.$r", $quiet, $local, $db, $species, $blast_exe, "$cache/$date.blastp${r}");
##Get the BLASTp hit(s) acc number
open(my $BLASTP, '<', "${cache}/${date}.blastp${r}");
open(my $BLASTP, '<', "$cache/$date.blastp${r}");
while(<$BLASTP>){
@multipleequalblasthits = (@multipleequalblasthits, $1) if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
}
close $BLASTP;
unlink("$cache/${date}.blastp${r}", "${cache}/${date}_seq2blast.fas${r}") if ( $tmp == 0 || exists($multipleequalblasthits[0]) );
unlink("$cache/$date.blastp${r}", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($multipleequalblasthits[0]) );
if ( !exists($multipleequalblasthits[0]) ){
print "\n$input_order[$r] ...\tNo blast result found above filter thresholds for $right_name\n\n";
print "\n$r ...\tNo blast result found above filter thresholds for $fasta_header\n\n";
buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Unavailable', '');
undef $original_seq[$r];
undef $original_names[$r];
undef $input_order[$r];
next EACH_SEQ;
}
}
......@@ -308,18 +303,18 @@ for(my $r=0; $r<=$#input_order; $r++){
my @failureStatus = ('');
my @intronStatus = ('');
for(my $qq=0; $qq<=$#multipleequalblasthits; $qq++){
print "\n${right_name} --> [${multipleequalblasthits[$qq]}]\n";
print "\n$fasta_header --> [$multipleequalblasthits[$qq]]\n";
my @nt_GIs;
my $intronStep = 0;
if ( exists($template_NT->{$templ_name}) && $template_NT->{$templ_name} ne '' && $template_NT->{$templ_name} !~ /^My_Seq$/i ){
downloadSeq($cache, $date, '', '', $template_NT->{$templ_name});
@nt_GIs = $template_NT->{$templ_name};
if ( exists($template_NT->{$short_name}) && $template_NT->{$short_name} ne '' && $template_NT->{$short_name} !~ /^My_Seq$/i ){
downloadSeq($cache, $date, '', '', $template_NT->{$short_name});
@nt_GIs = $template_NT->{$short_name};
}
elsif ( exists($template_NT->{$templ_name}) && $template_NT->{$templ_name} ne '' && $template_NT->{$templ_name} =~ /^My_Seq$/i ){
elsif ( exists($template_NT->{$short_name}) && $template_NT->{$short_name} ne '' && $template_NT->{$short_name} =~ /^My_Seq$/i ){
open(my $MYSEQ, '>', "$cache/My_Seq-$$.fas");
print {$MYSEQ} ">My_Seq-$$ for $templ_name\n".$template_SEQ->{$templ_name}."\n";
print {$MYSEQ} ">My_Seq-$$ for $short_name\n".$template_SEQ->{$short_name}."\n";
close $MYSEQ;
@nt_GIs = "My_Seq-$$";
}
......@@ -327,8 +322,8 @@ for(my $r=0; $r<=$#input_order; $r++){
#Prot ACC -> PUID
my $protGI = blastPAcc2PGI($multipleequalblasthits[$qq]);
if ( $protGI eq '' ){
print "\tNo protein (prot GI) link found for ${multipleequalblasthits[$qq]} in $right_name\n\n";
@failureStatus = ('PUI_unavailable', ${multipleequalblasthits[$qq]});
print "\tNo protein (prot GI) link found for $multipleequalblasthits[$qq] in $fasta_header\n\n";
@failureStatus = ('PUI_unavailable', $multipleequalblasthits[$qq]);
next;
}
# print "\t$protGI\n";
......@@ -336,8 +331,8 @@ for(my $r=0; $r<=$#input_order; $r++){
#PUID -> nt GIs & GeneID
my ($ntGIs, $geneID) = protGI2NTGIs($protGI, $multipleequalblasthits[$qq]);
if ( $ntGIs eq '' && $geneID eq '' ){
print "\tNo nucleotide (nt GIs) link found for ${multipleequalblasthits[$qq]} in $right_name\n\n";
@failureStatus = ('No_nt_link', ${multipleequalblasthits[$qq]});
print "\tNo nucleotide (nt GIs) link found for $multipleequalblasthits[$qq] in $fasta_header\n\n";
@failureStatus = ('No_nt_link', $multipleequalblasthits[$qq]);
next;
}
print " linked with [$ntGIs $geneID]\n";
......@@ -351,7 +346,7 @@ for(my $r=0; $r<=$#input_order; $r++){
while(<@geneIDs>){
my $geneID = $_;
my ($chr, $amont, $aval) = geneID2Chr($geneID, $multipleequalblasthits[$qq]);
print "\n\tNo gene locus found for ${multipleequalblasthits[$qq]} with $geneID in $right_name\n\n" if ($chr eq '' and $geneID ne '');
print "\n\tNo gene locus found for $multipleequalblasthits[$qq] with $geneID in $fasta_header\n\n" if ($chr eq '' and $geneID ne '');
if ( $chr ne '' ){
#Get Chr seq
......@@ -365,9 +360,9 @@ for(my $r=0; $r<=$#input_order; $r++){
#Align Nt seq with our protein query seq
my ($POS, $BOJ) = runExonerate($exonerate_exe, $cache, $date, ${input_order[$r]}, ${original_seq[$r]}, $right_name, $multipleequalblasthits[$qq], $r, @nt_GIs);
my ($POS, $BOJ) = runExonerate($exonerate_exe, $cache, $date, $r, $original_seq[$r], $fasta_header, $multipleequalblasthits[$qq], $r, @nt_GIs);
my ($POSresult, $bestNucleotide) = prepareResults4CDS($POS, $multipleequalblasthits[$qq], ${original_seq[$r]}, $right_name);
my ($POSresult, $bestNucleotide) = prepareResults4CDS($POS, $multipleequalblasthits[$qq], $original_seq[$r], $fasta_header);
$resultPOS .= $POSresult if ( $bestNucleotide ne '' && $POSresult ne '' && $resultPOS !~ /_G_$bestNucleotide/ );
@intronStatus = ($POS->{0}, ${multipleequalblasthits[$qq]}, '') if ( $intronStep==1 && exists($POS->{0}) && $POS->{0} ne '' && $POS->{0} =~ /\:/ );
......@@ -376,7 +371,7 @@ for(my $r=0; $r<=$#input_order; $r++){
my ($intronLess, $nameLess) = ('', '');
$intronLess = $POS->{0} if ( %$BOJ eq 0 && exists($POS->{0}) && $POS->{0} ne '' );
$nameLess = $bestNucleotide if ( %$BOJ eq 0 && $intronStep==0 && ($bestNucleotide =~ /^N[CTWZG]_/ || $bestNucleotide =~ /^AC_/) );
my ($BOJresult, $bestGenomic) = prepareResults4BOJ($BOJ, $multipleequalblasthits[$qq], ${original_seq[$r]}, $right_name, $intronLess, $nameLess);
my ($BOJresult, $bestGenomic) = prepareResults4BOJ($BOJ, $multipleequalblasthits[$qq], $original_seq[$r], $fasta_header, $intronLess, $nameLess);
$resultBOJ .= $BOJresult if ( $bestGenomic ne '' && $BOJresult ne '' && $resultBOJ !~ /_G_$bestGenomic[ \-]/ );
# print " $intronStep $POS->{0} [@intronStatus] {$BOJresult} $bestNucleotide - $bestGenomic \t $BOJ->{0}\n";
}
......@@ -389,25 +384,24 @@ for(my $r=0; $r<=$#input_order; $r++){
}
elsif ( $resultBOJ eq '' ){
if ( $failureStatus[0] eq 'PUI_unavailable' || $failureStatus[0] eq 'No_nt_link' ){
buildFailureBOJOutputFile($failureStatus[1], $failureStatus[0], $right_name, ${original_seq[$r]});
buildFailureBOJOutputFile($failureStatus[1], $failureStatus[0], $fasta_header, $original_seq[$r]);
}
elsif ( $failureStatus[0] eq '' && $intronStatus[0] ne '' ){
buildIntronlessBOJOutputFile($intronStatus[1], $intronStatus[0], $intronStatus[2], $right_name, ${original_seq[$r]});
buildIntronlessBOJOutputFile($intronStatus[1], $intronStatus[0], $intronStatus[2], $fasta_header, $original_seq[$r]);
}
else{
buildFailureBOJOutputFile($multipleequalblasthits[0], 'Unavailable', $right_name, ${original_seq[$r]});
buildFailureBOJOutputFile($multipleequalblasthits[0], 'Unavailable', $fasta_header, $original_seq[$r]);
}
createCDSOutputFile($resultPOS, ${original_seq[$r]}, $right_name);
createCDSOutputFile($resultPOS, $original_seq[$r], $fasta_header);
}
else {
createCDSOutputFile($resultPOS, ${original_seq[$r]}, $right_name);
createCDSOutputFile($resultPOS, $original_seq[$r], $fasta_header);
createBOJOutputFile($resultBOJ, ${original_seq[$r]});
}
undef $original_seq[$r];
undef $original_names[$r];
undef $input_order[$r];
}
......@@ -1079,7 +1073,7 @@ sub downloadSeq{
###################### Exonerate #######################
sub runExonerate{
my ($exonerate_exe, $cache, $date, $order, $original, $right_name, $blastHit, $r, @gis) = @_;
my ($exonerate_exe, $cache, $date, $order, $original, $fasta_header, $blastHit, $r, @gis) = @_;
#Prot to align to nucleotidic sequence(s)
open(my $FASTA, '>', "$cache/${date}_${order}.fas");
......@@ -1113,10 +1107,10 @@ sub runExonerate{
if ( !-e "$cache/${date}_${order}.exon" || (-s "$cache/${date}_${order}.exon") < 520 ){
print "\tProtein-Nucleotide alignment has FAILED for $targetNT with $right_name\n";
print "\tProtein-Nucleotide alignment has FAILED for $targetNT with $fasta_header\n";
next RUN_EXONERATE;
}
print "\tProtein-Nucleotide alignment was successful for $targetNT with $right_name\n";
print "\tProtein-Nucleotide alignment was successful for $targetNT with $fasta_header\n";
#Parse Exonerate output
......@@ -1127,7 +1121,7 @@ sub runExonerate{
%bestBOJ = testBOJ($posBOJ, $gis[$b], %bestBOJ);
my $whichGI = $gis[$b];
$whichGI = 'gi|'.$whichGI.'|' if ( $whichGI =~ /^\d+$/ );
print {*STDERR} "\tNo exon-intron structure found for $whichGI with $right_name\n" if ( %$posBOJ eq 0 );
print {*STDERR} "\tNo exon-intron structure found for $whichGI with $fasta_header\n" if ( %$posBOJ eq 0 );
}
#Remove temp files from exonerate and query sequence for exonerate
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment