Commit 446fe9d3 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Clean BLAST run function. Need to clean BLAST launcher but later

parent fc085288
......@@ -15,8 +15,6 @@ use Carp;
use File::Which qw(which); # Locate external executable programs in the PATH
use Time::localtime; # Use localtime+PID for a pseudo-uniq temp file name
use Getopt::Long; # Options specifications
use File::Copy qw(move); # Avoid external 'mv' command usage
use LWP::Simple; # To test gigablaster availability
......@@ -38,6 +36,23 @@ $cachePath = '/Users/smoretti/Documents/ProtoGene/TTMMPP'; #FIXME
my $cacheStorageTime = 15; # Do not update sequences younger than X days
my $userEMail = 'moretti.sebastien@gmail.com'; # To receive e-mails with encountered problems; leave blank to inactive
### BLAST parameters ###
my $blast_param = { 'evalue' => 0.05,
'matrix' => 'BLOSUM62',
'filter' => 'Off',
'species' => 'All_organisms',
'db1' => 'refseq_protein', # NCBI RefSeq protein
'identity1' => 100,
'cover1' => 100,
'db2' => 'nr', # NCBI nr protein
'identity2' => 95,
'cover2' => 95,
'nbesthits' => 10,
'core' => 1,
};
###################################################
......@@ -51,7 +66,7 @@ my $exonerate_exe = 'exonerate-1.0'; # Exonerate 1.0 because current parse
################## Option management
my ($msa, $revtrans, $pep, $hideBOJ, $run_name, $template, $lim, $cache) = ('', 0, 0, 0, '', '', 0, 'update');
my ($debug, $tmp) = (0, 0);
my ($db, $species, $local, $giga) = ('refseq_protein', 'All_organisms', 0, 0);
my ($db, $species, $local, $giga) = ($blast_param->{'db1'}, $blast_param->{'species'}, 0, 0);
my %opts = ('msa|in=s' => \$msa, # Input sequences
'revtrans' => \$revtrans, # Use to reverse-translate sequences with no match
'pep' => \$pep, # Add the original peptide query beneath the related CDS seq
......@@ -80,7 +95,6 @@ my $test_option_values = Getopt::Long::GetOptions(%opts);
################## Short help message
if ( !$test_option_values || ($msa eq '' && $cache ne 'empty' && $cache ne 'old') ){
#|| $species eq '' || $db eq '' ){
print {*STDERR} "\n\tCannot open the MSA file in FASTA format
\tTry: $0 --msa=path_of_the_fasta_msa_file [Options]
......@@ -89,9 +103,9 @@ if ( !$test_option_values || ($msa eq '' && $cache ne 'empty' && $cache ne 'old'
\t Gallus_gallus, Bos_taurus, Escherichia_coli,
\t Arabidopsis_thaliana, Mus_musculus,
\t Drosophila_Melanogaster, ...
\t default is 'All_organisms'
\t default is '$blast_param->{'species'}'
\t --db=nr, pdb, swissprot, refseq_protein
\t default is 'refseq_protein'
\t default is '$blast_param->{'db1'}'
\t --local to execute a local BLAST query with
\t --db=path_for_a_local_db_blast_formated\n
\t --template to provide your own nucleotidic sequences
......@@ -234,7 +248,7 @@ undef $IsExonerateHere;
undef $VERSION;
for(my $m=0; $m<=$#original_names; $m++){
printf("\t>$m is %s\n", $m, $original_names[$m]);
printf("\t>%-3s is %s\n", $m, $original_names[$m]);
}
$date .= "_$$"; #Add PID to be more uniq
......@@ -248,7 +262,7 @@ unlink("$originalMSA.cds", "$originalMSA.cdsP", "$originalMSA.cdsP.html",
# Build the sequences, from the alignment, to perform blast search
my $quiet = '';
open(my $LOG, '>', "$date.log") or die "\n\tCannot open LOG file\n\n";
EACH_SEQ:
for(my $r=0; $r<=$#original_names; $r++){
my $fasta_header = $original_names[$r];
......@@ -278,19 +292,24 @@ for(my $r=0; $r<=$#original_names; $r++){
print "\n\n$r done\n\nBlastP template for $r:\t$template_AA->{$short_name}\n\n*******************************************************************\n\n";
}
else{
## else send them to webblast
$quiet = '-quiet=on ' if ( $r>0 ); #Show webblast parameters only for the first webblast run
launchWebblast("$cache/$date.seq2blast.fas.$r", $quiet, $local, $db, $species, $blast_exe, "$cache/$date.blastp${r}");
# else send them to webblast
if ($r==0){
# Show BLAST parameters only for the first BLAST run
display_blast_param();
}
run_BLAST("$cache/$date.seq2blast.fas.$r", $local, $blast_param->{'db1'}, $blast_exe, "$cache/$date.blastp.$r");
##Get the BLASTp hit(s) acc number
open(my $BLASTP, '<', "$cache/$date.blastp${r}");
# Get the BLASTp hit(s) acc number
open(my $BLASTP, '<', "$cache/$date.blastp.$r");
while(<$BLASTP>){
@equivalent_blast_hits = (@equivalent_blast_hits, $1) if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
push @equivalent_blast_hits, $1 if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
}
close $BLASTP;
unlink("$cache/$date.blastp${r}", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($equivalent_blast_hits[0]) );
exit;
unlink("$cache/$date.blastp.$r", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($equivalent_blast_hits[0]) );
if ( !exists($equivalent_blast_hits[0]) ){
print "\n$r ...\tNo blast result found above filter thresholds for $fasta_header\n\n";
print "\n$r ...\tNo blast result found above thresholds for $fasta_header\n\n";
buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Unavailable', '');
undef $original_seq[$r];
undef $original_names[$r];
......@@ -405,6 +424,7 @@ for(my $r=0; $r<=$#original_names; $r++){
undef $original_seq[$r];
undef $original_names[$r];
}
close $LOG;
checkAndCleanStderrFiles("$cache/${date}_ExonerateError") if ( -e "$cache/${date}_ExonerateError" );
......@@ -737,37 +757,44 @@ sub reverse_trad{
####################### webblast #######################
sub launchWebblast{
my ($infile, $quiet, $whatBlast, $db, $species, $blast_exe, $outfile) = @_;
my $blast_at = '';
$blast_at = "-database=$db" if ( $whatBlast==0 );
$blast_at = "-database=$db -blast_dir=${blast_exe}" if ( $whatBlast==1 );
$blast_at = "-database=$db -gigablast=yes" if ( $whatBlast==2 );
# print " Evalue treshold : 0.05
# Matrix : BLOSUM62
# Filter : F
# **********************
# For RefSeq Protein db:
# Blast_identity_threshold : 100
# Cover threshold : 100
# **********************
# For NR Protein db:
# Blast_identity_threshold : 95
# Cover threshold : 95";
system("$webblast_exe -program=blastp ${blast_at} -infile=$infile -matrix=BLOSUM62 -evalue=0.05 -method=geneid -filter=Off -organism=$species -identity=100 -cover=100 -hits=10 $quiet -outfile=$outfile");
if ( -z "$outfile" ){ #Another blastP query if no hit with refseq and db was not nr !
$quiet = '-quiet=on';
print {*STDERR} "run BLAST.. against NR because there was no acceptable hit against $db\n"
sub display_blast_param {
print "
Program : BLASTp
Evalue threshold : $blast_param->{'evalue'}
Matrix : $blast_param->{'matrix'}
Filter : $blast_param->{'filter'}
Equivalent best hits used : $blast_param->{'nbesthits'}
Database: NCBI $blast_param->{'db1'}
Blast_identity_threshold : $blast_param->{'identity1'}
Cover threshold : $blast_param->{'cover1'}
if no match
Database: NCBI $blast_param->{'db2'}
Blast_identity_threshold : $blast_param->{'identity2'}
Cover threshold : $blast_param->{'cover2'}
\n";
return;
}
sub run_BLAST{
my ($infile, $whatBlast, $db, $blast_exe, $outfile) = @_;
my $blast_at = $whatBlast==2 ? '-gigablast=yes'
: $whatBlast==1 ? "-blast_dir=$blast_exe"
: '';
# BLAST run 1
system("$webblast_exe -program=blastp -database=$db $blast_at -infile=$infile -matrix=$blast_param->{'matrix'} -evalue=$blast_param->{'evalue'} -method=geneid -filter=$blast_param->{'filter'} -organism=$species -identity=$blast_param->{'identity1'} -cover=$blast_param->{'cover1'} -hits=$blast_param->{'nbesthits'} -quiet=on -outfile=$outfile");
if ( -z "$outfile" ){
# BLAST run 2 if no hit with 100% identity and coverage: refseq100 -> nr95 OR nr100 -> nr95
print "run BLAST.. against $blast_param->{'db2'} because there was no acceptable hit against $blast_param->{'db1'}"
if ( $whatBlast != 1 && $db ne 'nr' );
print {*STDERR} "run BLAST.. again, with decreased thresholds, because there was no acceptable hit against $db\n"
print "run BLAST.. again, with decreased thresholds, because there was no acceptable hit against $db"
if ( $whatBlast == 1 || $db eq 'nr' );
$blast_at =~ s/-database=[^\s]*/-database=nr/ if ( $whatBlast != 1 );
system("$webblast_exe -program=blastp ${blast_at} -infile=$infile -matrix=BLOSUM62 -evalue=0.05 -method=geneid -filter=Off -organism=$species -identity=95 -cover=95 -hits=10 $quiet -outfile=$outfile");
system("$webblast_exe -program=blastp -database=$blast_param->{'db2'} $blast_at -infile=$infile -matrix=$blast_param->{'matrix'} -evalue=$blast_param->{'evalue'} -method=geneid -filter=$blast_param->{'filter'} -organism=$species -identity=$blast_param->{'identity2'} -cover=$blast_param->{'cover2'} -hits=$blast_param->{'nbesthits'} -quiet=on -outfile=$outfile");
}
move('webblast.log', "$cache"); #Move temporary webblast.pl files
......
......@@ -26,7 +26,7 @@ my $BLASTMAT = 'export BLASTMAT=/mnt/local/ncbi/data/'; #matrix direct
my $BLASTDB = 'export BLASTDB=/db/blastnet/database/'; #PDB_seqres directoty #
###############################################################################################
my $runblast = '/mnt/local/bin/runblast.pl';
my $runblast = 'runblast.pl';
my(@list_encoded)=(), my(@list_pdb)=(), my(%deja_vu)=(), my(@pdb_list)=(), my($i)=0, my(@names)=(), my $locale=0, my $distant=0, my $database, my $blast_way;
my($ua)= LWP::UserAgent->new;
......@@ -562,7 +562,7 @@ sub LOCAL_BLAST
}
elsif ( $method=~ /^geneid$|^pdbid$/i && $gigablast=~ /^yes$/i )
{
unless ( $database eq "nr" || $database eq "pdb" || $database eq "refseq_protein" || $database eq "pdbaa" ){
unless ( $database eq 'nr' || $database eq 'pdb' || $database eq 'refseq_protein' || $database eq 'pdbaa' ){
print {*STDERR} "\nsorry invalid database for gigablast\n";exit 1;
}
if ( $database eq 'pdb') { $database = 'pdbaa';}
......@@ -594,13 +594,13 @@ sub LOCAL_BLAST
if ( $_=~ /(BLASTP\s+\S+)/o ) { $version = $1;}
print {$SOR2} $_;
push (@list_pdb, $_);
if ( $_=~ /\s*(.+?)\s/ ){
print {*STDERR} "\n$1 done";
}
# if ( $_=~ /\s*(.+?)\s/ ){
# print {*STDERR} "\n$1 done";
# }
}
close COM;
close $SOR2;
print {*STDERR} "\n";
# print {*STDERR} "\n";
$name_database = $database if ( $name_database =~ m{/} );
unless ($quiet=~ /on/i) {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment