Commit 2efd64ce authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

correct when seq name start with a blank e.g.: > truc && when 1st seq is...

correct when seq name start with a blank e.g.: > truc && when 1st seq is revtrans and checked for nt
parent 091fe091
......@@ -34,23 +34,23 @@ $ENV{'PATH'} .= ':/mnt/local/bin/'; # additional path for executable on the serv
my $Version='3.0.2';
my $uct=15; # UpdateCacheThreshold: number of days before update
my $cachedir='/scratch/frt/tcoffee/ProtoGene_Cache'; # Cache directory
my $Version = '3.0.3';
my $uct = 15; # UpdateCacheThreshold: number of days before update
my $cachedir = '/scratch/frt/tcoffee/ProtoGene_Cache'; # Cache directory
##### User settings ####################################
my $userEMail='moretti.sebastien@gmail.com'; # To receive an e-mail with encountered problems; leave blank to inactive this option
my $webblast_exe=which('webblast.pl') || ''; #
my $blast_prog='blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage
my $exonerate=which('exonerate-1.0') || ''; #
my $userEMail = 'moretti.sebastien@gmail.com'; # To receive an e-mail with encountered problems; leave blank to inactive this option
my $webblast_exe = which('webblast.pl') || ''; #
my $blast_prog = 'blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage
my $exonerate = which('exonerate-1.0') || ''; #
########################################################
my $blast_exe=which($blast_prog) || '';
my $doc=which('perldoc') || '';
#my $date=$time{'yy-mm-dd_hh\hmm-ss.mmm'}; #Used to get a unique file
my $date=`/usr/bin/env date +%y-%m-%d_%T`; #Used to get a unique file
my $blast_exe = which($blast_prog) || '';
my $doc = which('perldoc') || '';
#my $date = $time{'yy-mm-dd_hh\hmm-ss.mmm'}; #Used to get a unique file
my $date = `/usr/bin/env date +%y-%m-%d_%T`; #Used to get a unique file
chomp($date);
$date =~ s/\:/\-/g;
&checkProgramsPresence($webblast_exe,$exonerate); # Check external programs presence in the PATH
&checkProgramsPresence($webblast_exe, $exonerate); # Check external programs presence in the PATH
$blast_exe =~ s/${blast_prog}$//;
&checkCacheAccessibility($cachedir); # Check cache directory accessibility
......@@ -58,9 +58,9 @@ $blast_exe =~ s/${blast_prog}$//;
##Options management
my ($msa,$species,$db,$athome,$revtrans,$pep,$hideBOJ)=('','All_organisms','refseq_protein',0,0,0,0);
my ($version,$help,${run_name},$Cache,$template,$lim,$tmp)=(0,0,'','update','',0,0);
my $giga=0;
my ($msa, $species, $db, $athome, $revtrans, $pep, $hideBOJ) = ('', 'All_organisms', 'refseq_protein', 0, 0, 0, 0);
my ($version, $help, $run_name, $Cache, $template, $lim, $tmp) = (0, 0, '', 'update', '', 0, 0);
my $giga = 0;
GetOptions("msa|in=s" => \$msa, #Input sequences
"orgm|species=s" => \$species, #Organism(s) to blast against
"db|database=s" => \$db, #Database to blast against
......@@ -70,29 +70,29 @@ GetOptions("msa|in=s" => \$msa, #Input sequences
"pep" => \$pep, #Add the original peptide query beneath the linked nt seq
"version|v" => \$version, #Print version information
"help|h" => \$help, #Print full help message
"run_name=s" => \${run_name}, #Use another name, instead of input seq name, for result files
"run_name=s" => \$run_name, #Use another name, instead of input seq name, for result files
"cache=s" => \$Cache, #Specify what cache parameter
"hideBOJ" => \$hideBOJ,
"template=s" => \$template, #Use a template file
"lim=i" => \$lim, #Limite number of input query sequences
"tmp" => \$tmp #To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ...
"tmp" => \$tmp, #To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ...
);
$athome=2 if ($giga==1);
if ($version==1){
$athome = 2 if ( $giga==1 );
if ( $version==1 ){
print "\n\tPROTOGENE : Bona-Fide Back Translation of Protein Multiple Sequence Alignments\n\tversion : $Version\n\n";
exit;
exit 0;
}
if ( $help==1 ){
if ( $doc ne '' ){
system("$doc $0");
system("$doc $0");
}
else{
print STDERR "\n\tperldoc command doesn't seem to be available\n\tto see the documentation in pod format\n\n";
print {*STDERR} "\n\tperldoc command doesn't seem to be available\n\tto see the documentation in pod format\n\n";
}
exit(1);
}
if ( $blast_exe eq '' && $version==0 && $help==0 ){
print STDERR "\n\t$blast_prog program is not reachable\n\tIt could not be in your PATH or not installed\n\tIt is required only for local blast searches\n\n";
print {*STDERR} "\n\t$blast_prog program is not reachable\n\tIt could not be in your PATH or not installed\n\tIt is required only for local blast searches\n\n";
}
#Cache management
......@@ -100,13 +100,13 @@ my $cache = &cacheManagement($Cache) || '.';
#Open and check template file
my ($templG,$templP,$templSeq);
($templG,$templP,$templSeq) = &checkTemplate($template) if ($template);
my ($templG, $templP, $templSeq);
($templG, $templP, $templSeq) = &checkTemplate($template) if ($template);
#Short help message
if ( $msa eq '' || $species eq '' || $db eq '' || $cache eq '' ){
my $appli=basename($0);
my $appli = basename($0);
print {*STDERR} "\n\tCannot open the MSA file in fasta format
\tTry: $appli --msa=path_of_the_fasta_msa_file [Options]
......@@ -143,53 +143,55 @@ my $change = 0;
($msa,$change) = &checkFastaFile($msa,$change);
my $fasta_checker = -1;
my (@input_order,@original_names,@original_seq); #Use 3 lists in parallel : original_names, seq, order(1->n)
open( my $MSA, '<', "$msa") if ( -e "$msa" );
open( my $MSA, '<', "$msa" ) if ( -e "$msa" );
while(<$MSA>){
if ( $_ =~ /^>/ ){
$fasta_checker++;
my $name=$_;
$name =~ s/\r\n//g; #Remove return lines from windows OS '^M'
chomp($name);
@original_names=(@original_names,$name);
@input_order=(@input_order,$fasta_checker);
$fasta_checker++;
my $name = $_;
$name =~ s/\r\n//g; #Remove return lines from windows OS '^M'
$name =~ s/^>[ \t]+/>/;
chomp($name);
@original_names = (@original_names, $name);
@input_order = (@input_order, $fasta_checker);
}
elsif ( $_ !~ /^>/ ){
my $seqq=$_;
$seqq =~ s/\r\n//g;
$seqq =~ s/\./-/g; #for msa with '.' as gap
$seqq =~ s/[^A-Za-z\-\*\n\r]//g; #Remove all the non-gap or non-alphabetic characters from the seq
chomp($seqq);
my $seqq = $_;
$seqq =~ s/\r\n//g;
$seqq =~ s/\./-/g; #for msa with '.' as gap
$seqq =~ s/[^A-Za-z\-\*\n\r]//g; #Remove all the non-gap or non-alphabetic characters from the seq
chomp($seqq);
#fasta sequence on 1 line
if ( ! exists( $original_seq[$fasta_checker] )){
@original_seq=(@original_seq,$seqq);
}
else {
$original_seq[$fasta_checker]=$original_seq[$fasta_checker].$seqq;
}
if ( !exists($original_seq[$fasta_checker]) ){
@original_seq = (@original_seq, $seqq);
}
else {
$original_seq[$fasta_checker] = $original_seq[$fasta_checker].$seqq;
}
}
}
close $MSA;
if ( $fasta_checker == -1 ){
&failure;
print STDERR "\tThe MSA file does NOT seem to be a protein fasta format\n\n";
print {*STDERR} "\tThe MSA file does NOT seem to be a protein fasta format\n\n";
exit(1);
}
elsif ( $lim >0 && $fasta_checker > $lim ){
&failure;
print STDERR "\tThe fasta file is too large, try with less than $lim sequences\n\n";
print {*STDERR} "\tThe fasta file is too large, try with less than $lim sequences\n\n";
exit(1);
}
elsif ( exists( $original_seq[0] ) && $original_seq[0] =~ /[acgtu]/i){
my $first_seq = $original_seq[0];
#Check if sequences are amino acids and not nucleotides
my ( $a, $c, $g, $t, $non ) = ( 0, 0, 0, 0, 0 );
$a = ($original_seq[0] =~ s/[aA]//g);
$c = ($original_seq[0] =~ s/[cC]//g);
$g = ($original_seq[0] =~ s/[gG]//g);
$t = ($original_seq[0] =~ s/[tTuU]//g);
$non = ($original_seq[0] =~ s/[^aAcCgGtTuU-]//g);
$a = ($first_seq =~ s/[aA]//g);
$c = ($first_seq =~ s/[cC]//g);
$g = ($first_seq =~ s/[gG]//g);
$t = ($first_seq =~ s/[tTuU]//g);
$non = ($first_seq =~ s/[^aAcCgGtTuU-]//g);
if ( ($a+$c+$g+$t) >= (($a+$c+$g+$t+$non)*80/100) ){
&failure();
print STDERR "\tYour sequences seem already to be nucleotides\n\tthis program purpose is to turn AMINO ACID alignments into CDS nucleotide alignments\n\n";
print {*STDERR} "\tYour sequences seem already to be nucleotides\n\tthis program purpose is to turn AMINO ACID alignments into CDS nucleotide alignments\n\n";
exit(1);
}
}
......@@ -213,7 +215,7 @@ $date.="_$$"; #Add PID
# run_name implementation for web server
${originalMSA}=${run_name} if (${run_name} ne '');
$originalMSA = $run_name if ( $run_name ne '' );
unlink("${originalMSA}.cds");
unlink("${originalMSA}.cdsP");
......@@ -223,58 +225,57 @@ unlink("${originalMSA}.boj");
# Remove old files from the cache directory
&cacheManagement('old','1'); #everytime ProtoGene is running
&cacheManagement('old', '1'); #everytime ProtoGene is running
##Build the sequences, from the alignment, to perform webblast
my $quiet='';
my $quiet = '';
EACH_SEQ:
for(my $r=0; $r<=$#input_order; $r++){
my $right_name=$original_names[$r];
my $templ_name=$original_names[$r];
$right_name =~ s/^>//;
$templ_name =~ s/^([^\s\t]+).*$/$1/;
my $right_name = $original_names[$r];
my $templ_name = $original_names[$r];
$right_name =~ s/^>//;
$templ_name =~ s/^([^\s\t]+).*$/$1/;
open(my $READY2BLAST, '>', "$cache/${date}_seq2blast.fas${r}");
my $noGap=$original_seq[$r];
$noGap =~ s/[\.-]//g; #remove gaps
my $noGap = $original_seq[$r];
$noGap =~ s/[\.-]//g; #remove gaps
print {$READY2BLAST} ">$input_order[$r]\n$noGap\n";
close $READY2BLAST;
undef $noGap;
my @multipleequalblasthits;
if ( exists($templG->{$templ_name}) and $templG->{$templ_name} ne ''){
@multipleequalblasthits=(@multipleequalblasthits,'My_own_seq'); #To change for ?
print "\n\n$r done\n\nNucleotidic template for $r:\t$templG->{$templ_name}\n\n**********************************************************************\n\n";
if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, 'My_own_seq'); #To change for ?
print "\n\n$r done\n\nNucleotidic template for $r:\t$templG->{$templ_name}\n\n**********************************************************************\n\n";
}
elsif ( exists($templP->{$templ_name}) and $templP->{$templ_name} ne ''){
@multipleequalblasthits=(@multipleequalblasthits,$templP->{$templ_name});
print "\n\n$r done\n\nBlastP template for $r:\t$templP->{$templ_name}\n\n**********************************************************************\n\n";
elsif ( exists($templP->{$templ_name}) && $templP->{$templ_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, $templP->{$templ_name});
print "\n\n$r done\n\nBlastP template for $r:\t$templP->{$templ_name}\n\n**********************************************************************\n\n";
}
else{
## --> send them to webblast
$quiet='-quiet=on ' if ($r>0); #Show webblast parameters only for the first webblast run
&launchWebblast("$cache/${date}_seq2blast.fas${r}",$quiet,$athome,$db,$species,${blast_exe},"$cache/${date}.blastp${r}");
$quiet = '-quiet=on ' if ( $r>0 ); #Show webblast parameters only for the first webblast run
&launchWebblast("$cache/${date}_seq2blast.fas${r}", $quiet, $athome, $db, $species, $blast_exe, "$cache/${date}.blastp${r}");
##Get the BLASTp hit(s) acc number
open(my $BLASTP, '<', "${cache}/${date}.blastp${r}");
while(<$BLASTP>){
@multipleequalblasthits=(@multipleequalblasthits,$1) if ($_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/); #Warning: double '_'
}
close $BLASTP;
unlink("$cache/${date}.blastp${r}","${cache}/${date}_seq2blast.fas${r}") if ($tmp == 0 or exists($multipleequalblasthits[0]) );
if ( ! exists($multipleequalblasthits[0]) ){
print "\n$input_order[$r] ...\tNo blast result found above filter thresholds for $right_name\n\n";
&buildFailureOutputFiles($r,'No_BLASTp_Result','Unavailable','');
undef $original_seq[$r];
undef $original_names[$r];
undef $input_order[$r];
next;
}
open(my $BLASTP, '<', "${cache}/${date}.blastp${r}");
while(<$BLASTP>){
@multipleequalblasthits = (@multipleequalblasthits, $1) if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
}
close $BLASTP;
unlink("$cache/${date}.blastp${r}", "${cache}/${date}_seq2blast.fas${r}") if ( $tmp == 0 || exists($multipleequalblasthits[0]) );
if ( !exists($multipleequalblasthits[0]) ){
print "\n$input_order[$r] ...\tNo blast result found above filter thresholds for $right_name\n\n";
&buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Unavailable', '');
undef $original_seq[$r];
undef $original_names[$r];
undef $input_order[$r];
next EACH_SEQ;
}
}
......@@ -497,9 +498,9 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
=over 8
=item version 3.0.2
=item version 3.0.3
=item on June 15th, 2007
=item on Aug 27th, 2007
=back
......@@ -531,24 +532,24 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
###################### Management ######################
#Failure declaration
sub failure{
print {*STDERR} "\n\nFATAL\n\nTERMINATION STATUS: FAILURE\n\n";
return;
}
sub failure{
print {*STDERR} "\n\nFATAL\n\nTERMINATION STATUS: FAILURE\n\n";
return;
}
#Check external programs presence in the PATH
sub checkProgramsPresence{
my ($webblast_exe,$exonerate)=@_;
my ($webblast_exe, $exonerate) = @_;
if ($webblast_exe eq ''){
&failure;
print {*STDERR} "\twebblast.pl program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
exit(1);
if ( $webblast_exe eq '' ){
&failure;
print {*STDERR} "\twebblast.pl program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
exit(1);
}
if ($exonerate eq ''){
&failure;
print {*STDERR} "\texonerate program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
exit(1);
if ( $exonerate eq '' ){
&failure;
print {*STDERR} "\texonerate program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
exit(1);
}
}
......@@ -556,7 +557,7 @@ sub checkProgramsPresence{
sub checkCacheAccessibility{
my ($cachedir)=@_;
if ( -d "$cachedir/" and -w "$cachedir/" and -x "$cachedir/" and -r "$cachedir/" ){
if ( -d "$cachedir/" && -w "$cachedir/" && -x "$cachedir/" && -r "$cachedir/" ){
}
elsif ( -d "$cachedir/" ){
if ( -o "$cachedir/" ){
......
#
#Version: 3.0.2
#Version: 3.0.3
#OS: Linux
#Author: Sebastien Moretti
#E-mail: moretti.sebastien@gmail.com
#
History of PACMAN enhancements:
3.0.3
Correct seq_name when starting with a blank
e.g.: > truc
Correct when first seq is revtrans and check for nt
3.0.2
Print an error message when input files are already nucleotides
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment