Commit 2a2847eb authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Cleab cache and template checking

parent 1ebccb46
......@@ -17,7 +17,6 @@ use Time::localtime; # Use localtime+PID for a pseudo-uniq
use Getopt::Long; # Options specifications
use Env qw(HOME); # Use only environmental (shell) HOME variable
use File::Copy qw(move); # Avoid external 'mv' command usage
use LWP::Simple; # To test gigablaster availability
......@@ -35,6 +34,7 @@ use Views; # Non-text outputs, e.g. HTML/CSS
$ENV{'PATH'} .= ':/mnt/local/bin/:./'; # Additional path for executables
my $cachePath = '/scratch/cluster/monthly/t_coffee/ProtoGene_Cache'; # Cache directory
$cachePath = '/Users/smoretti/Documents/ProtoGene/TTMMPP'; #FIXME
my $cacheStorageTime = 15; # Do not update sequences younger than X days
my $userEMail = 'moretti.sebastien@gmail.com'; # To receive e-mails with encountered problems; leave blank to inactive
......@@ -44,8 +44,7 @@ my $userEMail = 'moretti.sebastien@gmail.com'; # To
my $VERSION = '4.0.0';
my $webblast_exe = 'webblast.pl';
my $blast_prog = 'blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage
my $blast_exe = $blast_prog;
my $blast_exe = 'blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage
my $exonerate_exe = 'exonerate-1.0'; # Exonerate 1.0 because current parser only works with this version
......@@ -64,7 +63,7 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences
'orgm|species=s' => \$species, # Organism(s) to blast against
'db|database=s' => \$db, # Database to blast against
'local' => \$local, # Use to specify a local db query, definied in $local_db
'local' => \$local, # Use to specify a local db query, defined in $local_db
'giga' => \$giga, # Use GigaBlaster server
'version|V' => sub { print "\n\tPROTOGENE : Bona-Fide Back Translation of Protein Multiple Sequence Alignments\n\tversion : $VERSION\n\n";
......@@ -80,7 +79,8 @@ my $test_option_values = Getopt::Long::GetOptions(%opts);
################## Short help message
if ( !$test_option_values || $msa eq '' || $species eq '' || $db eq '' || $cache eq '' ){
if ( !$test_option_values || ($msa eq '' && $cache ne 'empty' && $cache ne 'old') ){
#|| $species eq '' || $db eq '' ){
print {*STDERR} "\n\tCannot open the MSA file in FASTA format
\tTry: $0 --msa=path_of_the_fasta_msa_file [Options]
......@@ -110,27 +110,36 @@ if ( !$test_option_values || $msa eq '' || $species eq '' || $db eq '' || $cache
}
################## Check external software
checkExternalPrograms($webblast_exe, $exonerate_exe, $blast_exe); # Check external programs presence in the PATH
#FIXME ???
$blast_exe =~ s/${blast_prog}$//;
################## Check cache directory accessibility
checkCacheAccessibility($cachePath);
checkCacheAccessibility($cachePath); # Check cache directory accessibility
# Cache management
$cache = cacheManagement($cache, $cachePath, 0) || '.';
# Remove old files from the cache directory everytime ProtoGene is running
cacheManagement('old', $cachePath, 1);
$local = 2 if ( $giga==1 );
if ( $blast_exe eq '' ){
print {*STDERR} "\n\t$blast_prog program is not reachable\n\tIt could not be in your PATH or not installed\n\tIt is required only for local blast searches\n\n";
################## Check external software in the PATH
checkExternalSoftware($webblast_exe, $exonerate_exe);
if ( $local ){
checkExternalSoftware($blast_exe);
}
#Cache management
$cache = cacheManagement($cache, $cachePath) || '.';
#Open and check template file
my ($templG, $templP, $templSeq);
($templG, $templP, $templSeq) = checkTemplate($template) if ($template);
################## Check input files
# Open and check template file
my ($template_NT, $template_AA, $template_SEQ);
($template_NT, $template_AA, $template_SEQ) = checkTemplate($template) if ( $template );
# Open and Check the input fasta file
$local = 2 if ( $giga==1 );
##Open and Check the fasta file
......@@ -232,12 +241,6 @@ unlink("${originalMSA}.boj");
# Remove old files from the cache directory
cacheManagement('old', $cachePath, '1'); #everytime ProtoGene is running
##Build the sequences, from the alignment, to perform webblast
my $quiet = '';
EACH_SEQ:
......@@ -256,13 +259,13 @@ for(my $r=0; $r<=$#input_order; $r++){
my @multipleequalblasthits;
if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' ){
if ( exists($template_NT->{$templ_name}) && $template_NT->{$templ_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, 'My_own_seq'); #To change for ?
print "\n\n$r done\n\nNucleotidic template for $r:\t$templG->{$templ_name}\n\n**********************************************************************\n\n";
print "\n\n$r done\n\nNucleotidic template for $r:\t$template_NT->{$templ_name}\n\n**********************************************************************\n\n";
}
elsif ( exists($templP->{$templ_name}) && $templP->{$templ_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, $templP->{$templ_name});
print "\n\n$r done\n\nBlastP template for $r:\t$templP->{$templ_name}\n\n**********************************************************************\n\n";
elsif ( exists($template_AA->{$templ_name}) && $template_AA->{$templ_name} ne '' ){
@multipleequalblasthits = (@multipleequalblasthits, $template_AA->{$templ_name});
print "\n\n$r done\n\nBlastP template for $r:\t$template_AA->{$templ_name}\n\n**********************************************************************\n\n";
}
else{
## --> send them to webblast
......@@ -298,13 +301,13 @@ for(my $r=0; $r<=$#input_order; $r++){
my @nt_GIs;
my $intronStep = 0;
if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' && $templG->{$templ_name} !~ /^My_Seq$/i ){
downloadSeq($cache, $date, '', '', $templG->{$templ_name});
@nt_GIs = $templG->{$templ_name};
if ( exists($template_NT->{$templ_name}) && $template_NT->{$templ_name} ne '' && $template_NT->{$templ_name} !~ /^My_Seq$/i ){
downloadSeq($cache, $date, '', '', $template_NT->{$templ_name});
@nt_GIs = $template_NT->{$templ_name};
}
elsif ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' && $templG->{$templ_name} =~ /^My_Seq$/i ){
elsif ( exists($template_NT->{$templ_name}) && $template_NT->{$templ_name} ne '' && $template_NT->{$templ_name} =~ /^My_Seq$/i ){
open(my $MYSEQ, '>', "$cache/My_Seq-$$.fas");
print {$MYSEQ} ">My_Seq-$$ for $templ_name\n".$templSeq->{$templ_name}."\n";
print {$MYSEQ} ">My_Seq-$$ for $templ_name\n".$template_SEQ->{$templ_name}."\n";
close $MYSEQ;
@nt_GIs = "My_Seq-$$";
}
......@@ -474,7 +477,7 @@ with these options available:
=item old: remove the old files in the cache directory
=item empty: empty the whole cache directory $HOME/.ProtoGene/
=item empty: empty completely the cache directory
=item I<--version> to print version information
......@@ -541,14 +544,14 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
###################### Management ######################
#Failure declaration
# Failure declaration
sub failure {
print {*STDERR} "\n\nFATAL\n\nTERMINATION STATUS: FAILURE\n\n";
return;
}
# Check external programs presence in the PATH
sub checkExternalPrograms{
sub checkExternalSoftware {
for my $exe ( @_ ){
if ( ! which($exe) ){
failure();
......@@ -559,63 +562,63 @@ sub checkExternalPrograms{
return;
}
#Check cache directory accessibility: Test and change rights, if needs, of the $HOME/.ProtoGene/ cache directory
sub checkCacheAccessibility{
# Check cache directory accessibility: Test and change rights, if required, of the cache directory
sub checkCacheAccessibility {
my ($cacheDir) = @_;
if ( -d "$cacheDir/" && -w "$cacheDir/" && -x "$cacheDir/" && -r "$cacheDir/" ){
# Good
}
elsif ( -d "$cacheDir/" ){
if ( -o "$cacheDir/" ){
chmod 0754, "$cacheDir/";
}
else{
print {*STDERR} "\n\t** Warning ** :\n\tPermissions do NOT seem to be valid for the current cache directory\n\n";
failure();
print {*STDERR} "\tPermissions do NOT seem to be valid for the '$cacheDir' cache directory\n\n";
exit(2);
}
}
else{
mkdir "$cacheDir/";
mkdir "$cacheDir/"; # FIXME: mkdir is not recursive !
if ( -o "$cacheDir/" ){
chmod 0754, "$cacheDir/"; #0754 est en octal, donc 754 ou '0754' ne sont pas bons !
chmod 0754, "$cacheDir/"; #0754 is in octal, so 754 or '0754' are not good !
}
else{
print {*STDERR} "\n\t** Warning ** :\n\tPermissions do NOT seem to be valid for the current cache directory\n\n";
failure();
print {*STDERR} "\tPermissions do NOT seem to be valid for the '$cacheDir' cache directory\n\n";
exit(2);
}
}
return;
}
#Cache Management
# Cache Behavior Management
sub cacheManagement{
my ($cache, $cacheDir, $inScript) = @_;
$inScript = 0 if ( !defined $inScript );
if ( $cache eq 'empty' ){
my @temp_files = glob($cacheDir.'/*.fas');
if ( exists ($temp_files[0]) ){
unlink $_ while(<@temp_files>);
}
@temp_files = glob($cacheDir.'/*_ExonerateError');
if ( exists ($temp_files[0]) ){
unlink $_ while(<@temp_files>);
}
unlink <$cacheDir/*.fas>;
unlink <$cacheDir/*_ExonerateError>;
unlink("$cacheDir/webblast.log", "$cacheDir/web_tempo.result", "$cacheDir/debug.tempo");
print "\n\tcache directory [$cacheDir] is empty\n\n";
exit 0;
}
elsif ( $cache eq 'old' ){
my @temp_files = glob($cacheDir.'/*.fas');
my @tempFiles = grep { -M $_ > $cacheStorageTime } @temp_files; #Sort list only with $cacheStorageTime days old files
if ( exists ($tempFiles[0]) ){
unlink $_ while(<@tempFiles>);
unlink grep { -M $_ > $cacheStorageTime } glob($cacheDir.'/*.fas'); # Grep list only with $cacheStorageTime days old files
if ( $inScript==0 ){
print "\n\tcache directory [$cacheDir] is empty of its old files\n\n";
exit 0;
}
print "\n\tcache directory [$cacheDir] is empty of its old files\n\n" if ($inScript==0);
exit 0 if ( $inScript==0 );
}
elsif ( $cache eq 'update' || $cache eq 'use' ){
$cache = $cacheDir; #Limit cache files to less than $cacheStorageTime days old ones for 'update'
# DEFAULT: Limit cached files to less than $cacheStorageTime days old ones for 'update'
$cache = $cacheDir;
}
elsif ( $cache eq 'none' ){
# Do not use cache
}
elsif ( -d $cache ){
die "\n\tProtoGene cannot access to your directory\n\n" if ( ! -x $cache );
......@@ -623,7 +626,9 @@ sub cacheManagement{
die "\n\tProtoGene cannot read into your directory\n\n" if ( ! -r $cache );
}
else {
die "\n\tWrong cache argument, or your cache folder doesn't seem to exist\n\n";
failure();
print {*STDERR} "\n\tWrong cache argument, or your cache folder '$cacheDir' doesn't seem to exist\n\n";
exit(3);
}
return($cache);
......@@ -1516,13 +1521,13 @@ sub checkOutputFiles{
####################### Template #######################
sub template_failure {
print {*STDERR} "\tYour template file has NOT the right format. Every lines must look like:\n";
print {*STDERR} "\tYour template file has NOT the right format. Every line must look like:\n";
print {*STDERR} "\t>name1_G_NAcc ... or\n\t>name1 _S_ PAcc ...\n";
print {*STDERR} "\tWith NAcc = accession number or GI (from NCBI) of the nucleotidic target of your query name1\n";
print {*STDERR} "\tWith PAcc = accession number (from NCBI) of the peptidic target of your query name1\n";
print {*STDERR} "\tWith NAcc = identifier or GI (from NCBI) of the nucleotidic target of your query name1\n";
print {*STDERR} "\tWith PAcc = identifier (from NCBI) of the peptidic target of your query name1\n";
print {*STDERR} "\t\tname1 must NOT contain space, '\\s', character\n";
print {*STDERR} "\t\taccession number must NOT be version number of NCBI\n";
print {*STDERR} "\tYou can provide your own nucleotidic sequence, below corresponding template line, with NAcc='My_Seq'\n\n";
print {*STDERR} "\t\tidentifiers must NOT have version number at the end (NCBI)\n";
print {*STDERR} "\tYou can provide your own nucleotidic sequence, below corresponding template line, with NAcc=My_Seq\n\n";
print {*STDERR} "\tE.g.: >CDK2_hs Human Cyclin-Dependent Kinase 2\n";
print {*STDERR} "\t MENFQKVEKIGEGTYGVVYKARNKLTGEVVALK....\n\n";
print {*STDERR} "\t\ttemplate file should be\n";
......@@ -1535,84 +1540,63 @@ sub template_failure {
print {*STDERR} "\t\tif you provide your own nucleotidic target sequence\n\n";
}
sub checkTemplate{
sub checkTemplate {
my ($templateFile) = @_;
if ( -e "$templateFile" && -s "$templateFile" ){
my $fasta_checker = -1;
my (%blastpTarget, %nucTarget, %nucSeq);
if ( -s "$templateFile" ){
my (%proteinTarget, %nucleotideTarget, %nucleotideSeq);
my $fasta_checker = 0;
my $realSeqName = '';
open(my $TEMPLATE, '<', "$templateFile");
my $realSeqName = '';
READ_TEMPLATE:
while(<$TEMPLATE>){
if ( $_ =~ /^>/ && $_ !~ /_[GS]_ *[^ ]*/ ){
failure();
template_failure();
close $TEMPLATE;
exit 1;
return;
}
if ( $_ =~ /^>/ ){
$realSeqName = '';
next READ_TEMPLATE if ( $_ !~ /_[GS]_ *[^ ]*/ );
$fasta_checker++;
my $name = $_;
$name =~ s/\r\n//g; #Remove return lines from windows OS '^M'
$name =~ s{\r\n}{}g; # Remove return lines from windows OS '^M'
$name =~ s{\r}{\n}g; # Replace return lines from Mac OS
chomp($name);
$realSeqName = $name;
$realSeqName =~ s/^(>.*)_[GS]_.*$/$1/;
$realSeqName =~ s/^(>.*)_[GS]_.*$/$1/;
$realSeqName =~ s/ //g;
$realSeqName =~ s/^(>.*?)_[GS]_/$1/;
$realSeqName =~ s{\s}{}g;
$realSeqName = 'wHaT' if ($realSeqName !~ /[\w\>]/);
# print "\nname = $realSeqName\n";
#SI pas la bonne struct _G_ . _S_ . ...
my $protTarget = $name;
$protTarget =~ s/^.*_S_ *([^ ]+).*$/$1/;
$protTarget = '' if ( $protTarget !~ /^[\w\_]+$/ || $protTarget eq 'No_BLASTp_Result' || $protTarget =~ /^My_own_seq$/ );
%blastpTarget = (%blastpTarget, $realSeqName => $protTarget);
# print "PAcc = $blastpTarget{$realSeqName}\n";
my $nuclSeq = $name;
$nuclSeq =~ s/^.*_G_ *([^ ]+).*$/$1/;
$nuclSeq = '' if ( $nuclSeq !~ /^[\w\_]+$/ || $nuclSeq eq 'No_nt_link' || $nuclSeq =~ /navailable$/ || $nuclSeq eq 'Failed_Aln' );
%nucTarget = (%nucTarget, $realSeqName => $nuclSeq);
%nucSeq = (%nucSeq, $realSeqName => '');
# print "NAcc = $nucTarget{$realSeqName}\n";
}
elsif ( $_ !~ /^>/ && exists($nucTarget{$realSeqName}) && $nucTarget{$realSeqName} =~ /^My_Seq$/i ){
my $seqq = $_;
$seqq =~ s/\r\n//g;
$seqq =~ s/[\.\-]//g; #for msa with '.' as gap
$seqq =~ s/[^A-Za-z\*\n\r]//g; #Remove all the non-gap or non-alphabetic characters from the seq
chomp($seqq);
#fasta sequence on 1 line
if ( $nucSeq{$realSeqName} eq '' ){
$nucSeq{$realSeqName} = $seqq;
# print "Seq = $seqq\t$fasta_checker\n" if ($seqq ne '');
}
else {
$nucSeq{$realSeqName} .= $seqq;
# print " $seqq\t$fasta_checker\n" if ($seqq ne '');
}
my $protTarget = $name;
$protTarget =~ s/^.*_S_ *([^ ]+)/$1/;
$protTarget = '' if ( $protTarget !~ /^[\w]+$/ || $protTarget eq 'No_BLASTp_Result' || $protTarget =~ /^My_own_seq$/ );
$proteinTarget{$realSeqName} = $protTarget;
my $nuclTarget = $name;
$nuclTarget =~ s/^.*_G_ *([^ ]+)/$1/;
$nuclTarget = '' if ( $nuclTarget !~ /^[\w]+$/ || $nuclTarget eq 'No_nt_link' || $nuclTarget =~ /navailable$/ || $nuclTarget eq 'Failed_Aln' );
$nucleotideTarget{$realSeqName} = $nuclTarget;
$nucleotideSeq{$realSeqName} = '';
}
else{
next READ_TEMPLATE; #For empty lines !?
elsif ( $_ !~ /^>/ && exists($nucleotideTarget{$realSeqName}) && $nucleotideTarget{$realSeqName} =~ /^My_Seq$/i ){
my $sequence = $_;
$sequence =~ s{\s}{}g;
$sequence =~ s/[\.\-]//g; # for msa with '.' as gap
$sequence =~ s/[^A-Za-z\*]//g; # Remove all the non-gap or non-alphabetic characters from the seq
chomp($sequence);
$nucleotideSeq{$realSeqName} .= $sequence;
}
}
close $TEMPLATE;
if ( $fasta_checker == -1 ){
if ( $fasta_checker == 0 ){
failure();
template_failure();
exit 1;
return;
}
my ($listProtT, $listNucT, $listNtSeq) = (\%blastpTarget, \%nucTarget, \%nucSeq);
return($listNucT, $listProtT, $listNtSeq) if ( $fasta_checker >= 0 );
}
else{
return('', '', '');
return (\%proteinTarget, \%nucleotideTarget, \%nucleotideSeq);
}
return(undef, undef, undef);
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment