Commit 1151b3f3 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Start to use new NCBI GI format + eutils https + Start new cache mecanism + Cleaning

parent 8aba9ba8
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
use strict; use strict;
use warnings; use warnings;
use diagnostics; use diagnostics;
use Carp;
use File::Which qw(which); # Locate external executable programs in the PATH use File::Which qw(which); # Locate external executable programs in the PATH
use Time::localtime; # Use localtime+PID for a pseudo-uniq temp file name use Time::localtime; # Use localtime+PID for a pseudo-uniq temp file name
...@@ -20,7 +19,8 @@ use LWP::Simple; # To test gigablaster availability ...@@ -20,7 +19,8 @@ use LWP::Simple; # To test gigablaster availability
use Mail::Send; # Send warnings and errors files by e-mail ==> only if the $userEMail variable is defined use Mail::Send; # Send warnings and errors files by e-mail ==> only if the $userEMail variable is defined
use lib '/mnt/common/share/ProtoGene/'; # Local path for ProtoGene's own perl modules # TODO use FinBin for lib and binary paths !!!
use lib '/software/SequenceAnalysis/ProtoGene/4.2.0/lib/'; # Local path for ProtoGene's own perl modules
use Exonerate; # Exonerate runner, parser, ... use Exonerate; # Exonerate runner, parser, ...
use Views; # Non-text outputs, e.g. HTML/CSS use Views; # Non-text outputs, e.g. HTML/CSS
#use CheckOutput; # Check output for cds consistancy with query #use CheckOutput; # Check output for cds consistancy with query
...@@ -28,9 +28,7 @@ use Views; # Non-text outputs, e.g. HTML/CSS ...@@ -28,9 +28,7 @@ use Views; # Non-text outputs, e.g. HTML/CSS
################## CONFIGURATION ################## ################## CONFIGURATION ##################
#$ENV{'PATH'} .= ':/mnt/local/bin/:./'; # Additional path for executables my $cachePath = '/scratch/beegfs/monthly/tcoffee/ProtoGene_Cache'; # Cache directory
my $cachePath = '/scratch/fhgfs/tcoffee/ProtoGene_Cache'; # Cache directory
my $cacheStorageTime = 15; # Do not update sequences younger than X days my $cacheStorageTime = 15; # Do not update sequences younger than X days
my $userEMail = 'moretti.sebastien@gmail.com'; # To receive e-mails with encountered problems; leave blank to inactive my $userEMail = 'moretti.sebastien@gmail.com'; # To receive e-mails with encountered problems; leave blank to inactive
...@@ -49,22 +47,23 @@ my $blast_param = { 'evalue' => 0.05, ...@@ -49,22 +47,23 @@ my $blast_param = { 'evalue' => 0.05,
'cover2' => 95, 'cover2' => 95,
'nbesthits' => 10, 'nbesthits' => 10,
'core' => 1, 'core' => 1, # TODO really used ?
}; };
################################################### ###################################################
my $VERSION = '4.2.0'; my $VERSION = '4.2.2';
my $webblast_exe = '/mnt/common/share/ProtoGene/webblast.pl'; my $webblast_exe = '/software/SequenceAnalysis/ProtoGene/4.2.0/bin/webblast.pl';
my $blast_exe = 'blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage my $blast_exe = 'blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage
my $exonerate_exe = 'exonerate'; # Exonerate 1.0 because current parser only works with this version my $exonerate_exe = 'exonerate'; # Exonerate 1.0 because current parser only works with this version
################## Option management ################## Option management
my ($msa, $revtrans, $pep, $hideBOJ, $run_name, $template, $lim, $cache) = ('', 0, 0, 0, '', '', 0, 'update'); my ($msa, $revtrans, $pep, $hideBOJ, $run_name, $template, $lim) = ('', 0, 0, 0, '', '', 0);
my ($debug, $tmp) = (0, 0); my ($cache, $cleancache) = ('update', 'update');
my ($db, $species, $local, $giga) = ($blast_param->{'db1'}, $blast_param->{'species'}, 0, 0); my ($debug, $tmp) = (0, 0);
my ($db, $species, $local, $giga) = ($blast_param->{'db1'}, $blast_param->{'species'}, 0, 0);
my %opts = ('msa|in=s' => \$msa, # Input sequences my %opts = ('msa|in=s' => \$msa, # Input sequences
'revtrans:s' => \$revtrans, # Use to reverse-translate sequences with no match 'revtrans:s' => \$revtrans, # Use to reverse-translate sequences with no match
'pep' => \$pep, # Add the original peptide query beneath the related CDS seq 'pep' => \$pep, # Add the original peptide query beneath the related CDS seq
...@@ -72,7 +71,8 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences ...@@ -72,7 +71,8 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences
'run_name=s' => \$run_name, # Use another name, instead of input seq name, for result files 'run_name=s' => \$run_name, # Use another name, instead of input seq name, for result files
'template=s' => \$template, # Use a template file 'template=s' => \$template, # Use a template file
'lim=i' => \$lim, # Limit number of input query sequences 'lim=i' => \$lim, # Limit number of input query sequences
'cache=s' => \$cache, # Cache behavior 'cachedir=s' => \$cache, # Cache directory
'cacheclean=s' => \$cleancache, # Cache behavior
'orgm|species=s' => \$species, # Organism(s) to blast against 'orgm|species=s' => \$species, # Organism(s) to blast against
'db|database=s' => \$db, # Database to blast against 'db|database=s' => \$db, # Database to blast against
...@@ -89,36 +89,38 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences ...@@ -89,36 +89,38 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences
'tmp' => \$tmp, # To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ... 'tmp' => \$tmp, # To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ...
); );
my $test_option_values = Getopt::Long::GetOptions(%opts); my $test_option_values = Getopt::Long::GetOptions(%opts);
$revtrans = 1 if ( $revtrans eq '' ); # Allow revtrans to be a boolean or a string option (for tcoffee web server)
################## Short help message ################## Short help message
if ( !$test_option_values || ($msa eq '' && $cache ne 'empty' && $cache ne 'old') ){ if ( !$test_option_values || ($msa eq '' && $cleancache ne 'empty' && $cleancache ne 'old') ){
print {*STDERR} "\n\tCannot open the MSA file in FASTA format print {*STDERR} "\n\tCannot open the MSA file in FASTA format
\tTry: $0 --msa=path_of_the_fasta_msa_file [Options] \tTry: $0 --msa=path_of_the_fasta_msa_file [Options]
\tOptions: --orgm=All_organisms, Bacteria, Viruses, Vertebrata, \tOptions: --orgm =All_organisms, Bacteria, Viruses, Vertebrata,
\t Eukaryota, Mammalia, Primates, Homo_sapiens, \t Eukaryota, Mammalia, Primates, Homo_sapiens,
\t Gallus_gallus, Bos_taurus, Escherichia_coli, \t Gallus_gallus, Bos_taurus, Escherichia_coli,
\t Arabidopsis_thaliana, Mus_musculus, \t Arabidopsis_thaliana, Mus_musculus,
\t Drosophila_Melanogaster, ... \t Drosophila_Melanogaster, ...
\t default is '$blast_param->{'species'}' \t default is '$blast_param->{'species'}'
\t --db=nr, pdb, swissprot, refseq_protein \t --db =nr, pdb, swissprot, refseq_protein
\t default is '$blast_param->{'db1'}' \t default is '$blast_param->{'db1'}'
\t --local to execute a local BLAST query with \t --local to execute a local BLAST query with
\t --db=path_for_a_local_db_blast_formated\n \t --db=path_for_a_local_db_blast_formated\n
\t --template to provide your own nucleotidic sequences \t --template to provide your own nucleotidic sequences
\t following the cds file format \t following the cds file format
\t --revtrans reverse-translates sequences with no \t --revtrans reverse-translates sequences with no
\t blast hit, in IUB (IUPAC) depiction code \t blast hit, in IUB (IUPAC) depiction code
\t They are removed from the alignement by default \t They are removed from the alignement by default
\t --pep adds the original peptide query beneath the \t --pep adds the original peptide query beneath the
\t back-translated sequence \t back-translated sequence
\t --cache=none, update, use, 'own_PATH_directory', old, empty \t --cachedir ='own_PATH_directory'
\t to select the cache mode \t (default is '$cachePath')
\t default is 'update'\n \t --cacheclean=none, update, use, old, empty
\t --debug prints extra information when running \t to select the cache behavior (default is 'update')\n
\t --version prints version information \t --debug prints extra information when running
\t --help prints a full help message\n\n"; \t --version prints version information
\t --help prints a full help message\n\n";
exit(1); exit(1);
} }
...@@ -256,7 +258,7 @@ $date .= "_$$"; #Add PID to be more uniq ...@@ -256,7 +258,7 @@ $date .= "_$$"; #Add PID to be more uniq
# run_name implementation for web server # run_name implementation for web server
$originalMSA = $run_name if ( $run_name ne '' ); $originalMSA = $run_name if ( $run_name ne '' );
unlink("$originalMSA.cds", "$originalMSA.cdsP", "$originalMSA.cdsP.html", unlink("$originalMSA.cds", "$originalMSA.cdsP", "$originalMSA.cdsP.html",
"$originalMSA.out", "$originalMSA.boj", "$originalMSA.log"); "$originalMSA.out", "$originalMSA.boj", "$originalMSA.log");
...@@ -302,10 +304,10 @@ for(my $r=0; $r<=$#original_names; $r++){ ...@@ -302,10 +304,10 @@ for(my $r=0; $r<=$#original_names; $r++){
# Get the BLASTp hit(s) acc number # Get the BLASTp hit(s) acc number
open(my $BLASTP, '<', "$cache/$date.blastp.$r"); open(my $BLASTP, '<', "$cache/$date.blastp.$r");
while(<$BLASTP>){ while(<$BLASTP>){
push @equivalent_blast_hits, $1 if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_' push @equivalent_blast_hits, $1 if ( $_ =~ /^>.+?\@.+?__([^\s\|\.]+).*$/ ); #Warning: double '_'
} }
close $BLASTP; close $BLASTP;
unlink("$cache/$date.blastp.$r", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($equivalent_blast_hits[0]) ); unlink("$cache/$date.blastp.$r", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($equivalent_blast_hits[0]) );
if ( !exists($equivalent_blast_hits[0]) ){ if ( !exists($equivalent_blast_hits[0]) ){
print "\n$r ...\tNo blast result found above thresholds for $fasta_header\n\n"; print "\n$r ...\tNo blast result found above thresholds for $fasta_header\n\n";
buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Not_searched'); buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Not_searched');
...@@ -437,14 +439,14 @@ for(my $r=0; $r<=$#original_names; $r++){ ...@@ -437,14 +439,14 @@ for(my $r=0; $r<=$#original_names; $r++){
} }
checkAndCleanStderrFiles("$cache/$date.ExonerateError") if ( -e "$cache/$date.ExonerateError" ); checkAndCleanStderrFiles("$cache/$date.ExonerateError") if ( -e "$cache/$date.ExonerateError" );
checkOutputFiles(); checkOutputFiles();
unlink("$msa") if ( $converted==1 ); unlink("$msa") if ( $converted==1 );
unlink 'blast_result.txt'; unlink 'blast_result.txt';
my @tmpFiles = glob($cache."/$date*"); my @tmpFiles = glob($cache."/$date*");
if ( exists($tmpFiles[0]) ){ if ( exists($tmpFiles[0]) ){
mkdir("$cache/$date.TMP"); mkdir("$cache/$date.TMP");
move("$_", "$cache/$date.TMP") while(<@tmpFiles>); move("$_", "$cache/$date.TMP") while(<@tmpFiles>);
} }
...@@ -506,7 +508,13 @@ with these options available: ...@@ -506,7 +508,13 @@ with these options available:
=item I<--pep> to add the original peptide query beneath the back-translated sequence =item I<--pep> to add the original peptide query beneath the back-translated sequence
=item I<--cache>=none, update, use, own_path_directory, old, empty =item I<--cachedir>=your_own_path_directory
=item cachedir sets cache directory
=item I<--cacheclean>=none, update, use, old, empty
=item cacheclean manages cache behavior
=item none: no cache usage, none temporary files are stored =item none: no cache usage, none temporary files are stored
...@@ -514,8 +522,6 @@ with these options available: ...@@ -514,8 +522,6 @@ with these options available:
=item use: force use cache, whatever the age of files =item use: force use cache, whatever the age of files
=item own_path_directory: use my own directory, and its files
=item old: remove the old files in the cache directory =item old: remove the old files in the cache directory
=item empty: empty completely the cache directory =item empty: empty completely the cache directory
...@@ -540,15 +546,15 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g ...@@ -540,15 +546,15 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
=item B<Perl 5.6 or better> is required ! =item B<Perl 5.6 or better> is required !
=item Standard Perl modules B<lib>, B<strict>, B<warnings>, B<diagnostics>, B<Carp> are required =item Standard Perl modules B<lib>, B<strict>, B<warnings>, B<diagnostics> are required
=item as well as some other current ones : B<Getopt::Long>, B<File::Which>, B<File::Copy>, B<Mail::Send>, B<Time::localtime>, B<LWP::Simple> =item as well as some other current ones : B<Getopt::Long>, B<File::Which>, B<File::Copy>, B<Mail::Send>, B<Time::localtime>, B<LWP::Simple>
=item - =item -
=item I<exonerate> from http://www.ebi.ac.uk/~guy/exonerate/ =item I<exonerate> version 2 from http://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate
=item I<blast> from http://www.ncbi.nlm.nih.gov/BLAST/download.shtml or http://blast.wustl.edu/ =item I<blast> from ftp://ftp.ncbi.nih.gov/blast/executables/
=back =back
...@@ -556,9 +562,9 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g ...@@ -556,9 +562,9 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
=over 8 =over 8
=item version 4.2.0 =item version 4.2.2
=item on Aug 08th, 2013 =item on Sep 09th, 2016
=back =back
...@@ -885,7 +891,7 @@ sub blastPAcc2PGI{ ...@@ -885,7 +891,7 @@ sub blastPAcc2PGI{
my $protGI = ''; my $protGI = '';
#FIXME: should be ${blastHit}[pacc] but something is broken at NCBI #FIXME: should be ${blastHit}[pacc] but something is broken at NCBI
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch"); my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){ if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$protGI = $1; $protGI = $1;
} }
...@@ -899,7 +905,7 @@ sub protGI2NTGIs{ ...@@ -899,7 +905,7 @@ sub protGI2NTGIs{
my $ntGIs = ''; my $ntGIs = '';
my $geneID = ''; my $geneID = '';
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch"); my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my @xml = split("\n", $content); my @xml = split("\n", $content);
my $flag = 0; my $flag = 0;
...@@ -936,8 +942,8 @@ sub geneID2Chr{ ...@@ -936,8 +942,8 @@ sub geneID2Chr{
my $chr = ''; my $chr = '';
my ($amont, $aval) = ('', ''); my ($amont, $aval) = ('', '');
# my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch"); # my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch"); my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my @xml = split("\n", $content); my @xml = split("\n", $content);
my $flag = 0; my $flag = 0;
...@@ -986,10 +992,10 @@ sub downloadSeqFromGIs{ ...@@ -986,10 +992,10 @@ sub downloadSeqFromGIs{
GET_SEQ: GET_SEQ:
for(my $a=0; $a<=$#acc; $a++){ for(my $a=0; $a<=$#acc; $a++){
if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){ if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]-$amont.fas") ; fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]-$amont.fas") ;
} }
else{ else{
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'", "$cache/$acc[$a].fas"); fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'", "$cache/$acc[$a].fas");
} }
} }
...@@ -1008,17 +1014,17 @@ sub download_seq{ ...@@ -1008,17 +1014,17 @@ sub download_seq{
if ( $pacc2puid !~ /^[NAX][CGTSWZMR]_/ ){ #Not RefSeq acc if ( $pacc2puid !~ /^[NAX][CGTSWZMR]_/ ){ #Not RefSeq acc
#pacc = primary acc NOT prot acc ! #265666 -> S55551 #pacc = primary acc NOT prot acc ! #265666 -> S55551
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch"); my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){ if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$pacc2puid = $1; $pacc2puid = $1;
} }
} }
if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){ if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]--$from-$to.fas"); fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]--$from-$to.fas");
} }
else{ else{
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a].fas"); fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a].fas");
} }
#FIXME Don't remember exactly what all this function does #FIXME Don't remember exactly what all this function does
...@@ -1583,6 +1589,3 @@ sub checkTemplate { ...@@ -1583,6 +1589,3 @@ sub checkTemplate {
return(undef, undef, undef); return(undef, undef, undef);
} }
########################################################
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment