Commit 1151b3f3 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Start to use new NCBI GI format + eutils https + Start new cache mecanism + Cleaning

parent 8aba9ba8
......@@ -10,7 +10,6 @@
use strict;
use warnings;
use diagnostics;
use Carp;
use File::Which qw(which); # Locate external executable programs in the PATH
use Time::localtime; # Use localtime+PID for a pseudo-uniq temp file name
......@@ -20,7 +19,8 @@ use LWP::Simple; # To test gigablaster availability
use Mail::Send; # Send warnings and errors files by e-mail ==> only if the $userEMail variable is defined
use lib '/mnt/common/share/ProtoGene/'; # Local path for ProtoGene's own perl modules
# TODO use FinBin for lib and binary paths !!!
use lib '/software/SequenceAnalysis/ProtoGene/4.2.0/lib/'; # Local path for ProtoGene's own perl modules
use Exonerate; # Exonerate runner, parser, ...
use Views; # Non-text outputs, e.g. HTML/CSS
#use CheckOutput; # Check output for cds consistancy with query
......@@ -28,9 +28,7 @@ use Views; # Non-text outputs, e.g. HTML/CSS
################## CONFIGURATION ##################
#$ENV{'PATH'} .= ':/mnt/local/bin/:./'; # Additional path for executables
my $cachePath = '/scratch/fhgfs/tcoffee/ProtoGene_Cache'; # Cache directory
my $cachePath = '/scratch/beegfs/monthly/tcoffee/ProtoGene_Cache'; # Cache directory
my $cacheStorageTime = 15; # Do not update sequences younger than X days
my $userEMail = 'moretti.sebastien@gmail.com'; # To receive e-mails with encountered problems; leave blank to inactive
......@@ -49,22 +47,23 @@ my $blast_param = { 'evalue' => 0.05,
'cover2' => 95,
'nbesthits' => 10,
'core' => 1,
'core' => 1, # TODO really used ?
};
###################################################
my $VERSION = '4.2.0';
my $VERSION = '4.2.2';
my $webblast_exe = '/mnt/common/share/ProtoGene/webblast.pl';
my $webblast_exe = '/software/SequenceAnalysis/ProtoGene/4.2.0/bin/webblast.pl';
my $blast_exe = 'blastall'; # Or wu-blastall for Wu-BLAST; for local blast usage
my $exonerate_exe = 'exonerate'; # Exonerate 1.0 because current parser only works with this version
################## Option management
my ($msa, $revtrans, $pep, $hideBOJ, $run_name, $template, $lim, $cache) = ('', 0, 0, 0, '', '', 0, 'update');
my ($debug, $tmp) = (0, 0);
my ($db, $species, $local, $giga) = ($blast_param->{'db1'}, $blast_param->{'species'}, 0, 0);
my ($msa, $revtrans, $pep, $hideBOJ, $run_name, $template, $lim) = ('', 0, 0, 0, '', '', 0);
my ($cache, $cleancache) = ('update', 'update');
my ($debug, $tmp) = (0, 0);
my ($db, $species, $local, $giga) = ($blast_param->{'db1'}, $blast_param->{'species'}, 0, 0);
my %opts = ('msa|in=s' => \$msa, # Input sequences
'revtrans:s' => \$revtrans, # Use to reverse-translate sequences with no match
'pep' => \$pep, # Add the original peptide query beneath the related CDS seq
......@@ -72,7 +71,8 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences
'run_name=s' => \$run_name, # Use another name, instead of input seq name, for result files
'template=s' => \$template, # Use a template file
'lim=i' => \$lim, # Limit number of input query sequences
'cache=s' => \$cache, # Cache behavior
'cachedir=s' => \$cache, # Cache directory
'cacheclean=s' => \$cleancache, # Cache behavior
'orgm|species=s' => \$species, # Organism(s) to blast against
'db|database=s' => \$db, # Database to blast against
......@@ -89,36 +89,38 @@ my %opts = ('msa|in=s' => \$msa, # Input sequences
'tmp' => \$tmp, # To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ...
);
my $test_option_values = Getopt::Long::GetOptions(%opts);
$revtrans = 1 if ( $revtrans eq '' ); # Allow revtrans to be a boolean or a string option (for tcoffee web server)
################## Short help message
if ( !$test_option_values || ($msa eq '' && $cache ne 'empty' && $cache ne 'old') ){
if ( !$test_option_values || ($msa eq '' && $cleancache ne 'empty' && $cleancache ne 'old') ){
print {*STDERR} "\n\tCannot open the MSA file in FASTA format
\tTry: $0 --msa=path_of_the_fasta_msa_file [Options]
\tOptions: --orgm=All_organisms, Bacteria, Viruses, Vertebrata,
\t Eukaryota, Mammalia, Primates, Homo_sapiens,
\t Gallus_gallus, Bos_taurus, Escherichia_coli,
\t Arabidopsis_thaliana, Mus_musculus,
\t Drosophila_Melanogaster, ...
\t default is '$blast_param->{'species'}'
\t --db=nr, pdb, swissprot, refseq_protein
\t default is '$blast_param->{'db1'}'
\t --local to execute a local BLAST query with
\t --db=path_for_a_local_db_blast_formated\n
\t --template to provide your own nucleotidic sequences
\t following the cds file format
\t --revtrans reverse-translates sequences with no
\t blast hit, in IUB (IUPAC) depiction code
\t They are removed from the alignement by default
\t --pep adds the original peptide query beneath the
\t back-translated sequence
\t --cache=none, update, use, 'own_PATH_directory', old, empty
\t to select the cache mode
\t default is 'update'\n
\t --debug prints extra information when running
\t --version prints version information
\t --help prints a full help message\n\n";
\tOptions: --orgm =All_organisms, Bacteria, Viruses, Vertebrata,
\t Eukaryota, Mammalia, Primates, Homo_sapiens,
\t Gallus_gallus, Bos_taurus, Escherichia_coli,
\t Arabidopsis_thaliana, Mus_musculus,
\t Drosophila_Melanogaster, ...
\t default is '$blast_param->{'species'}'
\t --db =nr, pdb, swissprot, refseq_protein
\t default is '$blast_param->{'db1'}'
\t --local to execute a local BLAST query with
\t --db=path_for_a_local_db_blast_formated\n
\t --template to provide your own nucleotidic sequences
\t following the cds file format
\t --revtrans reverse-translates sequences with no
\t blast hit, in IUB (IUPAC) depiction code
\t They are removed from the alignement by default
\t --pep adds the original peptide query beneath the
\t back-translated sequence
\t --cachedir ='own_PATH_directory'
\t (default is '$cachePath')
\t --cacheclean=none, update, use, old, empty
\t to select the cache behavior (default is 'update')\n
\t --debug prints extra information when running
\t --version prints version information
\t --help prints a full help message\n\n";
exit(1);
}
......@@ -256,7 +258,7 @@ $date .= "_$$"; #Add PID to be more uniq
# run_name implementation for web server
$originalMSA = $run_name if ( $run_name ne '' );
$originalMSA = $run_name if ( $run_name ne '' );
unlink("$originalMSA.cds", "$originalMSA.cdsP", "$originalMSA.cdsP.html",
"$originalMSA.out", "$originalMSA.boj", "$originalMSA.log");
......@@ -302,10 +304,10 @@ for(my $r=0; $r<=$#original_names; $r++){
# Get the BLASTp hit(s) acc number
open(my $BLASTP, '<', "$cache/$date.blastp.$r");
while(<$BLASTP>){
push @equivalent_blast_hits, $1 if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
push @equivalent_blast_hits, $1 if ( $_ =~ /^>.+?\@.+?__([^\s\|\.]+).*$/ ); #Warning: double '_'
}
close $BLASTP;
unlink("$cache/$date.blastp.$r", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($equivalent_blast_hits[0]) );
unlink("$cache/$date.blastp.$r", "$cache/$date.seq2blast.fas.$r") if ( $tmp == 0 || exists($equivalent_blast_hits[0]) );
if ( !exists($equivalent_blast_hits[0]) ){
print "\n$r ...\tNo blast result found above thresholds for $fasta_header\n\n";
buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Not_searched');
......@@ -437,14 +439,14 @@ for(my $r=0; $r<=$#original_names; $r++){
}
checkAndCleanStderrFiles("$cache/$date.ExonerateError") if ( -e "$cache/$date.ExonerateError" );
checkAndCleanStderrFiles("$cache/$date.ExonerateError") if ( -e "$cache/$date.ExonerateError" );
checkOutputFiles();
unlink("$msa") if ( $converted==1 );
unlink("$msa") if ( $converted==1 );
unlink 'blast_result.txt';
my @tmpFiles = glob($cache."/$date*");
if ( exists($tmpFiles[0]) ){
mkdir("$cache/$date.TMP");
move("$_", "$cache/$date.TMP") while(<@tmpFiles>);
move("$_", "$cache/$date.TMP") while(<@tmpFiles>);
}
......@@ -506,7 +508,13 @@ with these options available:
=item I<--pep> to add the original peptide query beneath the back-translated sequence
=item I<--cache>=none, update, use, own_path_directory, old, empty
=item I<--cachedir>=your_own_path_directory
=item cachedir sets cache directory
=item I<--cacheclean>=none, update, use, old, empty
=item cacheclean manages cache behavior
=item none: no cache usage, none temporary files are stored
......@@ -514,8 +522,6 @@ with these options available:
=item use: force use cache, whatever the age of files
=item own_path_directory: use my own directory, and its files
=item old: remove the old files in the cache directory
=item empty: empty completely the cache directory
......@@ -540,15 +546,15 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
=item B<Perl 5.6 or better> is required !
=item Standard Perl modules B<lib>, B<strict>, B<warnings>, B<diagnostics>, B<Carp> are required
=item Standard Perl modules B<lib>, B<strict>, B<warnings>, B<diagnostics> are required
=item as well as some other current ones : B<Getopt::Long>, B<File::Which>, B<File::Copy>, B<Mail::Send>, B<Time::localtime>, B<LWP::Simple>
=item -
=item I<exonerate> from http://www.ebi.ac.uk/~guy/exonerate/
=item I<exonerate> version 2 from http://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate
=item I<blast> from http://www.ncbi.nlm.nih.gov/BLAST/download.shtml or http://blast.wustl.edu/
=item I<blast> from ftp://ftp.ncbi.nih.gov/blast/executables/
=back
......@@ -556,9 +562,9 @@ PROTOGENE re-builds the original alignment with nucleotidic information it has g
=over 8
=item version 4.2.0
=item version 4.2.2
=item on Aug 08th, 2013
=item on Sep 09th, 2016
=back
......@@ -885,7 +891,7 @@ sub blastPAcc2PGI{
my $protGI = '';
#FIXME: should be ${blastHit}[pacc] but something is broken at NCBI
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$protGI = $1;
}
......@@ -899,7 +905,7 @@ sub protGI2NTGIs{
my $ntGIs = '';
my $geneID = '';
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my @xml = split("\n", $content);
my $flag = 0;
......@@ -936,8 +942,8 @@ sub geneID2Chr{
my $chr = '';
my ($amont, $aval) = ('', '');
# my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
# my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my @xml = split("\n", $content);
my $flag = 0;
......@@ -986,10 +992,10 @@ sub downloadSeqFromGIs{
GET_SEQ:
for(my $a=0; $a<=$#acc; $a++){
if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]-$amont.fas") ;
fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]-$amont.fas") ;
}
else{
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'", "$cache/$acc[$a].fas");
fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$acc[$a]&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'", "$cache/$acc[$a].fas");
}
}
......@@ -1008,17 +1014,17 @@ sub download_seq{
if ( $pacc2puid !~ /^[NAX][CGTSWZMR]_/ ){ #Not RefSeq acc
#pacc = primary acc NOT prot acc ! #265666 -> S55551
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$pacc2puid = $1;
}
}
if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]--$from-$to.fas");
fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]--$from-$to.fas");
}
else{
fetch_fasta("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a].fas");
fetch_fasta("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a].fas");
}
#FIXME Don't remember exactly what all this function does
......@@ -1583,6 +1589,3 @@ sub checkTemplate {
return(undef, undef, undef);
}
########################################################
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment