ProtoGene.pl 63.1 KB
Newer Older
Sebastien Moretti's avatar
Sebastien Moretti committed
1
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env perl

########################################################
#                                                      #
# ProtoGene : Bona-Fide Back Translation of Protein    #
#             Multiple Sequence Alignments             #
#                                                      #
########################################################

use strict;
use warnings;
use diagnostics;
Sebastien Moretti's avatar
Sebastien Moretti committed
13
14
use Carp;

Sebastien Moretti's avatar
Sebastien Moretti committed
15
16
17
use Env qw(HOME);                          # Use only environmental (shell) HOME variable
use File::Basename;
use File::Copy qw(move);                   # Avoid external 'mv' command usage
18
use LWP::Simple;                           # To test gigablaster availability
Sebastien Moretti's avatar
Sebastien Moretti committed
19

20
use lib '/mnt/common/share/ProtoGene/';    # Local path for ProtoGene's own perl modules
Sebastien Moretti's avatar
Sebastien Moretti committed
21
22
23
24
25
26
use lib '/mnt/local/lib/tcoffee_perl/';
use Getopt::Long;                          # For options specifications
use File::Which qw(which);                 # Locate external executable programs in the PATH
#use Time::Format qw(%time);               # Use local time for part of pseudo-uniq temp file name
use Mail::Send;                            # Send warnings and errors files by e-mail ==> only if the $userEMail variable is defined

Sebastien Moretti's avatar
Sebastien Moretti committed
27
use Exonerate;                             # Exonerate runner, parser, ...
28
use Views;                                 # Non-text outputs, e.g. HTML/CSS
Sebastien Moretti's avatar
Sebastien Moretti committed
29
#use CheckOutput;                       # Check output for cds consistancy with query
Sebastien Moretti's avatar
Sebastien Moretti committed
30
31
32
33
34
35



############## Specific to our server
$ENV{'PATH'} .= ':/mnt/local/bin/'; # additional path for executable on the server
##############
Sebastien Moretti's avatar
Sebastien Moretti committed
36
#TODO: remove problematic char when "** FATAL ERROR **: Unrecognised symbol '0' (ascii:48) fi..."
Sebastien Moretti's avatar
Sebastien Moretti committed
37
38


39
my $VERSION  = '4.0.0';
40
41
my $uct      = 15;                                                  # UpdateCacheThreshold: number of days before update
my $cachedir = '/scratch/cluster/monthly/t_coffee/ProtoGene_Cache'; # Cache directory
Sebastien Moretti's avatar
Sebastien Moretti committed
42
##### User settings ####################################
43
my $userEMail    = 'moretti.sebastien@gmail.com';      # To receive an e-mail with encountered problems; leave blank to inactive this option
44
my $webblast_exe = which('webblast.pl')   || '';       #
45
46
my $blast_prog   = 'blastall';                         # Or wu-blastall for Wu-BLAST; for local blast usage
my $exonerate    = which('exonerate-1.0') || '';       #
Sebastien Moretti's avatar
Sebastien Moretti committed
47
########################################################
48
my $blast_exe = which($blast_prog) || '';
49
my $doc       = which('perldoc')   || '';
50
51
#my $date = $time{'yy-mm-dd_hh\hmm-ss.mmm'};   #Used to get a unique file
my $date  = `/usr/bin/env date +%y-%m-%d_%T`; #Used to get a unique file
Sebastien Moretti's avatar
Sebastien Moretti committed
52
53
54
chomp($date);
$date =~ s/\:/\-/g;

Sebastien Moretti's avatar
Sebastien Moretti committed
55
checkProgramsPresence($webblast_exe, $exonerate); # Check external programs presence in the PATH
Sebastien Moretti's avatar
Sebastien Moretti committed
56
57
$blast_exe =~ s/${blast_prog}$//;

Sebastien Moretti's avatar
Sebastien Moretti committed
58
checkCacheAccessibility($cachedir); # Check cache directory accessibility
Sebastien Moretti's avatar
Sebastien Moretti committed
59
60
61
62



##Options management
63
my ($msa, $species, $db, $athome, $revtrans, $pep, $hideBOJ, $debug)   = ('', 'All_organisms', 'refseq_protein', 0, 0, 0, 0, 0);
64
65
my ($version, $help, $run_name, $Cache, $template, $lim, $tmp) = (0, 0, '', 'update', '', 0, 0);
my $giga = 0;
Sebastien Moretti's avatar
Sebastien Moretti committed
66
67
68
69
70
71
72
73
74
GetOptions("msa|in=s"       => \$msa,        #Input sequences
           "orgm|species=s" => \$species,    #Organism(s) to blast against
           "db|database=s"  => \$db,         #Database to blast against
           "local"          => \$athome,     #Use to specify a local db query, definied in $local_db
           "giga"           => \$giga,       #Use GigaBlaster server
           "revtrans"       => \$revtrans,   #Use to reverse-translate sequences with no match
           "pep"            => \$pep,        #Add the original peptide query beneath the linked nt seq
           "version|v"      => \$version,    #Print version information
           "help|h"         => \$help,       #Print full help message
75
           "run_name=s"     => \$run_name,   #Use another name, instead of input seq name, for result files
Sebastien Moretti's avatar
Sebastien Moretti committed
76
           "cache=s"        => \$Cache,      #Specify what cache parameter
77
           "hideBOJ"        => \$hideBOJ,
78
           "debug"          => \$debug,      #Verbose output
Sebastien Moretti's avatar
Sebastien Moretti committed
79
80
           "template=s"     => \$template,   #Use a template file
           "lim=i"          => \$lim,        #Limite number of input query sequences
81
           "tmp"            => \$tmp,        #To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ...
Sebastien Moretti's avatar
Sebastien Moretti committed
82
           );
83
84
$athome = 2 if ( $giga==1 );
if ( $version==1 ){
85
    print "\n\tPROTOGENE : Bona-Fide Back Translation of Protein Multiple Sequence Alignments\n\tversion   : $VERSION\n\n";
86
    exit 0;
Sebastien Moretti's avatar
Sebastien Moretti committed
87
}
88
89
if ( $help==1 ){
    if ( $doc ne '' ){
90
        system("$doc $0");
Sebastien Moretti's avatar
Sebastien Moretti committed
91
92
    }
    else{
93
        print {*STDERR} "\n\tperldoc command doesn't seem to be available\n\tto see the documentation in pod format\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
94
    }
95
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
96
}
97
if ( $blast_exe eq '' && $version==0 && $help==0 ){
98
    print {*STDERR} "\n\t$blast_prog program is not reachable\n\tIt could not be in your PATH or not installed\n\tIt is required only for local blast searches\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
99
100
101
}

#Cache management
Sebastien Moretti's avatar
Sebastien Moretti committed
102
my $cache = cacheManagement($Cache) || '.';
Sebastien Moretti's avatar
Sebastien Moretti committed
103
104
105


#Open and check template file
106
my ($templG, $templP, $templSeq);
Sebastien Moretti's avatar
Sebastien Moretti committed
107
($templG, $templP, $templSeq) = checkTemplate($template) if ($template);
Sebastien Moretti's avatar
Sebastien Moretti committed
108
109
110


#Short help message
111
if ( $msa eq '' || $species eq '' || $db eq '' || $cache eq '' ){
112
    my $appli = basename($0);
113
    print {*STDERR} "\n\tCannot open the MSA file in FASTA format
Sebastien Moretti's avatar
Sebastien Moretti committed
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
\tTry:  $appli --msa=path_of_the_fasta_msa_file [Options]

\tOptions: --orgm=All_organisms, Bacteria, Viruses, Vertebrata,
\t                Eukaryota, Mammalia, Primates, Homo_sapiens,
\t                Gallus_gallus, Bos_taurus, Escherichia_coli,
\t                Arabidopsis_thaliana, Mus_musculus,
\t                Drosophila_Melanogaster, ...
\t                default is 'All_organisms'
\t         --db=nr, pdb, swissprot, refseq_protein
\t                default is 'refseq_protein'
\t         --local to execute a local BLAST query with
\t                --db=path_for_a_local_db_blast_formated
\t         --template to provide your own nucleotidic sequences
\t                following the cds file format
\t         --revtrans to reverse-translate sequences with no
\t                blast hit, in IUB (IUPAC) depiction code
\t                They are removed from the alignement by default
\t         --pep  to add the original peptide query beneath the
\t                back-translated sequence
\t         --cache=none, update, use, own_PATH_directory, old, empty
\t                to select the cache mode
\t                default is 'update'
\t         --version to print version information
\t         --help to print a full help message\n\n";
138
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
139
140
141
142
143
}



##Open and Check the fasta file
144
145
my $originalMSA   = $msa;
my $change        = 0;
Sebastien Moretti's avatar
Sebastien Moretti committed
146
($msa, $change)   = checkFastaFile($msa, $change);
147
my $fasta_checker = -1;
148
my (@input_order, @original_names, @original_seq); #Use 3 lists in parallel : original_names, seq, order(1->n)
149
open( my $MSA, '<', "$msa" ) if ( -e "$msa" );
150
151
while(<$MSA>){
    if ( $_ =~ /^>/ ){
152
153
154
155
156
157
158
        $fasta_checker++;
        my $name = $_;
        $name    =~ s/\r\n//g; #Remove return lines from windows OS '^M'
        $name    =~ s/^>[ \t]+/>/;
        chomp($name);
        @original_names = (@original_names, $name);
        @input_order    = (@input_order, $fasta_checker);
Sebastien Moretti's avatar
Sebastien Moretti committed
159
    }
160
    elsif ( $_ !~ /^>/ ){
161
162
163
        my $seqq = $_;
        $seqq    =~ s/\r\n//g;
        $seqq    =~ s/\./-/g; #for msa with '.' as gap
164
        $seqq    =~ s{[BJOUZ]}{X}ig; #U here for selenocystein
165
166
        $seqq    =~ s/[^A-Za-z\-\*\n\r]//g; #Remove all the non-gap or non-alphabetic characters from the seq
        chomp($seqq);
Sebastien Moretti's avatar
Sebastien Moretti committed
167
        #fasta sequence on 1 line
168
        if ( !exists($original_seq[$fasta_checker]) ){
169
170
171
172
173
            @original_seq = (@original_seq, $seqq);
        }
        else {
            $original_seq[$fasta_checker] = $original_seq[$fasta_checker].$seqq;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
174
175
    }
}
176
close $MSA;
177
if ( $fasta_checker == -1 ){
Sebastien Moretti's avatar
Sebastien Moretti committed
178
    failure();
Sebastien Moretti's avatar
Sebastien Moretti committed
179
    print {*STDERR} "\tThe MSA file does NOT seem to be a protein FASTA format\n\tSee <a href='http://en.wikipedia.org/wiki/FASTA_format' target='_blank'>FASTA format</a>\n\n";
180
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
181
}
182
elsif ( $lim >0 && $fasta_checker > $lim ){
Sebastien Moretti's avatar
Sebastien Moretti committed
183
    failure();
184
    print {*STDERR} "\tThe FASTA file is too large, try with less than $lim sequences\n\tor split your file\n\n";
185
186
    exit(1);
}
187
elsif ( exists( $original_seq[0] ) && $original_seq[0] =~ /[acgtu]/i ){
188
    my $first_seq = $original_seq[0];
189
    #Check if sequences are amino acids and not nucleotides
190
191
192
    my $a   = ($first_seq =~ s/[aA]//g)   || 0;
    my $c   = ($first_seq =~ s/[cC]//g)   || 0;
    my $g   = ($first_seq =~ s/[gG]//g)   || 0;
193
194
    my $t   = ($first_seq =~ s/[tTuU]//g) || 0;
    my $non = ($first_seq =~ s/[^aAcCgGtTuUXxNn-]//g) || 0;
195
    if ( ($a+$c+$g+$t) >= (($a+$c+$g+$t+$non)*80/100) ){
Sebastien Moretti's avatar
Sebastien Moretti committed
196
        failure();
197
        print {*STDERR} "\tYour sequences seem already to be nucleotides\n\tthis program purpose is to turn AMINO ACID alignments into CDS nucleotide alignments\n\n";
198
199
        exit(1);
    }
Sebastien Moretti's avatar
Sebastien Moretti committed
200
201
202
203
}
undef $fasta_checker;


204
205
206
207
208
209
210
211
212
#Check GigaBlaster status if used
if ( $giga==1 ){
    if ( head('http://www.igs.cnrs-mrs.fr/Giga2/~database/remoteblast.cgi') ){
    }
    else{
        $athome = 0;
    }
}

Sebastien Moretti's avatar
Sebastien Moretti committed
213
214

#Start main program with version # of programs and list original queries
215
print "\n\t   Protogene\t$VERSION\t$date\n\n";
216
217
open(my $EXONERATEISHERE, "$exonerate --version |");
my $IsExonerateHere = <$EXONERATEISHERE>;
218
close $EXONERATEISHERE;
Sebastien Moretti's avatar
Sebastien Moretti committed
219
220
print "\t   $IsExonerateHere\n";
undef $IsExonerateHere;
221
undef $VERSION;
Sebastien Moretti's avatar
Sebastien Moretti committed
222
223
224
for(my $m=0; $m<=$#original_names; $m++){
    print "\t$original_names[$m]\n";
}
225
$date .= "_$$"; #Add PID
Sebastien Moretti's avatar
Sebastien Moretti committed
226
227
228
229



# run_name implementation for web server
230
$originalMSA = $run_name if ( $run_name ne '' );
Sebastien Moretti's avatar
Sebastien Moretti committed
231
232
233

unlink("${originalMSA}.cds");
unlink("${originalMSA}.cdsP");
Sebastien Moretti's avatar
Sebastien Moretti committed
234
unlink("${originalMSA}.cdsP.html");
Sebastien Moretti's avatar
Sebastien Moretti committed
235
236
237
238
239
240
unlink("${originalMSA}.out");
unlink("${originalMSA}.boj");



# Remove old files from the cache directory
Sebastien Moretti's avatar
Sebastien Moretti committed
241
cacheManagement('old', '1'); #everytime ProtoGene is running
Sebastien Moretti's avatar
Sebastien Moretti committed
242
243
244
245
246




##Build the sequences, from the alignment, to perform webblast
247
248
my $quiet = '';
EACH_SEQ:
Sebastien Moretti's avatar
Sebastien Moretti committed
249
for(my $r=0; $r<=$#input_order; $r++){
250
251
252
253
    my $right_name = $original_names[$r];
    my $templ_name = $original_names[$r];
    $right_name    =~ s/^>//;
    $templ_name    =~ s/^([^\s\t]+).*$/$1/;
Sebastien Moretti's avatar
Sebastien Moretti committed
254

255
    open(my $READY2BLAST, '>', "$cache/${date}_seq2blast.fas${r}");
256
257
    my $noGap = $original_seq[$r];
    $noGap    =~ s/[\.-]//g; #remove gaps
258
259
    print {$READY2BLAST} ">$input_order[$r]\n$noGap\n";
    close $READY2BLAST;
Sebastien Moretti's avatar
Sebastien Moretti committed
260
261
262
263
    undef $noGap;


    my @multipleequalblasthits;
264
265
266
    if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' ){
        @multipleequalblasthits = (@multipleequalblasthits, 'My_own_seq'); #To change for ?
        print "\n\n$r done\n\nNucleotidic template for $r:\t$templG->{$templ_name}\n\n**********************************************************************\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
267
    }
268
269
270
    elsif ( exists($templP->{$templ_name}) && $templP->{$templ_name} ne '' ){
        @multipleequalblasthits = (@multipleequalblasthits, $templP->{$templ_name});
        print "\n\n$r done\n\nBlastP template for $r:\t$templP->{$templ_name}\n\n**********************************************************************\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
271
272
    }
    else{
273
    ##   --> send them to webblast
274
        $quiet = '-quiet=on ' if ( $r>0 ); #Show webblast parameters only for the first webblast run
Sebastien Moretti's avatar
Sebastien Moretti committed
275
        launchWebblast("$cache/${date}_seq2blast.fas${r}", $quiet, $athome, $db, $species, $blast_exe, "$cache/${date}.blastp${r}");
Sebastien Moretti's avatar
Sebastien Moretti committed
276

277
    ##Get the BLASTp hit(s)  acc number
278
279
280
281
282
283
284
285
        open(my $BLASTP, '<', "${cache}/${date}.blastp${r}");
        while(<$BLASTP>){
            @multipleequalblasthits = (@multipleequalblasthits, $1) if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
        }
        close $BLASTP;
        unlink("$cache/${date}.blastp${r}", "${cache}/${date}_seq2blast.fas${r}") if ( $tmp == 0 || exists($multipleequalblasthits[0]) );
        if ( !exists($multipleequalblasthits[0]) ){
            print "\n$input_order[$r] ...\tNo blast result found above filter thresholds for $right_name\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
286
            buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Unavailable', '');
287
288
289
290
291
            undef $original_seq[$r];
            undef $original_names[$r];
            undef $input_order[$r];
            next EACH_SEQ;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
292
293
294
295
296
    }



    #When multiple equal blast hits  ==> use, and add, every hits
297
298
299
    my ($resultPOS, $resultBOJ) = ('', '');
    my @failureStatus = ('');
    my @intronStatus  = ('');
Sebastien Moretti's avatar
Sebastien Moretti committed
300
    for(my $qq=0; $qq<=$#multipleequalblasthits; $qq++){
301
302
303
304
305
306
        print "\n${right_name}  -->  [${multipleequalblasthits[$qq]}]\n";


        my @nt_GIs;
        my $intronStep = 0;
        if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' && $templG->{$templ_name} !~ /^My_Seq$/i ){
Sebastien Moretti's avatar
Sebastien Moretti committed
307
            downloadSeq($cache, $date, '', '', $templG->{$templ_name});
308
309
310
311
312
313
314
315
316
317
            @nt_GIs = $templG->{$templ_name};
        }
        elsif ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' && $templG->{$templ_name} =~ /^My_Seq$/i ){
            open(my $MYSEQ, '>', "$cache/My_Seq-$$.fas");
            print {$MYSEQ} ">My_Seq-$$ for $templ_name\n".$templSeq->{$templ_name}."\n";
            close $MYSEQ;
            @nt_GIs = "My_Seq-$$";
        }
        else{
        #Prot ACC -> PUID
Sebastien Moretti's avatar
Sebastien Moretti committed
318
            my $protGI = blastPAcc2PGI($multipleequalblasthits[$qq]);
319
320
321
322
323
            if ( $protGI eq '' ){
                print "\tNo protein (prot GI) link found for ${multipleequalblasthits[$qq]} in $right_name\n\n";
                @failureStatus = ('PUI_unavailable', ${multipleequalblasthits[$qq]});
                next;
            }
Sebastien Moretti's avatar
Sebastien Moretti committed
324
325
#           print "\t$protGI\n";

326
        #PUID -> nt GIs & GeneID
Sebastien Moretti's avatar
Sebastien Moretti committed
327
            my ($ntGIs, $geneID) = protGI2NTGIs($protGI, $multipleequalblasthits[$qq]);
328
329
330
331
332
333
334
335
336
337
            if ( $ntGIs eq '' && $geneID eq '' ){
                print "\tNo nucleotide (nt GIs) link found for ${multipleequalblasthits[$qq]} in $right_name\n\n";
                @failureStatus = ('No_nt_link', ${multipleequalblasthits[$qq]});
                next;
            }
            print " linked with [$ntGIs $geneID]\n";


        #Get transcript and contig seq
            @nt_GIs = split(/,/, $ntGIs);
Sebastien Moretti's avatar
Sebastien Moretti committed
338
            downloadSeqFromGIs($cache, $date, '', '', @nt_GIs);
339
340
341
342
        #GeneID -> Chr
            my @geneIDs = split(/,/, $geneID);
            while(<@geneIDs>){
                my $geneID = $_;
Sebastien Moretti's avatar
Sebastien Moretti committed
343
                my ($chr, $amont, $aval) = geneID2Chr($geneID, $multipleequalblasthits[$qq]);
344
345
346
347
                print "\n\tNo gene locus found for ${multipleequalblasthits[$qq]} with $geneID in $right_name\n\n" if ($chr eq '' and $geneID ne '');

                if ( $chr ne '' ){
            #Get Chr seq
Sebastien Moretti's avatar
Sebastien Moretti committed
348
                    downloadSeq($cache, $date, $amont, $aval, $chr);
349
350
351
352
353
354
355
356
357
                    $intronStep = 1;

                    @nt_GIs = ("$chr:$amont-$aval", @nt_GIs);
                }
            }
        }


    #Align Nt seq with our protein query seq
Sebastien Moretti's avatar
Sebastien Moretti committed
358
        my ($POS, $BOJ) = runExonerate($exonerate, $cache, $date, ${input_order[$r]}, ${original_seq[$r]}, $right_name, $multipleequalblasthits[$qq], $r, @nt_GIs);
359

Sebastien Moretti's avatar
Sebastien Moretti committed
360
        my ($POSresult, $bestNucleotide) = prepareResults4CDS($POS, $multipleequalblasthits[$qq], ${original_seq[$r]}, $right_name);
361
362
363
364
365
366
367
368
        $resultPOS .= $POSresult if ( $bestNucleotide ne '' && $POSresult ne '' && $resultPOS !~ /_G_$bestNucleotide/ );

        @intronStatus = ($POS->{0}, ${multipleequalblasthits[$qq]}, '')              if ( $intronStep==1 && exists($POS->{0}) && $POS->{0} ne '' && $POS->{0} =~ /\:/ );
        @intronStatus = ($POS->{0}, ${multipleequalblasthits[$qq]}, $bestNucleotide) if ( $intronStep==0 && ($bestNucleotide =~ /^N[CTWZG]_/ || $bestNucleotide =~ /^AC_/) );

        my ($intronLess, $nameLess) = ('', '');
        $intronLess = $POS->{0}       if ( %$BOJ eq 0 && exists($POS->{0}) && $POS->{0} ne '' );
        $nameLess   = $bestNucleotide if ( %$BOJ eq 0 && $intronStep==0 && ($bestNucleotide =~ /^N[CTWZG]_/ || $bestNucleotide =~ /^AC_/) );
Sebastien Moretti's avatar
Sebastien Moretti committed
369
        my ($BOJresult, $bestGenomic) = prepareResults4BOJ($BOJ, $multipleequalblasthits[$qq], ${original_seq[$r]}, $right_name, $intronLess, $nameLess);
370
        $resultBOJ .= $BOJresult      if ( $bestGenomic ne '' && $BOJresult ne '' && $resultBOJ !~ /_G_$bestGenomic[ \-]/ );
371
#    print " $intronStep $POS->{0} [@intronStatus] {$BOJresult} $bestNucleotide - $bestGenomic \t $BOJ->{0}\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
372
373
374
    }


375
    if ( $resultPOS eq '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
376
377
        buildFailureOutputFiles($r, $failureStatus[1], $failureStatus[0], '')     if ( $failureStatus[0] ne '' );
        buildFailureOutputFiles($r, $multipleequalblasthits[0], 'Failed_Aln', '') if ( $failureStatus[0] eq '' );
378
        next;
Sebastien Moretti's avatar
Sebastien Moretti committed
379
    }
380
381
    elsif ( $resultBOJ eq '' ){
        if ( $failureStatus[0] eq 'PUI_unavailable' || $failureStatus[0] eq 'No_nt_link' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
382
            buildFailureBOJOutputFile($failureStatus[1], $failureStatus[0], $right_name, ${original_seq[$r]});
383
384
        }
        elsif ( $failureStatus[0] eq '' && $intronStatus[0] ne '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
385
            buildIntronlessBOJOutputFile($intronStatus[1], $intronStatus[0], $intronStatus[2], $right_name, ${original_seq[$r]});
386
387
        }
        else{
Sebastien Moretti's avatar
Sebastien Moretti committed
388
            buildFailureBOJOutputFile($multipleequalblasthits[0], 'Unavailable', $right_name, ${original_seq[$r]});
389
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
390
        createCDSOutputFile($resultPOS, ${original_seq[$r]}, $right_name);
Sebastien Moretti's avatar
Sebastien Moretti committed
391
392
    }
    else {
Sebastien Moretti's avatar
Sebastien Moretti committed
393
394
        createCDSOutputFile($resultPOS, ${original_seq[$r]}, $right_name);
        createBOJOutputFile($resultBOJ, ${original_seq[$r]});
Sebastien Moretti's avatar
Sebastien Moretti committed
395
396
397
398
399
400
401
402
403
    }


    undef $original_seq[$r];
    undef $original_names[$r];
    undef $input_order[$r];
}


Sebastien Moretti's avatar
Sebastien Moretti committed
404
405
checkAndCleanStderrFiles("$cache/${date}_ExonerateError") if ( -e "$cache/${date}_ExonerateError" );
checkOutputFiles();
406
407
unlink("$msa") if ( $change==1 );
my @tmpFiles = glob($cache."/${date}*");
Sebastien Moretti's avatar
Sebastien Moretti committed
408
409
if ( exists($tmpFiles[0]) ){
    mkdir("$cache/${date}_TMP");
410
    move("$_", "$cache/${date}_TMP") while(<@tmpFiles>);
Sebastien Moretti's avatar
Sebastien Moretti committed
411
412
413
414
415
416
417
}


#Ouput Checker


#STDOUT Log output for our server
418
if ( -s "${originalMSA}.cds" || -s "${originalMSA}.out"){
Sebastien Moretti's avatar
Sebastien Moretti committed
419
420
    print "\n\nTERMINATION STATUS: SUCCESS\n";
    print "OUTPUT RESULTS\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
421
422
423
424
425
    print "    #### File Type=        MSA Format= fasta_CDS Name= ${originalMSA}.cds\n" 
        if ( -s "${originalMSA}.cds");
    print "    #### File Type=        MSA Format= fasta_CDS+Query Name= ${originalMSA}.cdsP\n" 
        if ( -s "${originalMSA}.cdsP");
    print "    #### File Type=        MSA Format= colored_fasta_CDS Name= ${originalMSA}.cdsP.html\n" 
Sebastien Moretti's avatar
Sebastien Moretti committed
426
        if ( Views::Html("${originalMSA}.cdsP") && -s "${originalMSA}.cdsP.html");
Sebastien Moretti's avatar
Sebastien Moretti committed
427
428
429
430
    print "    #### File Type=        MSA Format= Rejected_seq Name= ${originalMSA}.out\n" 
        if ( -s "${originalMSA}.out");
    print "    #### File Type=        MSA Format= fasta_Exon-Boundaries Name= ${originalMSA}.boj\n" 
        if ( -s "${originalMSA}.boj" && -s "${originalMSA}.cds" && $hideBOJ==0);
Sebastien Moretti's avatar
Sebastien Moretti committed
431
432
}
else{
Sebastien Moretti's avatar
Sebastien Moretti committed
433
    failure();
434
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
435
436
}

437
exit(0);
Sebastien Moretti's avatar
Sebastien Moretti committed
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519


=head1 NAME

ProtoGene.pl - Converts a peptidic alignment to a nucleotidic alignment through database search

=head1 SYNOPSIS

B<ProtoGene.pl> --msa=path_of_the_fasta_msa_file [options]

with these options available:

=over 4

=item I<--orgm>=targeted species in the database search

=item All_organisms [default], Bacteria, Viruses, Vertebrata, Eukaryota, Mammalia, Primates, Homo_sapiens, Gallus_gallus, Bos_taurus, Escherichia_coli, Arabidopsis_thaliana, Mus_musculus, Drosophila_Melanogaster, ...

=item I<--db>=targeted database in the database search

=item nr, pdb, swissprot, refseq_protein [default]

=item I<--local> to execute a local BLAST query with  --db=path_for_a_local_db_blast_formated

=item I<--template> to provide your own nucleotidic sequences following the cds file format

=item I<--revtrans> to reverse-translate sequences with no blast hit, in IUB (IUPAC) depiction code

=item They are removed from the alignement by default

=item I<--pep> to add the original peptide query beneath the back-translated sequence

=item I<--cache>=none, update, use, own_path_directory, old, empty

=item none: no cache usage, none temporary files are stored

=item update: use cache but update old files [default]

=item use: force use cache, whatever the age of files

=item own_path_directory: use my own directory, and its files

=item old: remove the old files in the cache directory

=item empty: empty the whole cache directory $HOME/.ProtoGene/

=item I<--version> to print version information

=item I<--help> to print this help message

=back

=head1 DESCRIPTION

PROTOGENE : Bona-Fide Back Translation of Protein Multiple Sequence Alignments
It converts a peptidic alignment to a nucleotidic alignment through database search.
Blast queries allow to get nucleotidic sequences from where peptidic sequences come from.
Exonerate aligns nucleotidic and peptidic sequences together.
PROTOGENE re-builds the original alignment with nucleotidic information it has gotten.

=head1 REQUIREMENT

=over 4

=item B<Perl 5.6 or better> is required !

=item Standard Perl modules B<lib>, B<strict>,  B<warnings>,  B<diagnostics>, B<Env> are required

=item and some other current ones : B<Getopt::Long>,  B<File::Which>,  B<File::Copy>, B<Mail::Send>

=item -

=item I<exonerate> from http://www.ebi.ac.uk/~guy/exonerate/

=item I<blast> from http://www.ncbi.nlm.nih.gov/BLAST/download.shtml or http://blast.wustl.edu/

=back

=head1 VERSION

=over 8

Sebastien Moretti's avatar
Sebastien Moretti committed
520
=item version 3.4.4
Sebastien Moretti's avatar
Sebastien Moretti committed
521

Sebastien Moretti's avatar
Sebastien Moretti committed
522
=item on Feb 25th, 2010
Sebastien Moretti's avatar
Sebastien Moretti committed
523
524
525
526
527
528
529
530
531

=back

=head1 AUTHORS

=over 8

=item Sebastien MORETTI

Sebastien Moretti's avatar
Sebastien Moretti committed
532
=item moretti.sebastien [AT] gmail.com
Sebastien Moretti's avatar
Sebastien Moretti committed
533
534
535

=item Frederic REINIER

Sebastien Moretti's avatar
Sebastien Moretti committed
536
=item reinier [AT] crs4.it
Sebastien Moretti's avatar
Sebastien Moretti committed
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553

=item Lab. Information Genomique et Structurale - IGS

=item CNRS - Life Sciences

=item Marseille, France

=item http://www.igs.cnrs-mrs.fr/

=back

=cut


######################  Management  ######################

#Failure declaration
Sebastien Moretti's avatar
Sebastien Moretti committed
554
sub failure {
555
556
557
    print {*STDERR} "\n\nFATAL\n\nTERMINATION STATUS: FAILURE\n\n";
    return;
}
Sebastien Moretti's avatar
Sebastien Moretti committed
558
559
560

#Check external programs presence in the PATH
sub checkProgramsPresence{
561
    my ($webblast_exe, $exonerate) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
562

563
    if ( $webblast_exe eq '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
564
        failure();
565
566
        print {*STDERR} "\twebblast.pl program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
        exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
567
    }
568
    if ( $exonerate eq '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
569
        failure();
570
571
        print {*STDERR} "\texonerate program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
        exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
572
573
574
575
576
    }
}

#Check cache directory accessibility: Test and change rights, if needs, of the $HOME/.ProtoGene/ cache directory
sub checkCacheAccessibility{
577
    my ($cachedir) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
578

579
    if ( -d "$cachedir/" && -w "$cachedir/" && -x "$cachedir/" && -r "$cachedir/" ){
Sebastien Moretti's avatar
Sebastien Moretti committed
580
581
    }
    elsif ( -d "$cachedir/" ){
582
583
        if ( -o "$cachedir/" ){
            chmod 0754, "$cachedir/";
584
        }
585
        else{
Sebastien Moretti's avatar
Sebastien Moretti committed
586
            print {*STDERR} "\n\t** Warning ** :\n\tPermissions do NOT seem to be valid for the current cache directory\n\n";
587
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
588
589
    }
    else{
590
591
592
593
594
        mkdir "$cachedir/";
        if ( -o "$cachedir/" ){
            chmod 0754, "$cachedir/"; #0754 est en octal, donc 754 ou '0754' ne sont pas bons !
        }
        else{
Sebastien Moretti's avatar
Sebastien Moretti committed
595
            print {*STDERR} "\n\t** Warning ** :\n\tPermissions do NOT seem to be valid for the current cache directory\n\n";
596
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
597
    }
598
    return;
Sebastien Moretti's avatar
Sebastien Moretti committed
599
600
601
602
}

#Cache Management
sub cacheManagement{
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
    my ($Cache, $inScript) = @_;
    $inScript = 0 if ( !defined $inScript );

    if ( $Cache eq 'empty' ){
        my @temp_files = glob($cachedir.'/*.fas');
        if ( exists ($temp_files[0]) ){
            unlink $_ while(<@temp_files>);
        }
        @temp_files = glob($cachedir.'/*_ExonerateError');
        if ( exists ($temp_files[0]) ){
            unlink $_ while(<@temp_files>);
        }
        unlink("$cachedir/webblast.log", "$cachedir/web_tempo.result", "$cachedir/debug.tempo");
        print "\n\tcache directory [$cachedir] is empty\n\n";
        exit 0;
Sebastien Moretti's avatar
Sebastien Moretti committed
618
    }
619
620
621
622
623
624
625
626
    elsif ( $Cache eq 'old' ){
        my @temp_files = glob($cachedir.'/*.fas');
        my @tempFiles  = grep { -M $_ > $uct } @temp_files; #Sort list only with $uct days old files
        if ( exists ($tempFiles[0]) ){
            unlink $_ while(<@tempFiles>);
        }
        print "\n\tcache directory [$cachedir] is empty of its old files\n\n" if ($inScript==0);
        exit 0 if ( $inScript==0 );
Sebastien Moretti's avatar
Sebastien Moretti committed
627
    }
628
629
    elsif ( $Cache eq 'update' || $Cache eq 'use' ){
        $cache = $cachedir; #Limit cache files to less than $uct days old ones for 'update'
Sebastien Moretti's avatar
Sebastien Moretti committed
630
    }
631
    elsif ( $Cache eq 'none' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
632
633
    }
    elsif ( -d $Cache ){
Sebastien Moretti's avatar
Sebastien Moretti committed
634
635
636
        die "\n\tProtoGene cannot access to your directory\n\n"  if ( ! -x $Cache );
        die "\n\tProtoGene cannot write into your directory\n\n" if ( ! -w $Cache );
        die "\n\tProtoGene cannot read into your directory\n\n"  if ( ! -r $Cache );
637
        $cache = $Cache;
Sebastien Moretti's avatar
Sebastien Moretti committed
638
639
    }
    else {
640
        die "\n\tWrong cache argument, or your cache folder doesn't seem to exist\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
641
642
643
644
645
646
647
648
649
650
651
652
    }

    return($cache);
}

##########################################################


######################    Fasta    ######################

#Check input aln
sub checkFastaFile{
653
    my ($msa, $change) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
654

655
    if ( -e "$msa" ){
Sebastien Moretti's avatar
Sebastien Moretti committed
656
    #Convert no fasta format to fasta with seq_reformat if available
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
        my $seqreformat = which('t_coffeeX') || ''; #desactive tant que t_coffee seq_reformat pose prob
        if ( $seqreformat ne '' ){
            my @formatType = `$seqreformat -other_pg seq_reformat -in $msa -print_format`;
            my $format     = '';
            while(<@formatType>){
                if ( $_ =~ /FORMAT:/ ){
                    $format = $_;
                    chomp($format);
                    last;
                }
            }
            if ( $format ne '' && $format !~ /FORMAT:fasta_/ ){
                system("$seqreformat -other_pg seq_reformat -in $msa -output fasta_aln -out=${msa}.007") if ($format =~ /_aln *$/);
                system("$seqreformat -other_pg seq_reformat -in $msa -output fasta_seq -out=${msa}.007") if ($format =~ /_seq *$/);
                $msa    .= '.007';
                $change  = 1;
            }
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
675
676
    }
    else {
Sebastien Moretti's avatar
Sebastien Moretti committed
677
        failure();
678
679
        print {*STDERR} "\tThe file \'$msa\' doesn't not seem to be reachable\n\n";
        exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
680
681
682
683
684
685
686
    }

    return($msa,$change);
}

#Prepare Fasta Header of the query name to incorporate our annotations
sub fastaHeaders4ProtoGene{
687
    my ($QueryName) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
688
689

    $QueryName =~ s/  +/ /g;
690
691
692
693
    my ($cqacc, $cqdesc) = ($QueryName, $QueryName);
    $cqacc  =~ s/^ *([^ ]*) .*$/$1/;
    $cqacc  =~ s/[\s\t]*$//g;
    $cqacc  =  $cqacc.'_G_@@';
Sebastien Moretti's avatar
Sebastien Moretti committed
694
695
696
697
    $cqdesc =~ s/^ *[^ ]* *(.*)$/$1/;
    $cqdesc =~ s/^[\| \.]*//;
    $cqdesc =~ s/[\|\. ]*$//;

698
    $QueryName = $cqacc.$cqdesc;
Sebastien Moretti's avatar
Sebastien Moretti committed
699
700
701
702
703
    return($QueryName);
}

#Prepare Fasta Header to put it on the fasta header of the result outputs
sub getFastaHeaderAnnot{
704
    my ($bestTarget) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
705
706

    #Get annotation from nucleotide file header
707
708
709
    open(my $BEST, '<', "$cache/${bestTarget}.fas");
    my $annot=<$BEST> || '';
    close $BEST;
Sebastien Moretti's avatar
Sebastien Moretti committed
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
    chomp($annot);
    $annot =~ s/^ *[^ ]* //;
    $annot =~ s/[\. ]*$//;
    $annot =~ s/^ *//;
    $annot =~ s/  +/ /g;

    return($annot);
}

#########################################################


###################### Translation ######################

#Direct Reverse translation
sub reverse_trad{
726
    my ($aa) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
727
728
729
730
731
732
733

    my %reverse_code=('A' => 'GCN', 'F' => 'TTy', 'K' => 'AAr', 'P' => 'CCN', 'T' => 'ACN',
                      'C' => 'TGy', 'G' => 'GGN', 'L' => 'yTN', 'Q' => 'CAr', 'V' => 'GTN',
                      'D' => 'GAy', 'H' => 'CAy', 'M' => 'ATG', 'R' => 'mGN', 'W' => 'TGG',
                      'E' => 'GAr', 'I' => 'ATh', 'N' => 'AAy', 'S' => 'wsN', 'Y' => 'TAy',
                      'X' => 'NNN', '-' => '---', '*' => 'Trr'); # '*' is for stop codon

734
735
736
737
738
739
    my $triplet = 'nnn';
    while( my ($x,$y) =each(%reverse_code) ){
        if ( uc($aa) eq $x ){
            $triplet = $y;
            last;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
740
741
742
743
744
745
746
747
748
749
    }

    return($triplet);
}

#########################################################


####################### webblast #######################
sub launchWebblast{
750
751
    my ($infile, $quiet, $whatBlast, $db, $species, $blast_exe, $outfile) = @_;
    my $blast_at = '';
Sebastien Moretti's avatar
Sebastien Moretti committed
752

753
754
755
    $blast_at    = "-database=$db"                         if ( $whatBlast==0 );
    $blast_at    = "-database=$db -blast_dir=${blast_exe}" if ( $whatBlast==1 );
    $blast_at    = "-database=$db -gigablast=yes"          if ( $whatBlast==2 );
Sebastien Moretti's avatar
Sebastien Moretti committed
756
757
758
759
760
761
762
763
764
765
766
767
768

#    print "             Evalue treshold : 0.05
#             Matrix : BLOSUM62
#             Filter : F
#             **********************
#             For RefSeq Protein db:
#             Blast_identity_threshold : 100
#             Cover threshold : 100
#             **********************
#             For NR Protein db:
#             Blast_identity_threshold : 95
#             Cover threshold : 95";

769
    system("$webblast_exe -program=blastp ${blast_at} -infile=$infile -matrix=BLOSUM62 -evalue=0.05 -method=geneid -filter=Off -organism=$species -identity=100 -cover=100 -hits=10 $quiet -outfile=$outfile");
Sebastien Moretti's avatar
Sebastien Moretti committed
770
771


772
773
774
775
776
777
778
    if ( -z "$outfile" ){ #Another blastP query if no hit with refseq and db was not nr !
        $quiet = '-quiet=on';
        print {*STDERR} "run BLAST.. against NR because there was no acceptable hit against $db\n"
            if ( $whatBlast != 1 && $db ne 'nr' );
        print {*STDERR} "run BLAST.. again, with decreased thresholds, because there was no acceptable hit against $db\n"
            if ( $whatBlast == 1 || $db eq 'nr' );
        $blast_at =~ s/-database=[^\s]*/-database=nr/ if ( $whatBlast != 1 );
779
        system("$webblast_exe -program=blastp ${blast_at} -infile=$infile -matrix=BLOSUM62 -evalue=0.05 -method=geneid -filter=Off -organism=$species -identity=95 -cover=95 -hits=10 $quiet -outfile=$outfile");
Sebastien Moretti's avatar
Sebastien Moretti committed
780
781
    }

782
783
784
785
786
787
788
    move('webblast.log',     "$cache"); #Move temporary webblast.pl files
    move('web_tempo.result', "$cache");
    move('debug.tempo',      "$cache");

    unlink("$cache/webblast.log")     if ( -z "$cache/webblast.log" );
    unlink("$cache/web_tempo.result") if ( -z "$cache/web_tempo.result" );
    unlink("$cache/debug.tempo")      if ( -z "$cache/debug.tempo" );
Sebastien Moretti's avatar
Sebastien Moretti committed
789

790
    return;
Sebastien Moretti's avatar
Sebastien Moretti committed
791
792
793
794
795
796
797
798
799
800
}


########################################################


##################### NCBI requests #####################

#Prot ACC -> PUID
sub blastPAcc2PGI{
801
    my ($blastHit) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
802

803
804
805
    my $protGI = '';
    my $count  = 0;
    GET_PUI:
Sebastien Moretti's avatar
Sebastien Moretti committed
806
    for(my $rep=0;$rep <= 4; $rep++){
807
        $count++;
808
809
#        system("wget -q -O $cache/${date}_${blastHit}protGi.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=${blastHit}[pacc]&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
        system("wget -q -O $cache/${date}_${blastHit}protGi.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=${blastHit}&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
810
811
812
        open(my $GIP, '<', "$cache/${date}_${blastHit}protGi.tmp");
        PROT_GI:
        while(<$GIP>){
813
            if ( $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
814
815
816
817
818
819
820
                $protGI = $1;
                last PROT_GI; #OK if only one PUID
            }
        }
        close $GIP;
        $rep = $rep-15 if ( -z "$cache/${date}_${blastHit}protGi.tmp" && $count==1 );
        last GET_PUI if ( $protGI =~ /^\d+$/ );
Sebastien Moretti's avatar
Sebastien Moretti committed
821
    }
822
    unlink("$cache/${date}_${blastHit}protGi.tmp") if ( $tmp == 0 || $protGI ne '' );
Sebastien Moretti's avatar
Sebastien Moretti committed
823
824
825
826
827
828

    return($protGI);
}

#PUID -> nt GIs
sub protGI2NTGIs{
829
    my ($protGI, $blastHit) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
830

831
832
833
834
    my $ntGIs  = '';
    my $geneID = '';
    my $count  = 0;
    GET_NTUI:
Sebastien Moretti's avatar
Sebastien Moretti committed
835
    for(my $rep=0;$rep <= 4; $rep++){
836
        $count++;
Sebastien Moretti's avatar
Sebastien Moretti committed
837
        #nt GIs
838
        system("wget -q -O $cache/${date}_${blastHit}nucleo.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,nucleotide,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
839
840
841
842
843
844
845
846
847
848
849
        open(my $GIN, '<', "$cache/${date}_${blastHit}nucleo.tmp");
        my $flag = 0;
        NT_GI:
        while(<$GIN>){
            if ( $_   =~ /<LinkName>protein_nuc[a-z]+<\/LinkName>/ ){ #for nuccleotide and nuccore
                $flag = 1;
            }
            elsif ( $_ =~ /\<LinkName\>protein_gene\<\/LinkName\>/ ){
                $flag  = 2;
            }
            elsif ( $flag==1 && $_ =~ /^.*\<Id\>(\d+)\<\/Id\>.*$/ ){
850
851
852
                my $match = $1;
                $ntGIs  .= ",$match" if ( $ntGIs  ne '' && $ntGIs  !~ /,$match,/ && $ntGIs  !~ /^$match,/ );
                $ntGIs   = $match    if ( $ntGIs  eq '' );
853
854
            }
            elsif ( $flag==2 && $_ =~ /^.*\<Id\>(\d+)\<\/Id\>.*$/ ){
855
856
857
                my $match = $1;
                $geneID .= ",$match" if ( $geneID ne '' && $geneID !~ /,$match,/ && $geneID !~ /^$match,/ );
                $geneID  = $match    if ( $geneID eq '' );
858
859
860
861
862
            }
        }
        close $GIN;
        $rep = $rep-15 if ( -z "$cache/${date}_${blastHit}nucleo.tmp" && $count==1 );
        last GET_NTUI if ( $ntGIs ne '' || $geneID ne '' );
Sebastien Moretti's avatar
Sebastien Moretti committed
863
    }
864
    unlink("$cache/${date}_${blastHit}nucleo.tmp") if ( $tmp==0 || ($ntGIs ne '' || $geneID ne '') );
Sebastien Moretti's avatar
Sebastien Moretti committed
865

866
    return($ntGIs, $geneID);
Sebastien Moretti's avatar
Sebastien Moretti committed
867
868
869
}

sub geneID2Chr{
870
    my ($geneID, $blastHit) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
871

872
873
874
875
    my $chr   = '';
    my ($amont, $aval) = ('', '');
    my $count = 0;
    GET_CHR:
Sebastien Moretti's avatar
Sebastien Moretti committed
876
    for(my $rep=0;$rep <= 8; $rep++){
877
        $count++;
Sebastien Moretti's avatar
Sebastien Moretti committed
878
        #Gene Acc
879
880
#        system("wget -q -O $cache/${date}_${blastHit}gene.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
        system("wget -q -O $cache/${date}_${blastHit}gene.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
881
882
883
884
885
886
887
        open(my $GACC, '<', "$cache/${date}_${blastHit}gene.tmp");
        my $flag = 0;
        GENE_CHR:
        while(<$GACC>){
            if ( $_  =~ /\<ERROR\>Empty id list \- nothing todo\<\/ERROR\>/ && $flag==0 ){
                $rep = $rep-15 if ( $count==1 );
            }
888
889
890
#            if ( $_   =~ /\<Entrezgene_locus\>/ && $flag==0 ){
            if ( $_   =~ /<Item Name="GenomicInfoType" Type="Structure">/ && $flag==0 ){
               $flag = 1;
891
            }
892
893
894
895
896
897
898
#            elsif ( $_ =~ /\<Gene-commentary_type value=\"genomic\"\>/ && $flag==1 ){
#                $flag  = 2;
#            }
#            elsif ( $_ =~ /\<Gene-commentary_type value=/ && $flag==2 ){
#                last GENE_CHR;
#            }
#            elsif ( $_ =~ /\<Gene-commentary_accession\>([\w\_\-\.]+)\<\/Gene-commentary_accession\>/ && $flag==2 ){
Sebastien Moretti's avatar
Sebastien Moretti committed
899
            elsif ( $_ =~ /<Item Name="ChrAccVer" Type="String">([^<]+?)\.?\d*<\/Item>/ && $flag==1 ){
900
901
902
                $chr   = $1;
                $flag  = 3;
            }
903
904
#            elsif ( $_ =~ /\<Seq-interval_from\>(\d+)\<\/Seq-interval_from\>/ && $flag==3 ){
            elsif ( $_ =~ /<Item Name="ChrStart" Type="Integer">(\d+)<\/Item>/ && $flag==3 ){
905
906
907
                $amont = $1;
                $flag  = 4;
            }
908
909
#            elsif ( $_ =~ /\<Seq-interval_to\>(\d+)\<\/Seq-interval_to\>/ && $flag==4 ){
            elsif ( $_ =~ /<Item Name="ChrStop" Type="Integer">(\d+)<\/Item>/ && $flag==4 ){
910
911
912
913
914
915
916
                $aval  = $1;
                last GENE_CHR;
            }
        }
        close $GACC;
        $rep = $rep-15 if ( -z "$cache/${date}_${blastHit}gene.tmp" && $count==1 );
        last GET_CHR   if ( $chr ne '' && $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
Sebastien Moretti's avatar
Sebastien Moretti committed
917
    }
918
919
920
921
    unlink("$cache/${date}_${blastHit}gene.tmp") if ( $tmp == 0 || $chr ne '' );
    if ( $amont eq '' || $aval eq '' ){
        $amont = '';
        $aval = '';
Sebastien Moretti's avatar
Sebastien Moretti committed
922
    }
923
924
925
926
927
928
929
930
931
    elsif ( $amont ne '' && $aval ne '' ){
        if ( $amont > $aval ){
            $amont = $amont+5000;
            $aval  = $aval-5000;
        }
        else{
            $amont = $amont-5000;
            $aval  = $aval+5000;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
932
    }
933
934
    $amont = 1 if ( $amont ne '' && $amont <= 0 );
    $aval  = 1 if ( $aval ne ''  && $aval <= 0 );
Sebastien Moretti's avatar
Sebastien Moretti committed
935

936
    return($chr, $amont, $aval);
Sebastien Moretti's avatar
Sebastien Moretti committed
937
938
939
}

sub downloadSeqFromGIs{
940
    my ($cache, $date, $amont, $aval, @acc) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
941

942
    GET_SEQ:
Sebastien Moretti's avatar
Sebastien Moretti committed
943
    for(my $a=0; $a<=$#acc; $a++){
944
945
946
947
        my $whatNumber = 0;
        if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
            GET_CHR_SEQ:
            for(my $rep=0;$rep <= 4; $rep++){
948
                system("wget -q -O $cache/${acc[$a]}-${amont}.fas 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${acc[$a]}&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch'") if ( !-e "$cache/${acc[$a]}-${amont}.fas" || -z "$cache/${acc[$a]}-${amont}.fas" || $Cache eq 'none' || ($Cache eq 'update' && -M "$cache/${acc[$a]}-${amont}.fas" > $uct ) );
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
                open(my $CIBLE, '<', "$cache/${acc[$a]}-${amont}.fas");
                my $counter = 0;
                my $lines   = 0;
                CHECK_CHR_SEQ:
                while(<$CIBLE>){
                    $lines++              if ( $_ !~ /^>/ );
                    $counter = $counter+2 if ( $counter==1 && $_ !~ /^\w/ && $lines==1 && $_ !~ /^>/ );
                    $counter++            if ( $_ =~ /^>/ );
                    $counter = $counter+2 if ( $_ !~ /^>/ && ($_ =~ /Error:/ || $_ =~ /[<>]/) );
#                   last CHECK_CHR_SEQ    if ( $_ !~ /^>/ && $counter==1 && $_ =~ /^\w/ );
                }
                close $CIBLE;
                $whatNumber++;
                last GET_CHR_SEQ if ( $whatNumber==20 );
                if ( $counter != 1 ){
                    $rep = $rep-1;
                    unlink("$cache/${acc[$a]}-${amont}.fas");
                }
            }
        }
        else{
            GET_OTHER_SEQ:
            for(my $rep=0;$rep <= 4; $rep++){
972
                system("wget -q -O $cache/${acc[$a]}.fas 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${acc[$a]}&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'") if ( !-e "$cache/${acc[$a]}.fas" || -z "$cache/${acc[$a]}.fas" || $Cache eq 'none' || ($Cache eq 'update' && -M "$cache/${acc[$a]}.fas" > $uct ) );
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
                open(my $CIBLE, '<', "$cache/${acc[$a]}.fas");
                my $counter = 0;
                my $lines   = 0;
                CHECK_OTHER_SEQ:
                while(<$CIBLE>){
                    $lines++              if ( $_ !~ /^>/ );
                    $counter = $counter+2 if ( $counter==1 && $_ !~ /^\w/ && $lines==1 && $_ !~ /^>/ );
                    $counter++            if ( $_ =~ /^>/ );
                    $counter = $counter+2 if ( $_ !~ /^>/ && ($_ =~ /Error:/ || $_ =~ /[<>]/) );
                }
                close $CIBLE;
                $whatNumber++;
                last GET_OTHER_SEQ if ( $whatNumber==20 );
                if ( $counter != 1 ){
                    $rep=$rep-1;
                    unlink("$cache/${acc[$a]}.fas");
                }
            }
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
992
    }
993
994

    return;
Sebastien Moretti's avatar
Sebastien Moretti committed
995
996
997
}

sub downloadSeq{
998
    my ($cache, $date, $amont, $aval, @acc) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
999

1000
    my $cp   = 0;