ProtoGene.pl 64.6 KB
Newer Older
Sebastien Moretti's avatar
Sebastien Moretti committed
1
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env perl

########################################################
#                                                      #
# ProtoGene : Bona-Fide Back Translation of Protein    #
#             Multiple Sequence Alignments             #
#                                                      #
########################################################

use strict;
use warnings;
use diagnostics;
Sebastien Moretti's avatar
Sebastien Moretti committed
13
14
use Carp;

15
16
17
use Time::localtime;                       # Use localtime+PID for a pseudo-uniq temp file name
use Getopt::Long;                          # Options specifications

Sebastien Moretti's avatar
Sebastien Moretti committed
18
19
use Env qw(HOME);                          # Use only environmental (shell) HOME variable
use File::Copy qw(move);                   # Avoid external 'mv' command usage
20
use LWP::Simple;                           # To test gigablaster availability
Sebastien Moretti's avatar
Sebastien Moretti committed
21
22
23
24

use File::Which qw(which);                 # Locate external executable programs in the PATH
use Mail::Send;                            # Send warnings and errors files by e-mail ==> only if the $userEMail variable is defined

25
26
use lib '/mnt/common/share/ProtoGene/';    # Local path for ProtoGene's own perl modules
###use lib '/mnt/local/lib/tcoffee_perl/';
Sebastien Moretti's avatar
Sebastien Moretti committed
27
use Exonerate;                             # Exonerate runner, parser, ...
28
use Views;                                 # Non-text outputs, e.g. HTML/CSS
29
#use CheckOutput;                         # Check output for cds consistancy with query
Sebastien Moretti's avatar
Sebastien Moretti committed
30
31
32



33
34
35
36
37
38
39
40
################## CONFIGURATION ##################
$ENV{'PATH'}        .= ':/mnt/local/bin/';                                  # Additional path for executables

my $cachePath        = '/scratch/cluster/monthly/t_coffee/ProtoGene_Cache'; # Cache directory
my $cacheStorageTime = 15;                                                  # Do not update sequences younger than X days

my $userEMail        = 'moretti.sebastien@gmail.com';                       # To receive e-mails with encountered problems; leave blank to inactive
###################################################
Sebastien Moretti's avatar
Sebastien Moretti committed
41
42


43
my $VERSION  = '4.0.0';
Sebastien Moretti's avatar
Sebastien Moretti committed
44

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
my $webblast_exe  = which('webblast.pl')   || '';
my $blast_prog    = 'blastall';                         # Or wu-blastall for Wu-BLAST; for local blast usage
my $blast_exe     = which($blast_prog)     || '';
my $exonerate_exe = which('exonerate-1.0') || '';       # Exonerate 1.0 because current parser only works with this version


################## Option management
my ($msa, $revtrans, $pep, $hideBOJ, $run_name, $template, $lim, $cache) = ('', 0, 0, 0, '', '', 0, 'update');
my ($debug, $tmp)                                                        = (0, 0);
my ($db, $species, $local, $giga)                                        = ('refseq_protein', 'All_organisms', 0, 0);
GetOptions('msa|in=s'        => \$msa,        # Input sequences
           'revtrans'        => \$revtrans,   # Use to reverse-translate sequences with no match
           'pep'             => \$pep,        # Add the original peptide query beneath the related CDS seq
           'hideBOJ'         => \$hideBOJ,    # Hide BOJ output
           'run_name=s'      => \$run_name,   # Use another name, instead of input seq name, for result files
           'template=s'      => \$template,   # Use a template file
           'lim=i'           => \$lim,        # Limit number of input query sequences
           'cache=s'         => \$cache,      # Cache behavior

           'orgm|species=s'  => \$species,    # Organism(s) to blast against
           'db|database=s'   => \$db,         # Database to blast against
           'local'           => \$local,      # Use to specify a local db query, definied in $local_db
           'giga'            => \$giga,       # Use GigaBlaster server

           'version|V'       => sub { print "\n\tPROTOGENE : Bona-Fide Back Translation of Protein Multiple Sequence Alignments\n\tversion   : $VERSION\n\n";
                                      exit 0;
                                },            # Print version information
           'help|H|?'        => sub { system('perldoc', "$0")==0 || print {*STDERR} "\n\tCannot open the documentation with 'perldoc'\n\n";
                                      exit 0;
                                    },        # Print full help message
           'debug'           => \$debug,      # Verbose output
           'tmp'             => \$tmp,        # To keep traces of fake intermediate files like fake xml from NCBI, fake aln, ...
Sebastien Moretti's avatar
Sebastien Moretti committed
77
           );
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105

################## Short help message
if ( $msa eq '' || $species eq '' || $db eq '' || $cache eq '' ){
    print {*STDERR} "\n\tCannot open the MSA file in FASTA format
\tTry:  $0 --msa=path_of_the_fasta_msa_file [Options]

\tOptions: --orgm=All_organisms, Bacteria, Viruses, Vertebrata,
\t                Eukaryota, Mammalia, Primates, Homo_sapiens,
\t                Gallus_gallus, Bos_taurus, Escherichia_coli,
\t                Arabidopsis_thaliana, Mus_musculus,
\t                Drosophila_Melanogaster, ...
\t                default is 'All_organisms'
\t         --db=nr, pdb, swissprot, refseq_protein
\t                default is 'refseq_protein'
\t         --local to execute a local BLAST query with
\t                --db=path_for_a_local_db_blast_formated\n
\t         --template to provide your own nucleotidic sequences
\t                following the cds file format
\t         --revtrans to reverse-translate sequences with no
\t                blast hit, in IUB (IUPAC) depiction code
\t                They are removed from the alignement by default
\t         --pep  to add the original peptide query beneath the
\t                back-translated sequence
\t         --cache=none, update, use, 'own_PATH_directory', old, empty
\t                to select the cache mode
\t                default is 'update'\n
\t         --version to print version information
\t         --help    to print a full help message\n\n";
106
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
107
}
108
109
110
111
112
113
114
115
116
117
118
119



checkProgramsPresence($webblast_exe, $exonerate_exe); # Check external programs presence in the PATH
$blast_exe =~ s/${blast_prog}$//;

checkCacheAccessibility($cachePath); # Check cache directory accessibility



$local = 2 if ( $giga==1 );
if ( $blast_exe eq '' ){
120
    print {*STDERR} "\n\t$blast_prog program is not reachable\n\tIt could not be in your PATH or not installed\n\tIt is required only for local blast searches\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
121
122
123
}

#Cache management
124
$cache = cacheManagement($cache, $cachePath) || '.';
Sebastien Moretti's avatar
Sebastien Moretti committed
125
126
127


#Open and check template file
128
my ($templG, $templP, $templSeq);
Sebastien Moretti's avatar
Sebastien Moretti committed
129
($templG, $templP, $templSeq) = checkTemplate($template) if ($template);
Sebastien Moretti's avatar
Sebastien Moretti committed
130
131
132


#Short help message
133
if ( $msa eq '' || $species eq '' || $db eq '' || $cache eq '' ){
134
    my $appli = basename($0);
135
    print {*STDERR} "\n\tCannot open the MSA file in FASTA format
Sebastien Moretti's avatar
Sebastien Moretti committed
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
\tTry:  $appli --msa=path_of_the_fasta_msa_file [Options]

\tOptions: --orgm=All_organisms, Bacteria, Viruses, Vertebrata,
\t                Eukaryota, Mammalia, Primates, Homo_sapiens,
\t                Gallus_gallus, Bos_taurus, Escherichia_coli,
\t                Arabidopsis_thaliana, Mus_musculus,
\t                Drosophila_Melanogaster, ...
\t                default is 'All_organisms'
\t         --db=nr, pdb, swissprot, refseq_protein
\t                default is 'refseq_protein'
\t         --local to execute a local BLAST query with
\t                --db=path_for_a_local_db_blast_formated
\t         --template to provide your own nucleotidic sequences
\t                following the cds file format
\t         --revtrans to reverse-translate sequences with no
\t                blast hit, in IUB (IUPAC) depiction code
\t                They are removed from the alignement by default
\t         --pep  to add the original peptide query beneath the
\t                back-translated sequence
\t         --cache=none, update, use, own_PATH_directory, old, empty
\t                to select the cache mode
\t                default is 'update'
\t         --version to print version information
\t         --help to print a full help message\n\n";
160
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
161
162
163
164
165
}



##Open and Check the fasta file
166
167
my $originalMSA   = $msa;
my $change        = 0;
Sebastien Moretti's avatar
Sebastien Moretti committed
168
($msa, $change)   = checkFastaFile($msa, $change);
169
my $fasta_checker = -1;
170
my (@input_order, @original_names, @original_seq); #Use 3 lists in parallel : original_names, seq, order(1->n)
171
open( my $MSA, '<', "$msa" ) if ( -e "$msa" );
172
173
while(<$MSA>){
    if ( $_ =~ /^>/ ){
174
175
176
177
178
179
180
        $fasta_checker++;
        my $name = $_;
        $name    =~ s/\r\n//g; #Remove return lines from windows OS '^M'
        $name    =~ s/^>[ \t]+/>/;
        chomp($name);
        @original_names = (@original_names, $name);
        @input_order    = (@input_order, $fasta_checker);
Sebastien Moretti's avatar
Sebastien Moretti committed
181
    }
182
    elsif ( $_ !~ /^>/ ){
183
184
185
        my $seqq = $_;
        $seqq    =~ s/\r\n//g;
        $seqq    =~ s/\./-/g; #for msa with '.' as gap
186
        $seqq    =~ s{[BJOUZ]}{X}ig; #U here for selenocystein
187
188
        $seqq    =~ s/[^A-Za-z\-\*\n\r]//g; #Remove all the non-gap or non-alphabetic characters from the seq
        chomp($seqq);
Sebastien Moretti's avatar
Sebastien Moretti committed
189
        #fasta sequence on 1 line
190
        if ( !exists($original_seq[$fasta_checker]) ){
191
192
193
194
195
            @original_seq = (@original_seq, $seqq);
        }
        else {
            $original_seq[$fasta_checker] = $original_seq[$fasta_checker].$seqq;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
196
197
    }
}
198
close $MSA;
199
if ( $fasta_checker == -1 ){
Sebastien Moretti's avatar
Sebastien Moretti committed
200
    failure();
Sebastien Moretti's avatar
Sebastien Moretti committed
201
    print {*STDERR} "\tThe MSA file does NOT seem to be a protein FASTA format\n\tSee <a href='http://en.wikipedia.org/wiki/FASTA_format' target='_blank'>FASTA format</a>\n\n";
202
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
203
}
204
elsif ( $lim >0 && $fasta_checker > $lim ){
Sebastien Moretti's avatar
Sebastien Moretti committed
205
    failure();
206
    print {*STDERR} "\tThe FASTA file is too large, try with less than $lim sequences\n\tor split your file\n\n";
207
208
    exit(1);
}
209
elsif ( exists( $original_seq[0] ) && $original_seq[0] =~ /[acgtu]/i ){
210
    my $first_seq = $original_seq[0];
211
    #Check if sequences are amino acids and not nucleotides
212
213
214
    my $a   = ($first_seq =~ s/[aA]//g)   || 0;
    my $c   = ($first_seq =~ s/[cC]//g)   || 0;
    my $g   = ($first_seq =~ s/[gG]//g)   || 0;
215
216
    my $t   = ($first_seq =~ s/[tTuU]//g) || 0;
    my $non = ($first_seq =~ s/[^aAcCgGtTuUXxNn-]//g) || 0;
217
    if ( ($a+$c+$g+$t) >= (($a+$c+$g+$t+$non)*80/100) ){
Sebastien Moretti's avatar
Sebastien Moretti committed
218
        failure();
219
        print {*STDERR} "\tYour sequences seem already to be nucleotides\n\tthis program purpose is to turn AMINO ACID alignments into CDS nucleotide alignments\n\n";
220
221
        exit(1);
    }
Sebastien Moretti's avatar
Sebastien Moretti committed
222
223
224
225
}
undef $fasta_checker;


226
227
228
229
230
#Check GigaBlaster status if used
if ( $giga==1 ){
    if ( head('http://www.igs.cnrs-mrs.fr/Giga2/~database/remoteblast.cgi') ){
    }
    else{
231
        $local = 0;
232
233
234
    }
}

Sebastien Moretti's avatar
Sebastien Moretti committed
235

236
237
my $date = sprintf("%d-%02d-%02d_%02d%02d", localtime->year() + 1900, localtime->mon(), localtime->mday(), localtime->hour, localtime->min)."_$$";

Sebastien Moretti's avatar
Sebastien Moretti committed
238
#Start main program with version # of programs and list original queries
239
print "\n\t   Protogene\t$VERSION\t$date\n\n";
240
open(my $EXONERATEISHERE, "$exonerate_exe --version |");
241
my $IsExonerateHere = <$EXONERATEISHERE>;
242
close $EXONERATEISHERE;
Sebastien Moretti's avatar
Sebastien Moretti committed
243
244
print "\t   $IsExonerateHere\n";
undef $IsExonerateHere;
245
undef $VERSION;
Sebastien Moretti's avatar
Sebastien Moretti committed
246
247
248
for(my $m=0; $m<=$#original_names; $m++){
    print "\t$original_names[$m]\n";
}
249
$date .= "_$$"; #Add PID
Sebastien Moretti's avatar
Sebastien Moretti committed
250
251
252
253



# run_name implementation for web server
254
$originalMSA = $run_name if ( $run_name ne '' );
Sebastien Moretti's avatar
Sebastien Moretti committed
255
256
257

unlink("${originalMSA}.cds");
unlink("${originalMSA}.cdsP");
Sebastien Moretti's avatar
Sebastien Moretti committed
258
unlink("${originalMSA}.cdsP.html");
Sebastien Moretti's avatar
Sebastien Moretti committed
259
260
261
262
263
264
unlink("${originalMSA}.out");
unlink("${originalMSA}.boj");



# Remove old files from the cache directory
265
cacheManagement('old', $cachePath, '1'); #everytime ProtoGene is running
Sebastien Moretti's avatar
Sebastien Moretti committed
266
267
268
269
270




##Build the sequences, from the alignment, to perform webblast
271
272
my $quiet = '';
EACH_SEQ:
Sebastien Moretti's avatar
Sebastien Moretti committed
273
for(my $r=0; $r<=$#input_order; $r++){
274
275
276
277
    my $right_name = $original_names[$r];
    my $templ_name = $original_names[$r];
    $right_name    =~ s/^>//;
    $templ_name    =~ s/^([^\s\t]+).*$/$1/;
Sebastien Moretti's avatar
Sebastien Moretti committed
278

279
    open(my $READY2BLAST, '>', "$cache/${date}_seq2blast.fas${r}");
280
281
    my $noGap = $original_seq[$r];
    $noGap    =~ s/[\.-]//g; #remove gaps
282
283
    print {$READY2BLAST} ">$input_order[$r]\n$noGap\n";
    close $READY2BLAST;
Sebastien Moretti's avatar
Sebastien Moretti committed
284
285
286
287
    undef $noGap;


    my @multipleequalblasthits;
288
289
290
    if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' ){
        @multipleequalblasthits = (@multipleequalblasthits, 'My_own_seq'); #To change for ?
        print "\n\n$r done\n\nNucleotidic template for $r:\t$templG->{$templ_name}\n\n**********************************************************************\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
291
    }
292
293
294
    elsif ( exists($templP->{$templ_name}) && $templP->{$templ_name} ne '' ){
        @multipleequalblasthits = (@multipleequalblasthits, $templP->{$templ_name});
        print "\n\n$r done\n\nBlastP template for $r:\t$templP->{$templ_name}\n\n**********************************************************************\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
295
296
    }
    else{
297
    ##   --> send them to webblast
298
        $quiet = '-quiet=on ' if ( $r>0 ); #Show webblast parameters only for the first webblast run
299
        launchWebblast("$cache/${date}_seq2blast.fas${r}", $quiet, $local, $db, $species, $blast_exe, "$cache/${date}.blastp${r}");
Sebastien Moretti's avatar
Sebastien Moretti committed
300

301
    ##Get the BLASTp hit(s)  acc number
302
303
304
305
306
307
308
309
        open(my $BLASTP, '<', "${cache}/${date}.blastp${r}");
        while(<$BLASTP>){
            @multipleequalblasthits = (@multipleequalblasthits, $1) if ( $_ =~ /^>\s*\d+@?\w?\w?\w?__([^\s\|\.]+).*$/ ); #Warning: double '_'
        }
        close $BLASTP;
        unlink("$cache/${date}.blastp${r}", "${cache}/${date}_seq2blast.fas${r}") if ( $tmp == 0 || exists($multipleequalblasthits[0]) );
        if ( !exists($multipleequalblasthits[0]) ){
            print "\n$input_order[$r] ...\tNo blast result found above filter thresholds for $right_name\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
310
            buildFailureOutputFiles($r, 'No_BLASTp_Result', 'Unavailable', '');
311
312
313
314
315
            undef $original_seq[$r];
            undef $original_names[$r];
            undef $input_order[$r];
            next EACH_SEQ;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
316
317
318
319
320
    }



    #When multiple equal blast hits  ==> use, and add, every hits
321
322
323
    my ($resultPOS, $resultBOJ) = ('', '');
    my @failureStatus = ('');
    my @intronStatus  = ('');
Sebastien Moretti's avatar
Sebastien Moretti committed
324
    for(my $qq=0; $qq<=$#multipleequalblasthits; $qq++){
325
326
327
328
329
330
        print "\n${right_name}  -->  [${multipleequalblasthits[$qq]}]\n";


        my @nt_GIs;
        my $intronStep = 0;
        if ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' && $templG->{$templ_name} !~ /^My_Seq$/i ){
Sebastien Moretti's avatar
Sebastien Moretti committed
331
            downloadSeq($cache, $date, '', '', $templG->{$templ_name});
332
333
334
335
336
337
338
339
340
341
            @nt_GIs = $templG->{$templ_name};
        }
        elsif ( exists($templG->{$templ_name}) && $templG->{$templ_name} ne '' && $templG->{$templ_name} =~ /^My_Seq$/i ){
            open(my $MYSEQ, '>', "$cache/My_Seq-$$.fas");
            print {$MYSEQ} ">My_Seq-$$ for $templ_name\n".$templSeq->{$templ_name}."\n";
            close $MYSEQ;
            @nt_GIs = "My_Seq-$$";
        }
        else{
        #Prot ACC -> PUID
Sebastien Moretti's avatar
Sebastien Moretti committed
342
            my $protGI = blastPAcc2PGI($multipleequalblasthits[$qq]);
343
344
345
346
347
            if ( $protGI eq '' ){
                print "\tNo protein (prot GI) link found for ${multipleequalblasthits[$qq]} in $right_name\n\n";
                @failureStatus = ('PUI_unavailable', ${multipleequalblasthits[$qq]});
                next;
            }
Sebastien Moretti's avatar
Sebastien Moretti committed
348
349
#           print "\t$protGI\n";

350
        #PUID -> nt GIs & GeneID
Sebastien Moretti's avatar
Sebastien Moretti committed
351
            my ($ntGIs, $geneID) = protGI2NTGIs($protGI, $multipleequalblasthits[$qq]);
352
353
354
355
356
357
358
359
360
361
            if ( $ntGIs eq '' && $geneID eq '' ){
                print "\tNo nucleotide (nt GIs) link found for ${multipleequalblasthits[$qq]} in $right_name\n\n";
                @failureStatus = ('No_nt_link', ${multipleequalblasthits[$qq]});
                next;
            }
            print " linked with [$ntGIs $geneID]\n";


        #Get transcript and contig seq
            @nt_GIs = split(/,/, $ntGIs);
Sebastien Moretti's avatar
Sebastien Moretti committed
362
            downloadSeqFromGIs($cache, $date, '', '', @nt_GIs);
363
364
365
366
        #GeneID -> Chr
            my @geneIDs = split(/,/, $geneID);
            while(<@geneIDs>){
                my $geneID = $_;
Sebastien Moretti's avatar
Sebastien Moretti committed
367
                my ($chr, $amont, $aval) = geneID2Chr($geneID, $multipleequalblasthits[$qq]);
368
369
370
371
                print "\n\tNo gene locus found for ${multipleequalblasthits[$qq]} with $geneID in $right_name\n\n" if ($chr eq '' and $geneID ne '');

                if ( $chr ne '' ){
            #Get Chr seq
Sebastien Moretti's avatar
Sebastien Moretti committed
372
                    downloadSeq($cache, $date, $amont, $aval, $chr);
373
374
375
376
377
378
379
380
381
                    $intronStep = 1;

                    @nt_GIs = ("$chr:$amont-$aval", @nt_GIs);
                }
            }
        }


    #Align Nt seq with our protein query seq
382
        my ($POS, $BOJ) = runExonerate($exonerate_exe, $cache, $date, ${input_order[$r]}, ${original_seq[$r]}, $right_name, $multipleequalblasthits[$qq], $r, @nt_GIs);
383

Sebastien Moretti's avatar
Sebastien Moretti committed
384
        my ($POSresult, $bestNucleotide) = prepareResults4CDS($POS, $multipleequalblasthits[$qq], ${original_seq[$r]}, $right_name);
385
386
387
388
389
390
391
392
        $resultPOS .= $POSresult if ( $bestNucleotide ne '' && $POSresult ne '' && $resultPOS !~ /_G_$bestNucleotide/ );

        @intronStatus = ($POS->{0}, ${multipleequalblasthits[$qq]}, '')              if ( $intronStep==1 && exists($POS->{0}) && $POS->{0} ne '' && $POS->{0} =~ /\:/ );
        @intronStatus = ($POS->{0}, ${multipleequalblasthits[$qq]}, $bestNucleotide) if ( $intronStep==0 && ($bestNucleotide =~ /^N[CTWZG]_/ || $bestNucleotide =~ /^AC_/) );

        my ($intronLess, $nameLess) = ('', '');
        $intronLess = $POS->{0}       if ( %$BOJ eq 0 && exists($POS->{0}) && $POS->{0} ne '' );
        $nameLess   = $bestNucleotide if ( %$BOJ eq 0 && $intronStep==0 && ($bestNucleotide =~ /^N[CTWZG]_/ || $bestNucleotide =~ /^AC_/) );
Sebastien Moretti's avatar
Sebastien Moretti committed
393
        my ($BOJresult, $bestGenomic) = prepareResults4BOJ($BOJ, $multipleequalblasthits[$qq], ${original_seq[$r]}, $right_name, $intronLess, $nameLess);
394
        $resultBOJ .= $BOJresult      if ( $bestGenomic ne '' && $BOJresult ne '' && $resultBOJ !~ /_G_$bestGenomic[ \-]/ );
395
#    print " $intronStep $POS->{0} [@intronStatus] {$BOJresult} $bestNucleotide - $bestGenomic \t $BOJ->{0}\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
396
397
398
    }


399
    if ( $resultPOS eq '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
400
401
        buildFailureOutputFiles($r, $failureStatus[1], $failureStatus[0], '')     if ( $failureStatus[0] ne '' );
        buildFailureOutputFiles($r, $multipleequalblasthits[0], 'Failed_Aln', '') if ( $failureStatus[0] eq '' );
402
        next;
Sebastien Moretti's avatar
Sebastien Moretti committed
403
    }
404
405
    elsif ( $resultBOJ eq '' ){
        if ( $failureStatus[0] eq 'PUI_unavailable' || $failureStatus[0] eq 'No_nt_link' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
406
            buildFailureBOJOutputFile($failureStatus[1], $failureStatus[0], $right_name, ${original_seq[$r]});
407
408
        }
        elsif ( $failureStatus[0] eq '' && $intronStatus[0] ne '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
409
            buildIntronlessBOJOutputFile($intronStatus[1], $intronStatus[0], $intronStatus[2], $right_name, ${original_seq[$r]});
410
411
        }
        else{
Sebastien Moretti's avatar
Sebastien Moretti committed
412
            buildFailureBOJOutputFile($multipleequalblasthits[0], 'Unavailable', $right_name, ${original_seq[$r]});
413
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
414
        createCDSOutputFile($resultPOS, ${original_seq[$r]}, $right_name);
Sebastien Moretti's avatar
Sebastien Moretti committed
415
416
    }
    else {
Sebastien Moretti's avatar
Sebastien Moretti committed
417
418
        createCDSOutputFile($resultPOS, ${original_seq[$r]}, $right_name);
        createBOJOutputFile($resultBOJ, ${original_seq[$r]});
Sebastien Moretti's avatar
Sebastien Moretti committed
419
420
421
422
423
424
425
426
427
    }


    undef $original_seq[$r];
    undef $original_names[$r];
    undef $input_order[$r];
}


Sebastien Moretti's avatar
Sebastien Moretti committed
428
429
checkAndCleanStderrFiles("$cache/${date}_ExonerateError") if ( -e "$cache/${date}_ExonerateError" );
checkOutputFiles();
430
431
unlink("$msa") if ( $change==1 );
my @tmpFiles = glob($cache."/${date}*");
Sebastien Moretti's avatar
Sebastien Moretti committed
432
433
if ( exists($tmpFiles[0]) ){
    mkdir("$cache/${date}_TMP");
434
    move("$_", "$cache/${date}_TMP") while(<@tmpFiles>);
Sebastien Moretti's avatar
Sebastien Moretti committed
435
436
437
438
439
440
441
}


#Ouput Checker


#STDOUT Log output for our server
442
if ( -s "${originalMSA}.cds" || -s "${originalMSA}.out"){
Sebastien Moretti's avatar
Sebastien Moretti committed
443
444
    print "\n\nTERMINATION STATUS: SUCCESS\n";
    print "OUTPUT RESULTS\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
445
446
447
448
449
    print "    #### File Type=        MSA Format= fasta_CDS Name= ${originalMSA}.cds\n" 
        if ( -s "${originalMSA}.cds");
    print "    #### File Type=        MSA Format= fasta_CDS+Query Name= ${originalMSA}.cdsP\n" 
        if ( -s "${originalMSA}.cdsP");
    print "    #### File Type=        MSA Format= colored_fasta_CDS Name= ${originalMSA}.cdsP.html\n" 
Sebastien Moretti's avatar
Sebastien Moretti committed
450
        if ( Views::Html("${originalMSA}.cdsP") && -s "${originalMSA}.cdsP.html");
Sebastien Moretti's avatar
Sebastien Moretti committed
451
452
453
454
    print "    #### File Type=        MSA Format= Rejected_seq Name= ${originalMSA}.out\n" 
        if ( -s "${originalMSA}.out");
    print "    #### File Type=        MSA Format= fasta_Exon-Boundaries Name= ${originalMSA}.boj\n" 
        if ( -s "${originalMSA}.boj" && -s "${originalMSA}.cds" && $hideBOJ==0);
Sebastien Moretti's avatar
Sebastien Moretti committed
455
456
}
else{
Sebastien Moretti's avatar
Sebastien Moretti committed
457
    failure();
458
    exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
459
460
}

461
exit(0);
Sebastien Moretti's avatar
Sebastien Moretti committed
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543


=head1 NAME

ProtoGene.pl - Converts a peptidic alignment to a nucleotidic alignment through database search

=head1 SYNOPSIS

B<ProtoGene.pl> --msa=path_of_the_fasta_msa_file [options]

with these options available:

=over 4

=item I<--orgm>=targeted species in the database search

=item All_organisms [default], Bacteria, Viruses, Vertebrata, Eukaryota, Mammalia, Primates, Homo_sapiens, Gallus_gallus, Bos_taurus, Escherichia_coli, Arabidopsis_thaliana, Mus_musculus, Drosophila_Melanogaster, ...

=item I<--db>=targeted database in the database search

=item nr, pdb, swissprot, refseq_protein [default]

=item I<--local> to execute a local BLAST query with  --db=path_for_a_local_db_blast_formated

=item I<--template> to provide your own nucleotidic sequences following the cds file format

=item I<--revtrans> to reverse-translate sequences with no blast hit, in IUB (IUPAC) depiction code

=item They are removed from the alignement by default

=item I<--pep> to add the original peptide query beneath the back-translated sequence

=item I<--cache>=none, update, use, own_path_directory, old, empty

=item none: no cache usage, none temporary files are stored

=item update: use cache but update old files [default]

=item use: force use cache, whatever the age of files

=item own_path_directory: use my own directory, and its files

=item old: remove the old files in the cache directory

=item empty: empty the whole cache directory $HOME/.ProtoGene/

=item I<--version> to print version information

=item I<--help> to print this help message

=back

=head1 DESCRIPTION

PROTOGENE : Bona-Fide Back Translation of Protein Multiple Sequence Alignments
It converts a peptidic alignment to a nucleotidic alignment through database search.
Blast queries allow to get nucleotidic sequences from where peptidic sequences come from.
Exonerate aligns nucleotidic and peptidic sequences together.
PROTOGENE re-builds the original alignment with nucleotidic information it has gotten.

=head1 REQUIREMENT

=over 4

=item B<Perl 5.6 or better> is required !

=item Standard Perl modules B<lib>, B<strict>,  B<warnings>,  B<diagnostics>, B<Env> are required

=item and some other current ones : B<Getopt::Long>,  B<File::Which>,  B<File::Copy>, B<Mail::Send>

=item -

=item I<exonerate> from http://www.ebi.ac.uk/~guy/exonerate/

=item I<blast> from http://www.ncbi.nlm.nih.gov/BLAST/download.shtml or http://blast.wustl.edu/

=back

=head1 VERSION

=over 8

544
=item version 4.0.0
Sebastien Moretti's avatar
Sebastien Moretti committed
545

546
=item on Apr 02nd, 2011
Sebastien Moretti's avatar
Sebastien Moretti committed
547
548
549
550
551
552
553
554
555

=back

=head1 AUTHORS

=over 8

=item Sebastien MORETTI

Sebastien Moretti's avatar
Sebastien Moretti committed
556
=item moretti.sebastien [AT] gmail.com
Sebastien Moretti's avatar
Sebastien Moretti committed
557

558
=item Vital-IT computing center
Sebastien Moretti's avatar
Sebastien Moretti committed
559

560
=item Swiss Institute of Bioinformatics
Sebastien Moretti's avatar
Sebastien Moretti committed
561

562
=item Lausanne, Switzerland
Sebastien Moretti's avatar
Sebastien Moretti committed
563

564
=item http://www.vital-it.ch/
Sebastien Moretti's avatar
Sebastien Moretti committed
565
566
567
568
569
570
571
572
573

=back

=cut


######################  Management  ######################

#Failure declaration
Sebastien Moretti's avatar
Sebastien Moretti committed
574
sub failure {
575
576
577
    print {*STDERR} "\n\nFATAL\n\nTERMINATION STATUS: FAILURE\n\n";
    return;
}
Sebastien Moretti's avatar
Sebastien Moretti committed
578
579
580

#Check external programs presence in the PATH
sub checkProgramsPresence{
581
    my ($webblast_exe, $exonerate_exe) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
582

583
    if ( $webblast_exe eq '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
584
        failure();
585
586
        print {*STDERR} "\twebblast.pl program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
        exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
587
    }
588
    if ( $exonerate_exe eq '' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
589
        failure();
590
591
        print {*STDERR} "\texonerate program is not reachable\n\tIt could not be in your PATH or not installed\n\n";
        exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
592
593
594
595
596
    }
}

#Check cache directory accessibility: Test and change rights, if needs, of the $HOME/.ProtoGene/ cache directory
sub checkCacheAccessibility{
597
    my ($cacheDir) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
598

599
    if ( -d "$cacheDir/" && -w "$cacheDir/" && -x "$cacheDir/" && -r "$cacheDir/" ){
Sebastien Moretti's avatar
Sebastien Moretti committed
600
    }
601
602
603
    elsif ( -d "$cacheDir/" ){
        if ( -o "$cacheDir/" ){
            chmod 0754, "$cacheDir/";
604
        }
605
        else{
Sebastien Moretti's avatar
Sebastien Moretti committed
606
            print {*STDERR} "\n\t** Warning ** :\n\tPermissions do NOT seem to be valid for the current cache directory\n\n";
607
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
608
609
    }
    else{
610
611
612
        mkdir "$cacheDir/";
        if ( -o "$cacheDir/" ){
            chmod 0754, "$cacheDir/"; #0754 est en octal, donc 754 ou '0754' ne sont pas bons !
613
614
        }
        else{
Sebastien Moretti's avatar
Sebastien Moretti committed
615
            print {*STDERR} "\n\t** Warning ** :\n\tPermissions do NOT seem to be valid for the current cache directory\n\n";
616
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
617
    }
618
    return;
Sebastien Moretti's avatar
Sebastien Moretti committed
619
620
621
622
}

#Cache Management
sub cacheManagement{
623
    my ($cache, $cacheDir, $inScript) = @_;
624
625
    $inScript = 0 if ( !defined $inScript );

626
627
    if ( $cache eq 'empty' ){
        my @temp_files = glob($cacheDir.'/*.fas');
628
629
630
        if ( exists ($temp_files[0]) ){
            unlink $_ while(<@temp_files>);
        }
631
        @temp_files = glob($cacheDir.'/*_ExonerateError');
632
633
634
        if ( exists ($temp_files[0]) ){
            unlink $_ while(<@temp_files>);
        }
635
636
        unlink("$cacheDir/webblast.log", "$cacheDir/web_tempo.result", "$cacheDir/debug.tempo");
        print "\n\tcache directory [$cacheDir] is empty\n\n";
637
        exit 0;
Sebastien Moretti's avatar
Sebastien Moretti committed
638
    }
639
640
641
    elsif ( $cache eq 'old' ){
        my @temp_files = glob($cacheDir.'/*.fas');
        my @tempFiles  = grep { -M $_ > $cacheStorageTime } @temp_files; #Sort list only with $cacheStorageTime days old files
642
643
644
        if ( exists ($tempFiles[0]) ){
            unlink $_ while(<@tempFiles>);
        }
645
        print "\n\tcache directory [$cacheDir] is empty of its old files\n\n" if ($inScript==0);
646
        exit 0 if ( $inScript==0 );
Sebastien Moretti's avatar
Sebastien Moretti committed
647
    }
648
649
    elsif ( $cache eq 'update' || $cache eq 'use' ){
        $cache = $cacheDir; #Limit cache files to less than $cacheStorageTime days old ones for 'update'
Sebastien Moretti's avatar
Sebastien Moretti committed
650
    }
651
    elsif ( $cache eq 'none' ){
Sebastien Moretti's avatar
Sebastien Moretti committed
652
    }
653
654
655
656
    elsif ( -d $cache ){
        die "\n\tProtoGene cannot access to your directory\n\n"  if ( ! -x $cache );
        die "\n\tProtoGene cannot write into your directory\n\n" if ( ! -w $cache );
        die "\n\tProtoGene cannot read into your directory\n\n"  if ( ! -r $cache );
Sebastien Moretti's avatar
Sebastien Moretti committed
657
658
    }
    else {
659
        die "\n\tWrong cache argument, or your cache folder doesn't seem to exist\n\n";
Sebastien Moretti's avatar
Sebastien Moretti committed
660
661
662
663
664
665
666
667
668
669
670
671
    }

    return($cache);
}

##########################################################


######################    Fasta    ######################

#Check input aln
sub checkFastaFile{
672
    my ($msa, $change) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
673

674
    if ( -e "$msa" ){
Sebastien Moretti's avatar
Sebastien Moretti committed
675
    #Convert no fasta format to fasta with seq_reformat if available
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
        my $seqreformat = which('t_coffeeX') || ''; #desactive tant que t_coffee seq_reformat pose prob
        if ( $seqreformat ne '' ){
            my @formatType = `$seqreformat -other_pg seq_reformat -in $msa -print_format`;
            my $format     = '';
            while(<@formatType>){
                if ( $_ =~ /FORMAT:/ ){
                    $format = $_;
                    chomp($format);
                    last;
                }
            }
            if ( $format ne '' && $format !~ /FORMAT:fasta_/ ){
                system("$seqreformat -other_pg seq_reformat -in $msa -output fasta_aln -out=${msa}.007") if ($format =~ /_aln *$/);
                system("$seqreformat -other_pg seq_reformat -in $msa -output fasta_seq -out=${msa}.007") if ($format =~ /_seq *$/);
                $msa    .= '.007';
                $change  = 1;
            }
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
694
695
    }
    else {
Sebastien Moretti's avatar
Sebastien Moretti committed
696
        failure();
697
698
        print {*STDERR} "\tThe file \'$msa\' doesn't not seem to be reachable\n\n";
        exit(1);
Sebastien Moretti's avatar
Sebastien Moretti committed
699
700
701
702
703
704
705
    }

    return($msa,$change);
}

#Prepare Fasta Header of the query name to incorporate our annotations
sub fastaHeaders4ProtoGene{
706
    my ($QueryName) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
707
708

    $QueryName =~ s/  +/ /g;
709
710
711
712
    my ($cqacc, $cqdesc) = ($QueryName, $QueryName);
    $cqacc  =~ s/^ *([^ ]*) .*$/$1/;
    $cqacc  =~ s/[\s\t]*$//g;
    $cqacc  =  $cqacc.'_G_@@';
Sebastien Moretti's avatar
Sebastien Moretti committed
713
714
715
716
    $cqdesc =~ s/^ *[^ ]* *(.*)$/$1/;
    $cqdesc =~ s/^[\| \.]*//;
    $cqdesc =~ s/[\|\. ]*$//;

717
    $QueryName = $cqacc.$cqdesc;
Sebastien Moretti's avatar
Sebastien Moretti committed
718
719
720
721
722
    return($QueryName);
}

#Prepare Fasta Header to put it on the fasta header of the result outputs
sub getFastaHeaderAnnot{
723
    my ($bestTarget) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
724
725

    #Get annotation from nucleotide file header
726
727
728
    open(my $BEST, '<', "$cache/${bestTarget}.fas");
    my $annot=<$BEST> || '';
    close $BEST;
Sebastien Moretti's avatar
Sebastien Moretti committed
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
    chomp($annot);
    $annot =~ s/^ *[^ ]* //;
    $annot =~ s/[\. ]*$//;
    $annot =~ s/^ *//;
    $annot =~ s/  +/ /g;

    return($annot);
}

#########################################################


###################### Translation ######################

#Direct Reverse translation
sub reverse_trad{
745
    my ($aa) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
746
747
748
749
750
751
752

    my %reverse_code=('A' => 'GCN', 'F' => 'TTy', 'K' => 'AAr', 'P' => 'CCN', 'T' => 'ACN',
                      'C' => 'TGy', 'G' => 'GGN', 'L' => 'yTN', 'Q' => 'CAr', 'V' => 'GTN',
                      'D' => 'GAy', 'H' => 'CAy', 'M' => 'ATG', 'R' => 'mGN', 'W' => 'TGG',
                      'E' => 'GAr', 'I' => 'ATh', 'N' => 'AAy', 'S' => 'wsN', 'Y' => 'TAy',
                      'X' => 'NNN', '-' => '---', '*' => 'Trr'); # '*' is for stop codon

753
754
755
756
757
758
    my $triplet = 'nnn';
    while( my ($x,$y) =each(%reverse_code) ){
        if ( uc($aa) eq $x ){
            $triplet = $y;
            last;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
759
760
761
762
763
764
765
766
767
768
    }

    return($triplet);
}

#########################################################


####################### webblast #######################
sub launchWebblast{
769
770
    my ($infile, $quiet, $whatBlast, $db, $species, $blast_exe, $outfile) = @_;
    my $blast_at = '';
Sebastien Moretti's avatar
Sebastien Moretti committed
771

772
773
774
    $blast_at    = "-database=$db"                         if ( $whatBlast==0 );
    $blast_at    = "-database=$db -blast_dir=${blast_exe}" if ( $whatBlast==1 );
    $blast_at    = "-database=$db -gigablast=yes"          if ( $whatBlast==2 );
Sebastien Moretti's avatar
Sebastien Moretti committed
775
776
777
778
779
780
781
782
783
784
785
786
787

#    print "             Evalue treshold : 0.05
#             Matrix : BLOSUM62
#             Filter : F
#             **********************
#             For RefSeq Protein db:
#             Blast_identity_threshold : 100
#             Cover threshold : 100
#             **********************
#             For NR Protein db:
#             Blast_identity_threshold : 95
#             Cover threshold : 95";

788
    system("$webblast_exe -program=blastp ${blast_at} -infile=$infile -matrix=BLOSUM62 -evalue=0.05 -method=geneid -filter=Off -organism=$species -identity=100 -cover=100 -hits=10 $quiet -outfile=$outfile");
Sebastien Moretti's avatar
Sebastien Moretti committed
789
790


791
792
793
794
795
796
797
    if ( -z "$outfile" ){ #Another blastP query if no hit with refseq and db was not nr !
        $quiet = '-quiet=on';
        print {*STDERR} "run BLAST.. against NR because there was no acceptable hit against $db\n"
            if ( $whatBlast != 1 && $db ne 'nr' );
        print {*STDERR} "run BLAST.. again, with decreased thresholds, because there was no acceptable hit against $db\n"
            if ( $whatBlast == 1 || $db eq 'nr' );
        $blast_at =~ s/-database=[^\s]*/-database=nr/ if ( $whatBlast != 1 );
798
        system("$webblast_exe -program=blastp ${blast_at} -infile=$infile -matrix=BLOSUM62 -evalue=0.05 -method=geneid -filter=Off -organism=$species -identity=95 -cover=95 -hits=10 $quiet -outfile=$outfile");
Sebastien Moretti's avatar
Sebastien Moretti committed
799
800
    }

801
802
803
804
805
806
807
    move('webblast.log',     "$cache"); #Move temporary webblast.pl files
    move('web_tempo.result', "$cache");
    move('debug.tempo',      "$cache");

    unlink("$cache/webblast.log")     if ( -z "$cache/webblast.log" );
    unlink("$cache/web_tempo.result") if ( -z "$cache/web_tempo.result" );
    unlink("$cache/debug.tempo")      if ( -z "$cache/debug.tempo" );
Sebastien Moretti's avatar
Sebastien Moretti committed
808

809
    return;
Sebastien Moretti's avatar
Sebastien Moretti committed
810
811
812
813
814
815
816
817
818
819
}


########################################################


##################### NCBI requests #####################

#Prot ACC -> PUID
sub blastPAcc2PGI{
820
    my ($blastHit) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
821

822
823
824
    my $protGI = '';
    my $count  = 0;
    GET_PUI:
Sebastien Moretti's avatar
Sebastien Moretti committed
825
    for(my $rep=0;$rep <= 4; $rep++){
826
        $count++;
827
828
#        system("wget -q -O $cache/${date}_${blastHit}protGi.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=${blastHit}[pacc]&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
        system("wget -q -O $cache/${date}_${blastHit}protGi.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=${blastHit}&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
829
830
831
        open(my $GIP, '<', "$cache/${date}_${blastHit}protGi.tmp");
        PROT_GI:
        while(<$GIP>){
832
            if ( $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
833
834
835
836
837
838
839
                $protGI = $1;
                last PROT_GI; #OK if only one PUID
            }
        }
        close $GIP;
        $rep = $rep-15 if ( -z "$cache/${date}_${blastHit}protGi.tmp" && $count==1 );
        last GET_PUI if ( $protGI =~ /^\d+$/ );
Sebastien Moretti's avatar
Sebastien Moretti committed
840
    }
841
    unlink("$cache/${date}_${blastHit}protGi.tmp") if ( $tmp == 0 || $protGI ne '' );
Sebastien Moretti's avatar
Sebastien Moretti committed
842
843
844
845
846
847

    return($protGI);
}

#PUID -> nt GIs
sub protGI2NTGIs{
848
    my ($protGI, $blastHit) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
849

850
851
852
853
    my $ntGIs  = '';
    my $geneID = '';
    my $count  = 0;
    GET_NTUI:
Sebastien Moretti's avatar
Sebastien Moretti committed
854
    for(my $rep=0;$rep <= 4; $rep++){
855
        $count++;
Sebastien Moretti's avatar
Sebastien Moretti committed
856
        #nt GIs
857
        system("wget -q -O $cache/${date}_${blastHit}nucleo.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,nucleotide,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
858
859
860
861
862
863
864
865
866
867
868
        open(my $GIN, '<', "$cache/${date}_${blastHit}nucleo.tmp");
        my $flag = 0;
        NT_GI:
        while(<$GIN>){
            if ( $_   =~ /<LinkName>protein_nuc[a-z]+<\/LinkName>/ ){ #for nuccleotide and nuccore
                $flag = 1;
            }
            elsif ( $_ =~ /\<LinkName\>protein_gene\<\/LinkName\>/ ){
                $flag  = 2;
            }
            elsif ( $flag==1 && $_ =~ /^.*\<Id\>(\d+)\<\/Id\>.*$/ ){
869
870
871
                my $match = $1;
                $ntGIs  .= ",$match" if ( $ntGIs  ne '' && $ntGIs  !~ /,$match,/ && $ntGIs  !~ /^$match,/ );
                $ntGIs   = $match    if ( $ntGIs  eq '' );
872
873
            }
            elsif ( $flag==2 && $_ =~ /^.*\<Id\>(\d+)\<\/Id\>.*$/ ){
874
875
876
                my $match = $1;
                $geneID .= ",$match" if ( $geneID ne '' && $geneID !~ /,$match,/ && $geneID !~ /^$match,/ );
                $geneID  = $match    if ( $geneID eq '' );
877
878
879
880
881
            }
        }
        close $GIN;
        $rep = $rep-15 if ( -z "$cache/${date}_${blastHit}nucleo.tmp" && $count==1 );
        last GET_NTUI if ( $ntGIs ne '' || $geneID ne '' );
Sebastien Moretti's avatar
Sebastien Moretti committed
882
    }
883
    unlink("$cache/${date}_${blastHit}nucleo.tmp") if ( $tmp==0 || ($ntGIs ne '' || $geneID ne '') );
Sebastien Moretti's avatar
Sebastien Moretti committed
884

885
    return($ntGIs, $geneID);
Sebastien Moretti's avatar
Sebastien Moretti committed
886
887
888
}

sub geneID2Chr{
889
    my ($geneID, $blastHit) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
890

891
892
893
894
    my $chr   = '';
    my ($amont, $aval) = ('', '');
    my $count = 0;
    GET_CHR:
Sebastien Moretti's avatar
Sebastien Moretti committed
895
    for(my $rep=0;$rep <= 8; $rep++){
896
        $count++;
Sebastien Moretti's avatar
Sebastien Moretti committed
897
        #Gene Acc
898
899
#        system("wget -q -O $cache/${date}_${blastHit}gene.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
        system("wget -q -O $cache/${date}_${blastHit}gene.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
900
901
902
903
904
905
906
        open(my $GACC, '<', "$cache/${date}_${blastHit}gene.tmp");
        my $flag = 0;
        GENE_CHR:
        while(<$GACC>){
            if ( $_  =~ /\<ERROR\>Empty id list \- nothing todo\<\/ERROR\>/ && $flag==0 ){
                $rep = $rep-15 if ( $count==1 );
            }
907
908
909
#            if ( $_   =~ /\<Entrezgene_locus\>/ && $flag==0 ){
            if ( $_   =~ /<Item Name="GenomicInfoType" Type="Structure">/ && $flag==0 ){
               $flag = 1;
910
            }
911
912
913
914
915
916
917
#            elsif ( $_ =~ /\<Gene-commentary_type value=\"genomic\"\>/ && $flag==1 ){
#                $flag  = 2;
#            }
#            elsif ( $_ =~ /\<Gene-commentary_type value=/ && $flag==2 ){
#                last GENE_CHR;
#            }
#            elsif ( $_ =~ /\<Gene-commentary_accession\>([\w\_\-\.]+)\<\/Gene-commentary_accession\>/ && $flag==2 ){
Sebastien Moretti's avatar
Sebastien Moretti committed
918
            elsif ( $_ =~ /<Item Name="ChrAccVer" Type="String">([^<]+?)\.?\d*<\/Item>/ && $flag==1 ){
919
920
921
                $chr   = $1;
                $flag  = 3;
            }
922
923
#            elsif ( $_ =~ /\<Seq-interval_from\>(\d+)\<\/Seq-interval_from\>/ && $flag==3 ){
            elsif ( $_ =~ /<Item Name="ChrStart" Type="Integer">(\d+)<\/Item>/ && $flag==3 ){
924
925
926
                $amont = $1;
                $flag  = 4;
            }
927
928
#            elsif ( $_ =~ /\<Seq-interval_to\>(\d+)\<\/Seq-interval_to\>/ && $flag==4 ){
            elsif ( $_ =~ /<Item Name="ChrStop" Type="Integer">(\d+)<\/Item>/ && $flag==4 ){
929
930
931
932
933
934
935
                $aval  = $1;
                last GENE_CHR;
            }
        }
        close $GACC;
        $rep = $rep-15 if ( -z "$cache/${date}_${blastHit}gene.tmp" && $count==1 );
        last GET_CHR   if ( $chr ne '' && $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
Sebastien Moretti's avatar
Sebastien Moretti committed
936
    }
937
938
939
940
    unlink("$cache/${date}_${blastHit}gene.tmp") if ( $tmp == 0 || $chr ne '' );
    if ( $amont eq '' || $aval eq '' ){
        $amont = '';
        $aval = '';
Sebastien Moretti's avatar
Sebastien Moretti committed
941
    }
942
943
944
945
946
947
948
949
950
    elsif ( $amont ne '' && $aval ne '' ){
        if ( $amont > $aval ){
            $amont = $amont+5000;
            $aval  = $aval-5000;
        }
        else{
            $amont = $amont-5000;
            $aval  = $aval+5000;
        }
Sebastien Moretti's avatar
Sebastien Moretti committed
951
    }
952
953
    $amont = 1 if ( $amont ne '' && $amont <= 0 );
    $aval  = 1 if ( $aval ne ''  && $aval <= 0 );
Sebastien Moretti's avatar
Sebastien Moretti committed
954

955
    return($chr, $amont, $aval);
Sebastien Moretti's avatar
Sebastien Moretti committed
956
957
958
}

sub downloadSeqFromGIs{
959
    my ($cache, $date, $amont, $aval, @acc) = @_;
Sebastien Moretti's avatar
Sebastien Moretti committed
960

961
    GET_SEQ:
Sebastien Moretti's avatar
Sebastien Moretti committed
962
    for(my $a=0; $a<=$#acc; $a++){
963
964
965
966
        my $whatNumber = 0;
        if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
            GET_CHR_SEQ:
            for(my $rep=0;$rep <= 4; $rep++){
967
                system("wget -q -O $cache/${acc[$a]}-${amont}.fas 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${acc[$a]}&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch'") if ( !-e "$cache/${acc[$a]}-${amont}.fas" || -z "$cache/${acc[$a]}-${amont}.fas" || $cache eq 'none' || ($cache eq 'update' && -M "$cache/${acc[$a]}-${amont}.fas" > $cacheStorageTime ) );
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
                open(my $CIBLE, '<', "$cache/${acc[$a]}-${amont}.fas");
                my $counter = 0;
                my $lines   = 0;
                CHECK_CHR_SEQ:
                while(<$CIBLE>){
                    $lines++              if ( $_ !~ /^>/ );
                    $counter = $counter+2 if ( $counter==1 && $_ !~ /^\w/ && $lines==1 && $_ !~ /^>/ );
                    $counter++            if ( $_ =~ /^>/ );
                    $counter = $counter+2 if ( $_ !~ /^>/ && ($_ =~ /Error:/ || $_ =~ /[<>]/) );
#                   last CHECK_CHR_SEQ    if ( $_ !~ /^>/ && $counter==1 && $_ =~ /^\w/ );
                }
                close $CIBLE;
                $whatNumber++;
                last GET_CHR_SEQ if ( $whatNumber==20 );
                if ( $counter != 1 ){
                    $rep = $rep-1;
                    unlink("$cache/${acc[$a]}-${amont}.fas");
                }
            }
        }
        else{
            GET_OTHER_SEQ:
            for(my $rep=0;$rep <= 4; $rep++){
991
                system("wget -q -O $cache/${acc[$a]}.fas 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${acc[$a]}&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'") if ( !-e "$cache/${acc[$a]}.fas" || -z "$cache/${acc[$a]}.fas" || $cache eq 'none' || ($cache eq 'update' && -M "$cache/${acc[$a]}.fas" > $cacheStorageTime ) );
992
993
994
995
996
997
998
999
1000
                open(my $CIBLE, '<', "$cache/${acc[$a]}.fas");
                my $counter = 0;
                my $lines   = 0;
                CHECK_OTHER_SEQ:
                while(<$CIBLE>){
                    $lines++              if ( $_ !~ /^>/ );
                    $counter = $counter+2 if ( $counter==1 && $_ !~ /^\w/ && $lines==1 && $_ !~ /^>/ );
                    $counter++            if ( $_ =~ /^>/ );
                    $counter = $counter+2 if ( $_ !~ /^>/ && ($_ =~ /Error:/ || $_ =~ /[<>]/) );