webblast.pl 31.9 KB
Newer Older
Sebastien Moretti's avatar
Sebastien Moretti committed
1
2
3
#!/usr/bin/env perl
#
#
4
#date : 2007/11/19
Sebastien Moretti's avatar
Sebastien Moretti committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#prog : webblast.pl
#subj : make a BLAST/WU-BLAST (by HTTP request or locally) against a database with a file containing sequences in fasta format
####### methods: geneid, pdbid and profile
#
#############################################################################################
#use Env qw(HOME);
#use lib "$HOME/.lib_webblast/";
use LWP::UserAgent; 
use HTML::Parser;                                            # @@@@@@@ #       
use HTTP::Request::Common qw(POST);                         # @/^   ^\@ # 
use URI::Escape;                                           # @/ -   - \@ #
use Getopt::Long;                                         ##  \   ^   /  ##
use strict;                                              ##    |  0  |    ##
use warnings;                                           ####### \ _ / #######
19
##############################################################################################
Sebastien Moretti's avatar
Sebastien Moretti committed
20
21


22
23
24
25
26
27
28
29
############################  EXPRESSO PARAM  ##########################################################
# Site-specific settings, only used when the script runs for the Expresso server
# (database "expressopdb" + blast_dir "blastexpresso", see below and LOCAL_BLAST).
my $database_expresso  = 'pdb';                                            # PDB database name
my $blast_dir_expresso = '/mnt/local/bin/blastall';                        # blastall executable
my $BLASTMAT           = 'export BLASTMAT=/mnt/local/ncbi/data/';          # matrix directory for blast
my $BLASTDB            = 'export BLASTDB=/scratch/frt/blastnet/database/'; # PDB directory
########################################################################################################

# Script launched by LOCAL_BLAST for the -gigablast mode.
my $runblast = '/mnt/local/bin/runblast.pl';

# File-scoped state shared with the subroutines defined further down.
my @list_encoded = ();
my @list_pdb     = ();
my %deja_vu      = ();
my @pdb_list     = ();
my $i            = 0;
my @names        = ();
my $locale       = 0;   # set to 1 for a local blastall run, 2 for gigablast
my $distant      = 0;   # set to 1 when the search is sent to the NCBI
my $database;
my $blast_way;

my $ua = LWP::UserAgent->new;

##-- Environment variables

my $database_var = $ENV{'DATABASE'};
my $blast_var    = $ENV{'BLAST_DIRECTORY'};

##-- Fetch and validate the BLAST options/parameters (all checks are done in OPTIONS_GET)

my ($program, $database_line, $blast_line, $query_file,
    $out_file, $identity_treshold, $cover_tresh, $Eval,
    $align, $matrix, $filter, $method, $orgn, $process, $quiet, $gigablast) = OPTIONS_GET();

##-- Decide between LOCAL and REMOTE BLAST && check database/program

unless (-e $query_file) { print {*STDERR} "\nfile does not exist!\n"; exit; }
unless (-s $query_file) { print {*STDERR} "\nyour file is empty!\n";  exit; }

if (($database_line || $database_var) && ($blast_line || $blast_var))
{
    if ($database_line =~ /expressopdb/ && $blast_line =~ /blastexpresso/)
    {
        # Special mode for the configuration file of the Expresso server.
        $locale   = 1;
        $database = "expressopdb";
        unless ($quiet =~ /on/i) { print {*STDERR} "\nRUN BLAST LOCALY\n"; }
    }
    else
    {
        # The command line wins for the database, the environment wins for the
        # BLAST directory (order kept from the original code).
        $database = $database_line || $database_var;
        my $blast_tp = $blast_var || $blast_line;
        $locale    = 1;
        $blast_way = CONTROLE_DB_PG($database, $blast_tp, $program);
        unless ($quiet =~ /on/i) { print {*STDERR} "\nRUN BLAST LOCALY\n"; }
    }
}
else
{
    $database = NCBI_DATABASE($database_line);
    $distant  = 1;
    if ($gigablast =~ /^yes$/i)
    {
        $locale  = 2;
        $distant = 0;
        unless ($quiet =~ /on/i) { print {*STDERR} "\nRUN GIGABLASTER\n"; }
    }
    else
    {
        unless ($quiet =~ /on/i) { print {*STDERR} "\nRUN BLAST AT THE NCBI\n"; }
    }
}

##-- Adjust the database according to the value of the -method flag

if ($method =~ /^pdbid$/i)
{
    if ($gigablast =~ /^yes$/i)
    {
        if ($database ne "pdb")
        {
            print {*STDERR} "\nprovide a valid database name FOR RUN GIGABLASTER: nr,pdb or refseq_protein\n";
            exit;
        }
        else
        {
            $database = "pdbaa";
        }
    }
    elsif ($gigablast =~ /^no$/i && $distant == 1)
    {
        $database = "pdb";
    }
}
elsif ($method =~ /^geneid$/i)
{
    if ($distant == 1)
    {
        unless ($database eq "nr" || $database eq "swissprot" || $database eq "pdb") { $database = "refseq_protein"; }
    }
}
elsif ($method =~ /^profile$/i)
{
    if ($distant == 1)
    {
        unless ($database eq "nr" || $database eq "swissprot" || $database eq "refseq_protein") { $database = "pdb"; }
    }
}
else
{
    die "unknown method\n";
}

if (($orgn !~ /All\+organisms/) && ($locale =~ /1|2/))
{
    print {*STDERR} "-organism option can't be used locally or with -gigablast option!\n";
    exit;
}

##--- DISPLAY the option values
unless ($quiet =~ /on/i)
{
    print {*STDERR} "   
              
             Program : $program
             Database : $database
             Method : $method
     
             Query_file : $query_file
             Out_file : $out_file 
             ";
    # This part of the banner goes to STDOUT in the original code; kept as is.
    print {*STDOUT} "      
             Evalue threshold : $Eval
             Matrix : $matrix
             Filter : $filter
             Blast_identity_threshold : $identity_treshold
             Cover threshold : $cover_tresh
              ";
    print {*STDERR} "
             Number of hits :  $align
             Number of processors used : $process 
             ";
    if ($gigablast =~ /^yes$/i) { print {*STDERR} "
             gigablast: yes\n" }
    unless ($locale) { print {*STDERR} "
             Organism : $orgn\n" }

    print {*STDERR} "
***************************************************************\n\n";
}

#-- LOCAL/REMOTE BLASTP

if ($locale == 1 || $locale == 2)
{
    @list_pdb = LOCAL_BLAST($blast_way, $database, $query_file, $Eval, $align, $method, $matrix,
                            $filter, $process, $gigablast, $database_expresso, $blast_dir_expresso, $runblast);
}
elsif ($distant == 1)
{
    @list_pdb = WEB_BLAST($query_file, $Eval, $program, $database, $matrix, $method, $align, $orgn, $filter);
}
else
{
    die " Report bug to armougom\@igs.cnrs-mrs.fr\n";
}

#-- PARSE BLAST RESULTS -> MAKE A PDB_ID LIST
if ($method =~ /^pdbid$/i)
{
    my @result_sort = PARSING(\@list_pdb, $locale, $distant, $method, $quiet, $database, $gigablast);
    AFFICHAGE_PDB_PARSING(\@result_sort, $cover_tresh, $identity_treshold, $out_file);
    exit;
}
#-- PARSE BLAST RESULT -> MAKE LIST OF REFSEQ ID
elsif ($method =~ /^geneid$/i)
{
    my @result_sort = PARSING(\@list_pdb, $locale, $distant, $method, $quiet, $database, $gigablast);
    AFFICHAGE_REFSEQ_PARSING(\@result_sort, $cover_tresh, $identity_treshold, $out_file);
    exit;
}
#-- PARSE BLAST RESULT -> MAKE PROFILE
elsif ($method =~ /^profile$/i)
{
    PROFILE(\@list_pdb, $out_file, $distant);
    exit;
}
else
{
    die " \nFATAL ERROR :  Method or database error\n";
}

exit;
163
                   
Sebastien Moretti's avatar
Sebastien Moretti committed
164
165
166
167
168
169
                                               ##############
###############################################  FONCTIONS  ####################################################################
                                              ##############
# Check the local database file and the BLAST installation directory.
#   $database  : path of the local database file
#   $blast_dir : directory expected to contain the "blastall" program
#   $program   : BLAST program name (accepted for the caller's convenience, not used here)
# Returns the path "<blast_dir>/blastall".
# Dies when the database does not exist, is a directory, or when blastall is not found.
sub CONTROLE_DB_PG
{
    my ($database, $blast_dir, $program) = @_;

    if (! -e $database) { die "$database file  does not exist\n"; }
    if (-d $database)   { die "$database must be a file, not a directory\n"; }

    # Make sure the directory ends with a slash before appending the program name.
    if ($blast_dir !~ /\/$/) { $blast_dir .= "/"; }
    my $blastall = $blast_dir . "blastall";

    if (! -e $blastall) { die "$blastall program not found \n"; }

    return ($blastall);
}

#-------------------------------------------------------------------------------------------------------------------------------------
# Validate a database name for a search at the NCBI.
# Returns the name itself when it is one of the supported databases,
# the empty string when no name was given, and 1 for any other value.
sub NCBI_DATABASE
{
    my ($requested) = @_;

    # Databases accepted for a remote search.
    my %is_supported = map { $_ => '1' } qw(nr pdb swissprot refseq_protein);

    return ($requested) if exists $is_supported{$requested};
    return ("")         if $requested eq "";
    return (1);
}
#------------------------------------------------------------------------------------------------------------------------
# Print the usage message on STDERR and terminate the script.
# The list of available organisms is obtained from LIST_ORGA().
sub HELP
{
    # Only the organism names (everything after the first returned element) are displayed.
    my ($org, @orga) = LIST_ORGA();
    my $list_orga    = join(', ', @orga);

    # The default outfile name quoted below matches the one set in OPTIONS_GET.
    print {*STDERR} "
                      usage: $0 -infile <fasta file> -method <pdbid/geneid or profile> options []\n

                            -program ...... Program Name (blastp)
	                                    Default = blastp
                            -database ..... Database at NCBI (nr, pdb, swissprot, refseq_protein) or indicate a local fasta file
	                                    Default = pdb at NCBI
	                    -infile ....... Query_file = a list of sequences in fasta format
	                    -outfile ...... Name the outfile to make a template file for t_coffee
	                                    Default = STDOUT or default_profile.template if method is profile
                            -evalue ....... Evalue threshold Default = 1
                            -matrix ....... PAM30 PAM70 BLOSUM45 BLOSUM80
                                            Default BLOSUM62
                            -method ....... geneid, pdbid, profile
                            -gigablast..... yes/no FASTER REMOTE BLAST with Gigablaster
                                            (Stephane Audic program: http://www.igs.cnrs-mrs.fr/adele/~database/remoteblast.cgi)
                                            Default no
                            -filter ....... T or F locally, L or R or M or C or V for distant blast
                                            Default = Off
                            -organism ..... $list_orga are available
                                            Default is All_organisms
	                    -identity ..... blast identity threshold = provide a % for view only the results upper or equal to the threshold
                                            Default 50
                            -cover ........ Cover threshold = provide a % : sequence covering Default: 30
                            -hits ........  Number of hits
                                            Default = 1
                            -processor .... Number of processors to use
                                            Default = 1
                            -blast_dir .... Indicates where your BLAST directory is installed localy
                            -quiet ........ on : do not display all the default/defined blast parameters
                                            Default off

                     Environement Variables
                     These variables can be set from the environement
           DATABASE......................[Indicates where your database file must be fetched (localy)]
           BLAST_DIRECTORY...............[Indicates where your BLAST directory is installed localy]

";

    exit;
}
#-----------------------------------------------------------------------------------------
# Read the command-line options, apply the defaults and validate the values.
# Returns, in this order:
#   program, database, blast_dir, query file, out file, identity threshold,
#   cover threshold, e-value threshold, number of hits, matrix, filter, method,
#   organism (as returned by ORGN), number of processors, quiet flag, gigablast flag.
# Prints the usage (HELP, which exits) or exits directly on invalid values.
sub OPTIONS_GET
{
    my %opt = ();

    GetOptions
        (
         'infile=s'    => \$opt{infile},
         'outfile=s'   => \$opt{outfile},
         'program=s'   => \$opt{program},
         'database=s'  => \$opt{database},
         'blast_dir=s' => \$opt{blast_dir},
         'identity=f'  => \$opt{treshold},
         'cover=f'     => \$opt{cover},
         'evalue=f'    => \$opt{evalue},
         'hits=i'      => \$opt{hits},
         'matrix=s'    => \$opt{matrix},
         'filter=s'    => \$opt{filter},
         'method=s'    => \$opt{method},
         'organism=s'  => \$opt{organism},
         'processor=i' => \$opt{processor},
         'quiet=s'     => \$opt{quiet},
         'gigablast=s' => \$opt{gigablast},
        );

    # Anything left in @ARGV was not recognised as an option.
    if ($ARGV[0]) { print "Unprocessed by Getopt::Long\n $ARGV[0]\n"; HELP(); }

    # Mandatory flags.
    my $query_file = $opt{'infile'};
    unless ($query_file) { print {*STDERR} "Flag -infile must be defined\n"; HELP(); }
    my $method = $opt{'method'};
    unless ($method) { print {*STDERR} "Flag -method must be defined\n"; HELP(); }

    # Optional flags: a false value falls back to the default, except for
    # -cover, -identity and -hits where an explicit 0 is kept.
    my $evalue_tresh = $opt{'evalue'}    || 1;
    my $cover_tresh  = defined $opt{'cover'}    ? $opt{'cover'}    : 30;
    my $outfil       = $opt{'outfile'}   || '';
    my $treshold     = defined $opt{'treshold'} ? $opt{'treshold'} : 50;
    my $blast_dir    = $opt{'blast_dir'} || '';
    my $database     = $opt{'database'}  || '';
    my $program      = $opt{'program'}   || 'blastp';
    my $align        = defined $opt{'hits'}     ? $opt{'hits'}     : 1;
    my $matrix       = $opt{'matrix'}    || 'BLOSUM62';
    my $filter       = $opt{'filter'}    || 'F';
    my $organism     = $opt{'organism'}  || 'All organisms';
    my $process      = $opt{'processor'} || 1;
    my $param        = $opt{'quiet'}     || 'off';
    my $gigablast    = $opt{'gigablast'} || 'no';

    if ($method !~ /(^geneid$|^pdbid$|^profile$)/i) { print {*STDERR} "unknown method for the flag -method\n"; HELP(); }

    if ($treshold < 0 || $treshold > 100)       { print {*STDERR} "\nout of range for the option -identity \n"; HELP(); }
    if ($cover_tresh < 0 || $cover_tresh > 100) { print {*STDERR} "\nout of range for the option -cover \n"; HELP(); }
    if ($align < 0)                             { print {*STDERR} "\n error with option -hits\n"; HELP(); }
    if ($gigablast !~ /^yes$|^no$/i)            { print {*STDERR} "invalid argument for gigablast option : yes/no\n"; exit; }
    if ($filter !~ /^[TFRLMCV]{1}$|^off$/i)     { print {*STDERR} "valid values for -filter are T,F,R,L,M,C,or V!\n"; exit; }
    # Anchored so that only the exact matrix names are accepted.
    if ($matrix !~ /^(?:PAM30|PAM70|BLOSUM45|BLOSUM80|BLOSUM62)$/)
    {
        print {*STDERR} "valid values for -matrix  are PAM30,PAM70,BLOSUM45,BLOSUM80 or BLOSUM62\n";
        exit;
    }
    if ($outfil eq "" && $method =~ /^profile$/i) { $outfil = 'default_profile.template'; }

    if ($param !~ /^on$|^off$/i) { print {*STDERR} "valid values for -quiet is on or off\n"; exit; }

    my ($orgn, @all_orgn) = ORGN($organism);

    return ($program, $database, $blast_dir, $query_file,
            $outfil, $treshold, $cover_tresh, $evalue_tresh,
            $align, $matrix, $filter, $method, $orgn, $process, $param, $gigablast);
}

#--------------------------------------------------------------------------------------------------
# Compute the coverage of the query by one BLAST alignment.
#   $pdb_result   : text of one hit (when it holds several HSPs only the first is used)
#   $aln_length   : alignment length reported on the "Identities" line
#   $length_query : query length, possibly written with thousands separators ("1,234")
# Returns ($coverage, $nb_gap): the coverage in percent, formatted with "%-3d",
# and the number of gaps found in the aligned query lines.
# Returns ('', '') when a length is missing or the query length is not a
# positive integer, so that the caller can report an unparsable result
# instead of the script dying on a division by zero.
sub RECOVER
{
    my ($pdb_result, $aln_length, $length_query) = @_;

    return ('', '') if !defined $aln_length || !defined $length_query;

    $length_query =~ s/,//g;
    return ('', '') if $length_query !~ /^\d+$/ || $length_query == 0;

    # Several HSPs for the same hit: keep only the first one.
    if ($pdb_result =~ /(score.+?\n\n\n).+?score/ism) { $pdb_result = $1; }

    # Concatenate the "Query" lines and keep only residues and gap characters.
    my $requete = join('', ($pdb_result =~ /^Query(.*)\n/gm));
    $requete =~ s/[^A-Z-]//g;

    my $nb_gap       = ($requete =~ tr/-//);
    my $recouvrement = sprintf("%-3d", (($aln_length - $nb_gap) / $length_query) * 100);

    return ($recouvrement, $nb_gap);
}
#--------------------------------------------------------------------------------------------------
# Run one BLAST search per query sequence through the NCBI URL API.
#   $query_file : fasta file with the query sequences
#   $Eval, $program, $database, $matrix, $align, $orgn, $filter : BLAST parameters
#   $method     : "profile" selects the query-anchored alignment view, anything else pairwise
# Each successful result is appended to the file-scoped @list_pdb and written to
# web_tempo.result. Uses the file-scoped $ua, @names, @list_encoded and $i.
# Returns @list_pdb.
sub WEB_BLAST
{
    my ($query_file, $Eval, $program, $database, $matrix, $method, $align, $orgn, $filter) = @_;

    open(my $result_fh, '>', 'web_tempo.result') or die "can not open web_tempo.result: $!\n";

    my $format   = 'Txt';
    my $aln_view = ($method =~ /^profile$/i) ? 'FlatQueryAnchoredNoIdentities' : 'Pairwise';

    if ($filter eq 'F') { $filter = 'off'; }

    # Read the fasta file one record at a time.
    # NOTE(review): the record separator is changed globally and stays '>' after
    # this sub returns, exactly as in the original code.
    $/ = '>';
    open(my $query_fh, '<', $query_file) or die "can not open $query_file $!\n";
    my @sequences = <$query_fh>;
    close $query_fh;
    shift(@sequences);   # drop what was read before the first '>'

    foreach my $sequence (@sequences)
    {
        $sequence =~ s/>//g;
        $sequence = ">$sequence";

        my ($name) = ($sequence =~ /^>(.+)\n/);
        push(@names, $name);
        push(@list_encoded, uri_escape($sequence));
    }

    undef(@sequences);
    if (scalar(@names != @list_encoded)) { die "error $!"; }

    foreach my $encoded_seq (@list_encoded)
    {
        my $nb = 0;   # number of reconnection attempts for this query
        print {*STDERR} "BLAST $names[$i]...";

        #-- BUILD THE REQUEST

        my $arguments = "CMD=Put&ENTREZ_QUERY=$orgn&CDD_SEARCH=off&FILTER=$filter&MATRIX_NAME=$matrix&PROGRAM=$program&DATABASE=$database&QUERY=" . $encoded_seq;

        my $req = HTTP::Request->new(POST => 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi');
        $req->content_type('application/x-www-form-urlencoded');
        $req->content($arguments);

        #-- GET THE RESPONSE : PARSE OUT THE REQUEST ID and THE ESTIMATED TIME
        my $response = $ua->request($req);

        if ($response->content =~ /Server Error/i) { die "Server Error at NCBI!!Sorry try later\n"; }
        # Captures are taken from the match itself so that a failed match gives
        # undef instead of a stale $1.
        my ($rid)  = ($response->content =~ /^\s{4}RID = (.*)$/m);
        my ($wait) = ($response->content =~ /^\s{4}RTOE = (.*)$/m);
        unless ($rid && $wait) { die "parse error: $!" }

        # Wait roughly the estimated time before polling.
        for (my $j = 0; $j <= $wait / 2; $j++) { print {*STDERR} "."; sleep 2; }

        my $verif = 0;   # set to 1 when this query must be skipped

        #-- POLL UNTIL THE SEARCH IS FINISHED
        while (1)
        {
            for (my $j = 0; $j <= 5; $j++) { print {*STDERR} "."; sleep 1; }

            $req = HTTP::Request->new(GET =>
                "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid");
            $response = $ua->request($req);

            if    ($response->content =~ /Status=WAITING/im) { next; }
            elsif ($response->content =~ /Status=FAILED/im)  { print {*STDERR} "Search $rid failed\n";  $verif = 1; last; }
            elsif ($response->content =~ /Status=UNKNOWN/im) { print {*STDERR} "Search $rid expired\n"; $verif = 1; last; }
            elsif ($response->content =~ /Status=READY/im)
            {
                if ($response->content =~ /ThereAreHits=yes/im) { last; }
                else { print {*STDERR} "No hits found.\n"; $verif = 1; last; }
            }
            elsif ($response->content =~ /can\'t connect/im)
            {
                print {*STDERR} "\nCan't connect to www.ncbi.nlm.nih.gov:80...new attempt";
                if ($nb < 3) { ++$nb; next; }
                else { print {*STDERR} "sorry, BLAST $names[$i] failed after 3 attempts!!\n"; $verif = 1; last; }
            }
            else { print {*STDERR} "unknown error\n"; $verif = 1; last; }
        }

        if ($verif == 1) { ++$i; next; }

        #-- GET RESULT

        while (1)
        {
            sleep 3;
            $req = HTTP::Request->new(GET => "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&FORMAT_TYPE=$format&FILTER=off&EXPECT=$Eval&ALIGNMENTS=$align&DESCRIPTIONS=$align&ALIGNMENT_VIEW=$aln_view&RID=$rid");
            $response = $ua->request($req);

            # A complete report contains the literature reference ("Altschul").
            if ($response->content =~ /Altschul/i)
            {
                print {*STDERR} "Search Complete\n";
                push(@list_pdb, $response->content);
                # Write only the new report; the original re-wrote the whole
                # accumulated list after every query, duplicating earlier reports.
                print {$result_fh} $response->content;
                last;
            }
        }
        ++$i;
    }

    undef(@list_encoded);

    close $result_fh;
    return (@list_pdb);
}
#-----------------------------------------------------------------------------------------------------------------------
# Run BLAST locally (blastall), through the gigablast script, or with the
# Expresso server settings, depending on $method, $gigablast and $database.
# The output is split on "Query=", copied to blast_result.txt and appended to
# the file-scoped @list_pdb. Uses the file-scoped $quiet, $BLASTMAT and $BLASTDB.
# Returns @list_pdb without its first element (the text before the first "Query=").
sub LOCAL_BLAST
{
    my ($blast_dir, $database, $query_file, $Eval, $align, $method, $matrix, $filter,
        $process, $gigablast, $database_expresso, $blast_dir_expresso, $runblast) = @_;

    # Build the command line for the requested mode.
    my $command;
    if ($method =~ /^profile$/i && $gigablast =~ /^no$/i)
    {
        $command = "$blast_dir -p blastp -d $database -i $query_file -m 6 -M $matrix -v $align -b $align -F $filter -e $Eval -a $process";
    }
    elsif ($method =~ /^geneid$/i && $gigablast =~ /^no$/i)
    {
        $command = "$blast_dir -p blastp -d $database -i $query_file  -v $align  -b $align -F $filter -M $matrix -e $Eval -a $process";
    }
    elsif ($method =~ /^geneid$|^pdbid$/i && ($gigablast =~ /^yes$/i))
    {
        unless ($database eq "nr" || $database eq "pdb" || $database eq "refseq_protein" || $database eq "pdbaa")
        {
            print {*STDERR} "\nsorry invalid database for gigablast\n";
            exit;
        }
        # Names expected by the gigablast script.
        if ($database eq 'pdb')            { $database = 'pdbaa'; }
        if ($database eq 'refseq_protein') { $database = 'refprot'; }
        if ($database eq '')               { print {*STDERR} "provide a valid database!\n"; exit; }
        $command = "$runblast -d $database -p blastp  -e $Eval -v $align -F F <$query_file ";
    }
    elsif ($method =~ /^profile$/i && ($gigablast =~ /^yes$/i))
    {
        print {*STDERR} "\nSorry method profile  can't be used with -gigablast option\n";
        exit;
    }
    elsif ($method =~ /^pdbid$/i && $database =~ /expressopdb/)
    {
        # BLAST for Expresso
        $command = "$BLASTMAT; $BLASTDB;$blast_dir_expresso -p blastp -d $database_expresso -i $query_file -F $filter -e $Eval -M $matrix -v $align -b $align ";
    }
    else
    {
        $command = "$blast_dir -p blastp -d $database -i $query_file -v 1 -b 1  -F $filter -e $Eval -M $matrix   -v $align -b $align -a $process ";
    }

    # Every mode now checks that the pipe could be opened (the gigablast one did not).
    open(my $blast_fh, '-|', $command) or die "can not run '$command': $!\n";

    # $quiet is validated case-insensitively, so it is tested the same way here.
    unless ($quiet =~ /on/i) { print {*STDERR} "\nrun BLAST..."; }

    my ($name_database, $posted, $version) = ('', '', '');

    open(my $out_fh, '>', 'blast_result.txt') or die "can not open blast_result.txt: $!\n";

    # One record per query.
    # NOTE(review): the record separator is changed globally and stays "Query="
    # after this sub returns, exactly as in the original code.
    $/ = "Query=";
    while (my $record = <$blast_fh>)
    {
        if ($record =~ /Database: (\S+)/)      { $name_database = $1; }
        if ($record =~ /Posted date: (.+?)\n/) { $posted        = $1; }
        if ($record =~ /BLASTP\s+(\S+)/)       { $version       = $1; }
        print {$out_fh} $record;
        push(@list_pdb, $record);
        if ($record =~ /\s*(.+?)\s/) { print {*STDERR} "\n$1 done"; }
    }
    close $blast_fh;
    close $out_fh;
    print {*STDERR} "\n";

    unless ($quiet =~ /on/i)
    {
        print {*STDOUT} "
             Version: BLASTP $version
             Database: $name_database
             Posted date: $posted\n\n";
    }

    shift(@list_pdb);
    return (@list_pdb);
}

#-----------------------------------------------------------------------------------------------------------------------------
# Parse the BLAST reports and build one tab-separated record per retained hit.
#   $list_pdb  : reference to the list of reports (one per query)
#   $locale, $distant : run mode flags; they are recomputed below from $gigablast and $distant
#   $method    : "pdbid" or "geneid"
#   $quiet     : "on" suppresses the version banner
#   $database  : database name used for the search
#   $gigablast : "yes" or "no"
# Unparsable reports and "No hits found" reports are written to webblast.log.
# Returns the records unsorted for "geneid"; for any other method they are
# sorted by decreasing value of the first "<number>%" field.
sub PARSING
{
    my ($list_pdb, $locale, $distant, $method, $quiet, $database, $gigablast) = @_;
    my @list_pdb        = @$list_pdb;
    my @result_not_sort = ();
    my $n               = 0;   # becomes 1 once the version banner has been printed

    open(my $log_fh, '>', 'webblast.log') or die "can not open webblast.log: $!\n";

    if ($gigablast =~ /^yes$/i) { $locale = 2; $distant = 0; }
    if ($gigablast =~ /^no$/i)  { $locale = 1; }
    if ($distant == 1)          { $locale = 0; }

    foreach my $pdb_result (@list_pdb)
    {
        my ($query, $length_query, $pdb_id);
        my $comp = 0;   # best bit score seen so far for this query (geneid only)

        if ($pdb_result =~ /No hits found/m) { print {$log_fh} $pdb_result; next; }

        $pdb_result =~ s/ALIGNMENTS//;
        # Kept from the original code: the separator is undefined for the rest of
        # the iteration (set to "\n" below for a remote search).
        local $/ = undef;
        # First element: header of the report; following ones: one per hit.
        my @intra_res = split(/(?=\n\n>)/s, $pdb_result);

        if ($distant == 1)
        {
            ($query, $length_query) = ($intra_res[0] =~ /Query=\s+(\S+)\s+Length=\s*(\d+)/sm);
            $/ = "\n";

            # The banner is printed once, so web_tempo.result is only read for
            # that single occasion instead of once per query.
            unless ($quiet =~ /on/i || $n > 0)
            {
                ++$n;
                my ($version_d, $database_d, $poste_d) = ('', '', '');
                open(my $web_fh, '<', 'web_tempo.result') or die "can not open web_tempo.result: $!\n";
                while (my $line = <$web_fh>)
                {
                    if ($line =~ /BLASTP\s+(\S+)/)        { $version_d  = $1; }
                    if ($line =~ /Database:\s+(.+?)$/)    { $database_d = $1; }
                    if ($line =~ /Posted date:\s*(.+?)$/) { $poste_d    = $1; last; }
                }
                close $web_fh;

                # A reported name containing a slash is replaced by the requested database name.
                $database_d = $database if ($database_d =~ m{/});
                print {*STDOUT} "
             Version: BLASTP $version_d
             Database: $database_d
             Posted date: $poste_d\n\n";
            }
        }
        else
        {
            ($query, $length_query) = ($intra_res[0] =~ /\s*(.+?)\s.+?\(([\d,]+) letters/sm);
        }

        shift(@intra_res) if (exists($intra_res[1]));

        foreach my $intra_res (@intra_res)   # look for the different results of the query
        {
            my ($aln_length, $identity) = ($intra_res =~ /^\sIdentities = \d+\/(\d+)\s\((.+?)\)/im);
            # RECOVER is only called when both lengths were parsed.
            my ($recouvrement, $gap) = (defined $aln_length && defined $length_query)
                                     ? RECOVER($intra_res, $aln_length, $length_query)
                                     : ('', '');
            my ($evalue) = ($intra_res =~ /Expect = (.+?)\s/im);
            my ($bits)   = ($intra_res =~ /Score =\s+([\d.]+)\s/im);

            # geneid: stop at the first hit whose score is lower than the best one.
            if ($method =~ /^geneid$/i)
            {
                if ($comp <= $bits) { $comp = $bits; }
                else                { last; }
            }

            # Undefined values are treated like empty ones.
            if (grep { !defined $_ || $_ eq '' } ($query, $length_query, $aln_length, $identity, $recouvrement, $gap))
            {
                print {$log_fh} " can't parse $pdb_result";
                next;
            }

            if ($method =~ /^pdbid$/i)
            {
                if ($locale == 1) { ($pdb_id) = ($intra_res =~ /^>(.{6})/im);      $pdb_id =~ s/_//; $pdb_id = uc($pdb_id); }
                else              { ($pdb_id) = ($intra_res =~ /^>pdb\|(.{6})/im); $pdb_id =~ s/\|//; }

                push(@result_not_sort, ("$query\t$pdb_id\t$evalue\t$identity\t$recouvrement\t"));
            }
            elsif ($method =~ /^geneid$/i)
            {
                if ($database !~ /pdb/i && $database !~ /swiss/i && ($locale =~ /1|2/))
                {
                    while ($intra_res =~ />.*?(gb|prf|emb|sp|pir|tpe|ref|prf|dbj|ddbj|pdb)[\|]+([A-Za-z0-9_\.]+?)(\s|\|(.{1}))/sg)
                    {
                        my $databank = $1;
                        my $last     = $4;
                        my $refseq   = $2;
                        if ($databank eq 'pdb') { $refseq .= $last }
                        $refseq =~ s/\.\d+$//;   # drop the version suffix
                        push(@result_not_sort, "$query\t$refseq\t$identity\t$recouvrement\t$bits\t$evalue\t$databank");
                    }
                }
                elsif ($database =~ /pdb|pdbaa/i && ($locale == 1 || $locale == 2))
                {
                    my $refseq;
                    if ($locale == 1) { ($refseq) = ($intra_res =~ />(.*?)\s/);        $refseq =~ s/_//; }
                    else              { ($refseq) = ($intra_res =~ /^>pdb\|(.{6})/im); $refseq =~ s/\|//; }

                    unless ($refseq) { print {$log_fh} $intra_res; next; }
                    push(@result_not_sort, ("$query\t$refseq\t$identity\t$recouvrement\t$bits\t$evalue\tpdb"));
                }
                elsif ($distant == 1)
                {
                    my ($resul) = MULTI_EQUIVALENT($query, $identity, $recouvrement, $bits, $evalue, $intra_res);
                    push(@result_not_sort, $resul);
                }
                elsif ($database =~ /swiss/i)
                {
                    my ($refseq) = ($intra_res =~ />.*?sp\|(.+?)\|/);
                    unless ($refseq) { print {$log_fh} $pdb_result; next; }
                    $refseq =~ s/\.\d+$//;
                    # NOTE(review): unlike the other geneid records this one has no
                    # e-value column - confirm against the code that consumes it.
                    push(@result_not_sort, ("$query\t$refseq\t$identity\t$recouvrement\t$bits\tswiss_prot"));
                }
            }
            else { die; }
        }
    }
    close $log_fh;
    undef(@list_pdb);

    if ($method =~ /^geneid$/i) { return (@result_not_sort); }

    # Sort on the first "<number>%" field, highest first.
    my @result_sort =
        map  { $_->[1] }
        sort { $b->[0] <=> $a->[0] }
        map  { [/\t([\d.]+)%/, $_] }
        @result_not_sort;

    undef(@result_not_sort);
    return (@result_sort);
}

#-------------------------------------------------------------------------------------------------------------------

sub MULTI_EQUIVALENT
{
    # Build one tab-separated result row for every "<databank>|<accession>"
    # entry found after a ">" in the text of a BLAST hit.
    #
    # Arguments:
    #   $query        - query sequence name, copied into every row
    #   $identity     - identity value, copied into every row
    #   $recouvrement - coverage value, copied into every row
    #   $bits         - bit score, copied into every row
    #   $evalue       - e-value, copied into every row
    #   $intra_res    - hit text that is scanned for databank entries
    #
    # Returns the list of rows, each formatted as
    #   "query\taccession\tidentity\tcoverage\tbits\tevalue\tdatabank"
    # (an empty list when nothing matches).
    my($query,$identity,$recouvrement,$bits,$evalue,$intra_res)=@_;

    my @result=();
    while ($intra_res=~ />.*?(gb|prf|emb|sp|pir|tpe|ref|prf|dbj|ddbj|pdb)[\|]+([A-Za-z0-9_\.]+?)(\s|\|(.{1}))/g)
    {
        my($databank,$refseq,$last) = ($1,$2,$4);

        # For pdb entries the single character following the second "|"
        # (presumably the chain identifier) is glued to the accession.
        # $4 is undefined when the accession is followed directly by
        # whitespace, so only append it when it was actually captured.
        if ($databank eq 'pdb' && defined $last) { $refseq .= $last; }

        # Drop a trailing ".<digits>" version suffix from the accession.
        $refseq =~ s/\.\d+$//;

        push (@result,"$query\t$refseq\t$identity\t$recouvrement\t$bits\t$evalue\t$databank");
    }
    return (@result);
}
#--------------------------------------------------------------------------------------------------------------------
sub AFFICHAGE_REFSEQ_PARSING
{
    # Print the hits that pass the identity and coverage thresholds as a
    # table, followed by one ">name@databank__accession" line per hit.
    # The ">..." lines are additionally written to $out_file when given.
    #
    # Arguments:
    #   $result_sort       - array ref of tab-separated rows holding, in
    #                        order: sequence name, accession, identity
    #                        (optionally followed by "%"), coverage, bits,
    #                        e-value, databank
    #   $cover_tresh       - minimum coverage (numeric comparison)
    #   $identity_treshold - minimum identity (numeric comparison)
    #   $out_file          - optional output path; the file is overwritten
    #
    # Dies when $out_file cannot be opened or closed.
    # No meaningful return value.
    my($result_sort,$cover_tresh,$identity_treshold,$out_file)=@_;

    my @name_gid = ();
    my $afficher = "";
    my $entete   = sprintf("%-40s %-25s %-10s %-12s %-10s %-10s %-10s","Sequence Name","Accession number","Databank","%Identity","%Cover","BITS","Evalue");

    foreach my $row (@$result_sort)
    {
        my($seq_name,$refseq_name,$identiq,$cover,$bits,$evalue,$bank)= split("\t",$row);
        $evalue =~ s/,$//; #To remove an additional comment with new blast release (2.2.17)
        ($identiq)= split(/%/,$identiq);   # keep only the number in front of "%"

        next unless ($identiq >= $identity_treshold && $cover >= $cover_tresh);

        push (@name_gid,">$seq_name\@$bank\_\_$refseq_name\n");
        $afficher .= sprintf("%-40s %-25s %-10s %-12s %-10s %-10s %-10s ",$seq_name,$refseq_name,$bank,$identiq,$cover,$bits,$evalue);
        $afficher .= "\n";
    }

    # The table (header + rows) is only shown when at least one hit passed.
    if ($afficher) { print "\n$entete\n\n"; print $afficher; }

    if (@name_gid) { print {*STDOUT} "\n**********************************************************************\n\n"; }

    # Use a lexical handle that is opened, written and closed only when an
    # output file was requested; closing a never-opened handle is an error.
    if ($out_file)
    {
        open (my $out_fh, '>', $out_file) or die "can not open $out_file: $!";
        print {$out_fh} @name_gid;
        close ($out_fh) or die "can not close $out_file: $!";
    }
    print {*STDOUT} "\n", @name_gid;
    return;
}
#-------------------------------------------------------------------------------------------------------------

sub AFFICHAGE_PDB_PARSING
{
    # Print a table of the PDB hits that pass the identity and coverage
    # thresholds, followed by one ">name _P_ pdb_id" line per hit.
    # The ">..." lines are additionally written to $out_file when given.
    #
    # Arguments:
    #   $result_sort       - array ref of tab-separated rows holding, in
    #                        order: sequence name, PDB id, e-value, identity
    #                        (optionally followed by "%"), coverage
    #   $cover_tresh       - minimum coverage (numeric comparison)
    #   $identity_treshold - minimum identity (numeric comparison)
    #   $out_file          - optional output path; the file is overwritten
    #
    # Side effect: the PDB id of every accepted hit is pushed onto
    # @pdb_list, which is not declared here (file-level variable).
    #
    # Dies when $out_file cannot be opened or closed.
    # No meaningful return value.
    my($result_sort,$cover_tresh,$identity_treshold,$out_file)=@_;

    my @sortie = ();

    # The header is printed unconditionally, even when no hit passes.
    my $en_tete = sprintf("%-40s %-10s %-10s %-12s %-10s","Sequence Name","PDB_id","Evalue","Identity(%)","Cover(%)");
    print {*STDOUT} "\n\n", $en_tete, "\n\n";

    foreach my $row (@$result_sort)
    {
        my($seq_name,$pdb_name,$EValue,$identiq,$cover)= split("\t",$row);
        ($identiq)= split(/%/,$identiq);   # keep only the number in front of "%"

        next unless ($identiq >= $identity_treshold && $cover >= $cover_tresh);

        push (@pdb_list,$pdb_name);
        $EValue =~ s/,$//;   # drop a trailing comma left after the e-value

        my $afficher = sprintf("%-40s %-10s %-10s %-12s %-10s",$seq_name,$pdb_name,$EValue,$identiq,$cover);
        print {*STDOUT} $afficher, "\n";

        push (@sortie,">$seq_name _P_ $pdb_name\n");
    }

    print {*STDOUT} "\n**********************************************************************\n\n";

#-- OUTFILE /STDOUT
    # Use a lexical handle that is opened, written and closed only when an
    # output file was requested; closing a never-opened handle is an error.
    if ($out_file)
    {
        open (my $out_fh, '>', $out_file) or die "can not open $out_file: $!";
        print {$out_fh} @sortie;
        close ($out_fh) or die "can not close $out_file: $!";
    }
    print {*STDOUT} @sortie;
    return;
}
#-----------------------------------------------------------------------------------------------------------------------------------
sub PROFILE
{
    # Convert every BLAST result that has hits into a FASTA alignment file
    # named "<n>.profile" (n = 1, 2, ... counting only results with hits) by
    # running t_coffee's seq_reformat on it, then print one
    # ">name _R_ <n>.profile" line per converted result to STDOUT and, when
    # given, to $out_file.
    #
    # Arguments:
    #   $list_pdb - array ref of BLAST result texts; entries containing
    #               "No hits found" are skipped
    #   $out_file - optional output path; the file is overwritten
    #   $distant  - 1 when the query name has to be parsed from a
    #               "Query= ... Length" section, anything else for the
    #               "... (N letters" layout
    #
    # The scratch file "tempo_file_profile" is created in the current
    # directory and removed at the end.
    #
    # Dies when the query name cannot be parsed or when the scratch/output
    # file cannot be written. A failing t_coffee run is reported with a
    # warning only, as it was never fatal.
    # No meaningful return value.
    my($list_pdb,$out_file,$distant)=@_;

    my @sortie = ();
    my $i      = 0;

    # Open the output file once, up front, so an unwritable path is reported
    # before any external tool is run. An empty $out_file simply means
    # "STDOUT only".
    my $out_fh;
    if ($out_file)
    {
        open ($out_fh, '>', $out_file) or die "can not open $out_file: $!";
    }

    foreach my $pdb_result (@$list_pdb)
    {
        next if ($pdb_result =~ /No hits found/i);
        ++$i;

        my $name;
        if ($distant==1) { ($name) = ($pdb_result =~ /Query=\s*(.+?)Length/smi) or die "\nparse error in distant profile\n"; }
        else             { ($name) = ($pdb_result =~ /\s*(.+?)\(.*?letters/ism) or die "\nparse error in profile\n"; }

        # Strip the trailing whitespace of the name; when there is none the
        # pattern does not match, so fall back to the name as parsed.
        my($name1) = ($name =~ /(.+?)\s+$/);
        $name1 = $name unless defined $name1;

        # seq_reformat reads its input from a file: dump the result with the
        # "Query= " prefix put back in front of it.
        open (my $tmp_fh, '>', 'tempo_file_profile') or die "can not open tempo_file_profile: $!";
        print {$tmp_fh} "Query= $pdb_result";
        close ($tmp_fh) or die "can not close tempo_file_profile: $!";

        # Run t_coffee through a write pipe that is closed right away (the
        # tool gets an empty standard input); close() waits for it to end.
        my @cmd = ('t_coffee','-other_pg','seq_reformat','-input','blast_aln',
                   '-in','tempo_file_profile','-output','fasta_aln','-out',"${i}.profile");
        if (open (my $com, '|-', @cmd))
        {
            close ($com) or warn "t_coffee seq_reformat failed for ${i}.profile (status $?)\n";
        }
        else
        {
            warn "can not run t_coffee: $!\n";
        }

        push (@sortie,">$name1 _R_ ${i}.profile\n");
    }
    unlink("tempo_file_profile");

    print {*STDERR} "\n**********************************************************************\n\n";

#-- OUTFILE /STDOUT
    if ($out_fh)
    {
        print {$out_fh} @sortie;
        close ($out_fh) or die "can not close $out_file: $!";
    }
    print {*STDOUT} @sortie;
    return;
}

#--------------------------------------------------------------------------------------------------------------------------------
sub ORGN
{
    # Check an organism name against the list of accepted organisms and
    # return it in URL form (every space replaced by "+").
    #
    # Argument:
    #   $organism - organism name; its first "_" is read as a space
    #               (e.g. "Homo_sapiens")
    #
    # When the name is not in the list, a message is printed to STDERR and
    # control is handed to HELP().
    my($organism)=@_;

    my %is_known = map { $_ => '1' } (
        'Homo sapiens',
        'Bos taurus',
        'Gallus gallus',
        'Viruses',
        'Bacteria',
        'Eukaryota',
        'Mammalia',
        'Vertebrata',
        'All organisms',
        'Fungi',
        'Primates',
        'Archaea',
        'Arabidopsis thaliana',
        'Caenorhabditis elegans',
        'Escherichia coli',
        'Mus musculus',
        'Drosophila melanogaster',
    );

    # Only the first underscore is converted; no accepted name has more
    # than one space.
    $organism =~ s/_/ /;

    if (!exists $is_known{$organism})
    {
        print {*STDERR} "organism not valid or syntax error, replace space by \"_\" \n";
        return &HELP();
    }

    $organism =~ s/ /+/g;
    return $organism;
}
#------------------------------------------------------------------------------------------------------------------------------------

sub LIST_ORGA
{
    # Return the accepted organism names in command-line form, i.e. with
    # the first space of each name replaced by "_". The names come back in
    # hash order, which is not a fixed order.
    my %is_known = map { $_ => '1' } (
        'Homo sapiens',
        'Bos taurus',
        'Gallus gallus',
        'Viruses',
        'Bacteria',
        'Eukaryota',
        'Mammalia',
        'Vertebrata',
        'All organisms',
        'Fungi',
        'Primates',
        'Archaea',
        'Arabidopsis thaliana',
        'Caenorhabditis elegans',
        'Escherichia coli',
        'Mus musculus',
        'Drosophila melanogaster',
    );

    my @names;
    for my $organism (keys %is_known)
    {
        (my $label = $organism) =~ s/ /_/;
        push @names, $label;
    }
    return @names;
}
811