Commit 3af6fb03 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

No commit message

No commit message
parent 316f2388
This diff is collapsed.
This diff is collapsed.
#
#Version: 2.2.5
#OS: Linux
#Author: Sebastien Moretti
#E-mail: moretti@igs.cnrs-mrs.fr
#Web: http://www.igs.cnrs-mrs.fr/
#
History of PACMAN enhancements:
2.2.5
Best management when only one gene match and alignment fails:
$intronStep==1 and $POS->{0} doesn't exist.
Best management of fake downloads:
- > without sequences
- sequences with Error: or <>
- ...
2.2.4
Change intronless behavior if best nucleotide return
a GI number.
2.2.3
Change gene locus coordinates to zero if they become
negative with +/- 5000 expansion.
2.2.2
Manage boj output when there are several intronless genes
for one query.
2.2.1
Add a $lim variable to be able to fix max number of query
sequences as an input option.
Manage several geneIDs ?!?
2.2.0
New Release:
Better memory management
Code splits into more functions
Clean code
Write results when they are available, no more only at the exit
1.2.8
Add an option to keep abnormal temporary files.
Increase size threshold for result file of exonerate, i.e.
no result or result.
1.2.6
Manage fake BLASTP acc when they have space, point or pipe
characters.
Check and correct perl POD text format.
Enhance fasta header for boj output like cds output.
1.2.4
Add an option to hide BOJ output for the IGS web server.
Output two CDS files: a standard one, and one with query prot
interleaved.
1.2.2
Manage multiple runs of PACMAN and the co-occurance query name:
0.fas and 0.fas => now use temp name to limit this.
And temp names are mix of current date-time and a random nbr.
1.2.0
Remove duplicated lines when a query, with multiple blastp hits
returns a single nucleotide for all hits. So, keeps only one.
Debug mode for BOJ, now available as a result file.
Enhance output for log files.
RefSeq -> 100% filters to avoid mis-species attribution
then NR -> 95%.
1.0.4
Add a webblast step againt nr if refseq webblast returns no
hit: RefSeq, elsif nr, else NO HIT
1.0.3
Add an option to add the original peptide query beneath the
back-translated nt seq
Add the b,o,j identification in the Exonerate parser module
1.0.2
New header format in the cds output
1.0.1
Add standard log files for the web server
Manage multiple equal blast best hits
Mix PACMAN and Exoset, specially in the Exonerate parser
0.99
Process the alignment between every nucleotide sequence found and use
the best one (transcript and genomic sequences)
0.98
Add a 'old' argument for the cache option to remove only 15 days
old files from the cache directory
Hide webblast parameters when it runs but the first one
Manage return lines from windows '^M'
Remove all the non-gap and non-alphabetic characters into the query
sequences
0.97
Create a more robust 'cache directory' process
Add triplet to sure mismatches, i.e. Met and Trp
0.96
Add a second result file, as library, with storage of accession
numbers resulting from blast query and prot-nt correspondances.
0.95 - 24 oct 2005
Correct hash table variable define as semi-local, and not strictly as
local, in loci_from_Exonerate.pm because some value were kept between
module executions due to misplaced brackets.
Clean loci_from_Exonerate.pm file and remove usage of strand, exonup
list and genomic length because they are unnecessary here.
0.9 - 20 oct 2005
Perldoc
19 oct 2005
Addition of a test to avoid to download the same file if
it has ever been downloaded in the past half day.
Addition of stop codon triplet management
Addition of a process to select the best alignment if there
are several nt sequences of the same category for a query
18 oct 2005
Get genomic and transcript acc, sort them to get only the
best category of sequences:
transcripts > chromosomes > contigs > bac,cosmid,... for RefSeqN
sort by size for other acc
17 oct 2005
Addition of a version option and variable
#!/usr/bin/env perl
use strict;
use warnings;
use diagnostics;
my $infile = $ARGV[0] or die "\n\tYou must specify a fasta file to process\n\n";
open(FILE,"$infile") if ( -e "$infile" );
my $seq='';
my $flag=0;
while(<FILE>){
if ($_ =~ /^>/){
print $_ if ($flag==0);
print "\n$_" if ($flag>0);
$flag++;
}
else {
$seq=$_;
chomp($seq);
$seq =~ s/ //g;
$seq =~ s/\d//g;
$seq =~ s/(.)/_${1}_/g;
print $seq;
}
}
close FILE;
print "\n";
exit;
#File loci_from_Exonerate.pm
package loci_from_Exonerate;
sub parser{
my ($infile)=@_;
my %pos_aln;
my %pos_boj;
open(MAP,"$infile") if ( -e "$infile" ) or die "$!\n\n";
my $flag=0;
my (@query,@match,@target,@genomic);
my ($Match,$Genom,$Query)=('','','');
my ($up,$down);
my $margeL=0;
my (@cdsUp);
my $seuil=0; #To limit processing if matches are too bad
while(<MAP>){
if ($flag==1 and $margeL==0 and $_ =~ /^( +\d+ : )[\w\-\{\} \>\<\*]+ : +\d+$/){ #\* for stop codon management
$margeL=length($1);
}
if ($flag==0 and $_ =~ /^ +Target range: \d+ -> \d+/){
$flag=1;
}
elsif ($_ =~ /^vulgar:/){
$flag=8;
}
elsif ( $_ =~ /^\# --- END OF GFF DUMP ---/){
last;
}
elsif ($flag==1 and $_ =~ /^ +\d+ : ([\w\-\{\} \>\<\*]+) : +\d+/){
@query=(@query,$_);
$Query=$Query.$1;
$flag++;
}
elsif ($flag==2 and $_ =~ /^ +[\|\+\-\{\}\dbp\.\!\:\#]+/){
@match=(@match,$_);
$_ =~ s/^.{$margeL}//;
$Match=$Match.$_;
chomp($Match);
$flag++;
}
elsif ($flag==3 and $_ =~ /^ +[\w\-\{\}\*\#]+/){
@target=(@target,$_);
$flag++;
}
elsif ($flag==4 and $_ =~ /^ +(\d+) : ([\w\-\{\}\.]+) : +(\d+)/){
@genomic=(@genomic,$_);
$up=$1;
$Genom=$Genom.$2;
$down=$3;
$flag=1;
}
elsif ($flag==8 and $_ =~ /exonerate:protein2[a-z]+\tsimilarity\t\d+\t/){
my $similarity_query=$_;
chomp($similarity_query);
$similarity_query =~ s/^[^\;]+\; //;
while($similarity_query =~ m/^ ?Align (\d+) /){
@cdsUp=(@cdsUp,$1);
$similarity_query =~ s/^ ?Align \d+ \d+ \d+ \;//;
}
$flag=9;
}
}
close MAP;
print STDERR "[@cdsUp]\n";
my $drap=0;
my $exonNbr=0;
my ($locus,$lieu)=(0,0);
my $frameIntron=0;
my $boj='';
my $pep_pos='';
my $accolade=0;
for(my $i=0; $i<length($Match); $i++){
my $matched_symbol=substr($Match,$i,1);
if ($drap==3 and $matched_symbol =~ /^[\+\-]$/){
$drap=0;
$locus=0;
$exonNbr++;
print STDERR $matched_symbol, substr($Genom,$i,1), "\t", substr($Query,$i,1), "\n";
}
elsif ($matched_symbol =~ /^[\+\-]$/){
$drap++;
print STDERR $matched_symbol, substr($Genom,$i,1), "\t", substr($Query,$i,1), "\n";
}
# elsif ($matched_symbol =~ /[ \|\.\!\:]/ and substr($Genom,$i,1) !~ /[\{\}\.a-z\-]/){
elsif ($matched_symbol =~ /[ \|\.\!\:\#]/ and substr($Genom,$i,1) !~ /[\{\}\.a-z]/){
my $peptide_pos='';
$peptide_pos=$cdsUp[0]+$lieu if (substr($Query,$i,1) =~ /[A-Z\*]/);
$pep_pos=$cdsUp[0]+$lieu if (substr($Query,$i,1) =~ /[A-Z\*]/);
$frameIntron=1 if (substr($Query,$i,1) =~ /[A-Z\*]/ and $frameIntron==-1);
$frameIntron++ if (substr($Query,$i,1) =~ /[a-z]/ and $frameIntron>=1);
$lieu++ if (substr($Query,$i,1) =~ /[A-Z\*]/);
my $line=$matched_symbol.substr($Genom,$i,1)."\t".substr($Query,$i,1)."\t".$peptide_pos;
#Et les mi-mismatches ?
if ($matched_symbol eq '|' and $peptide_pos =~ /^\d+$/){
my $triplet=substr($Genom,$i,1);
my $j=0;
if (substr($Query,$i,1) ne '*'){
while(length($triplet) <3){
$j++;
$triplet=$triplet.substr($Genom,($i+$j),1) if (substr($Genom,($i+$j),1) =~ /[A-Z]/);
}
}
elsif (substr($Query,$i,1) eq '*'){
while(length($triplet) <3){
$j++;
$triplet=$triplet.substr($Genom,($i+$j),1) if (substr($Genom,($i+$j),1) =~ /[A-Z]/);
}
$i=$i+2;
}
%pos_aln=(%pos_aln, $peptide_pos => $triplet);
}
my $shift='';
$shift="\t$frameIntron" if ($frameIntron>0);
print STDERR $line,"$shift\n";
$locus++;
}
elsif ($matched_symbol =~ /\{/){
print STDERR $matched_symbol, substr($Genom,$i,1), "\t", substr($Query,$i,1), "\n";
$frameIntron=-1;
$accolade++;
}
else{
print STDERR $matched_symbol, substr($Genom,$i,1), "\t", substr($Query,$i,1), "\n";
$lieu++ if (substr($Query,$i,1) =~ /U/);
if (substr($Query,$i,1) =~ /\}/){
if ($frameIntron==1){
$boj='o';
%pos_boj=(%pos_boj,$pep_pos => $boj);
}
elsif ($frameIntron==2){
$boj='j';
%pos_boj=(%pos_boj,$pep_pos => $boj);
}
$frameIntron=-2 if ($accolade==1);
if ($accolade==2){
$frameIntron=0;
$accolade=0;
}
$boj='';
}
elsif ($matched_symbol =~ /b/ and $frameIntron==0){
$boj='b';
%pos_boj=(%pos_boj,($pep_pos+1) => $boj);
}
if ($matched_symbol =~ /p/){
$frameIntron=0;
$boj='';
}
}
}
my $refpos_aln=\%pos_aln; #Use perl references to send two variables (mem address for the 2 hashes) instead of 2 hashes,
my $refpos_boj=\%pos_boj; #and easily, and properly, get them in the main script !
# return(%pos_aln);
return($refpos_aln,$refpos_boj);
}
1;
=head2 sub parser
=over
=item Parse Exonerate output to get only exact matching positions
=item my ($exonerate_output_file)=@_;
=back
=cut
#File output_checker.pm
package output_checker;
sub translate{
my ($infile)=@_;
my $outfile;
return($outfile);
}
sub compare{
}
1;
=head2 sub translate
=over 4
=item Translate CDS obtained from ProtoGene to check
=item matches between query and CDS.
=item my ($cds_from_query)=@_;
=back
=cut
#!/usr/bin/env perl
use strict;
use warnings;
use diagnostics;
#use HTTP::GHTTP qw/:methods/;
use CGI;
use URI::Escape;
use LWP::UserAgent;
my $ua = LWP::UserAgent->new;
use HTTP::Request::Common qw(POST);
use Getopt::Long ;
my ($program,$evalue,$vvalue,$dbname,$inputfile,$outputmethod,$filter,$html)=('','1.e-10','20','','',0,'on','off'); #Default values
my $ret = GetOptions( 'p=s' => \$program,
'e=s' => \$evalue,
'v=s' => \$vvalue,
'd=s' => \$dbname,
'i=s' => \$inputfile,
'm=i' => \$outputmethod,
'F=s' => \$filter,
'T=s' , \$html );
if( $program eq ''){
&Usage();
exit(0);
}
my $dbtype='NucleotideDatabases';
$dbtype = 'ProteinDatabases' if( $program eq 'blastp' || $program eq 'blastx' );
if( $filter eq 'T' or $filter eq 'on'){
$filter = 'on';
}
else {
$filter = 'off';
}
$html = 'on' if( $html eq 'T');
# Build Command script
my @command;
# Add program
push ( @command , "Program=$program" ) ;
# Add database
push( @command , "$dbtype=$dbname" ) ;
push( @command , "evalue=$evalue" ) ;
push( @command , "vvalue=$vvalue" ) ;
push( @command , "filter=$filter" ) ;
push( @command , "outputmethod=$outputmethod" ) ;
push( @command , "html=$html" ) ;
push( @command , "blastmethod=new" ) ;
push( @command , "raw=on" ) ;
# Read sequence
my $sequence='';
if( $inputfile eq ''){
while( <STDIN> ){
if(/^>/ ){
&RunBlast() if( $sequence ne '' );
$sequence = $_ ;
}
else {
$sequence .= $_ ;
}
}
&RunBlast();
}
elsif ($inputfile ne ''){
open(INFILE,"$inputfile") if (-e "$inputfile");
while(<INFILE>){
if(/^>/){
&RunBlast() if( $sequence ne '' );
$sequence = $_ ;
}
else {
# chomp();
$sequence .= $_ ;
}
}
close INFILE;
&RunBlast();
}
else{
&Usage();
exit(0);
}
sub RunBlast{
my $mybody= join("&" , (@command , 'sequence='.uri_escape($sequence) )) ;
# print $mybody ;
# my $r = HTTP::GHTTP->new();
# $r->set_uri("http://www.igs.cnrs-mrs.fr/adele/~database/blast.cgi");
# $r->set_uri("http://www.igs.cnrs-mrs.fr/adele/~database/newblast.cgi");
# $r->set_uri("http://www.igs.cnrs-mrs.fr/adele/~database/remoteblast.cgi");
# $r->set_type(METHOD_POST);
# $r->set_body($mybody);
# $r->process_request;
# $r =~ s/<[^<>]+>//g;
# print $r->get_body();
# my $r = HTTP::Request->new($method, $uri, $header,$mybody);# $content);
my $uri="http://www.igs.cnrs-mrs.fr/adele/~database/remoteblast.cgi";
my $method='POST';
my $r = HTTP::Request->new($method, $uri);
$r -> content_type('application/x-www-form-urlencoded'); # = $header
$r -> content($mybody);
my $response = $ua->request($r);
my $print= $response->content;
$print=~s/<[^<>]+>//g;
print $print;
}
sub Usage{
print STDERR "
$0 -d refprot -p blastp -i fasta_file
Options disponibles:
-i input multifasta file
-p blast program
-F filtre
-e evalue max
-m ouput format
-v,b controlent le nombre de hits
-d database; une parmi :
acinbactdna
bactorfs
cgvirus
cog
genpept
human
humest
keggnuc
keggpep
mimi
mimifinal
mimiorfs
mouse
mtimonensis
nr
pdbaa
refprot
sprot
sptr
sptr_nr_ncbi_v => swissprot + trembl avec entete taxonomique
targetdb
uniprot_sprot
uniprot_sptr
virus
eto
";
exit;
}
This diff is collapsed.
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment