Commit 6606ade4 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Lots of cleaning

parent 0c7f1145
......@@ -341,10 +341,9 @@ for(my $r=0; $r<=$#original_names; $r++){
@nt_GIs = "My_Seq-$$";
}
}
# Get Nt sequence through prot acc -> prot gi -> nt linked gi -> nt acc
else{
#Prot ACC -> PUID
# Prot ACC -> PUID
my $protGI = blastPAcc2PGI($equivalent_blast_hits[$qq]);
if ( $protGI eq '' ){
print "\tNo protein (prot GI) link found for $equivalent_blast_hits[$qq] in $fasta_header\n\n";
......@@ -352,7 +351,7 @@ for(my $r=0; $r<=$#original_names; $r++){
next HIT_LINK;
}
#PUID -> nt GIs & GeneID
# PUID -> nt GIs & GeneID
my ($ntGIs, $geneID) = protGI2NTGIs($protGI, $equivalent_blast_hits[$qq]);
if ( $ntGIs eq '' && $geneID eq '' ){
print "\tNo nucleotide (nt GIs) link found for $equivalent_blast_hits[$qq] in $fasta_header\n\n";
......@@ -362,13 +361,13 @@ for(my $r=0; $r<=$#original_names; $r++){
print " linked with [$ntGIs $geneID]\n";
#Get transcript and contig seq
@nt_GIs = split(/,/, $ntGIs);
# Get transcript and contig seq
if ( $ntGIs ne '' ){
@nt_GIs = split(/,/, $ntGIs);
downloadSeqFromGIs($cache, '', '', @nt_GIs);
}
#GeneID -> Chr
# GeneID -> Chr
if ( $geneID ne '' ){
my @geneIDs = split(/,/, $geneID);
while(<@geneIDs>){
......@@ -376,20 +375,20 @@ for(my $r=0; $r<=$#original_names; $r++){
my ($chr, $amont, $aval) = geneID2Chr($geneID, $equivalent_blast_hits[$qq]);
print "\n\tNo gene locus found for $equivalent_blast_hits[$qq] with $geneID in $fasta_header\n\n" if ($chr eq '' and $geneID ne '');
#Get Chr seq
# Get Chr seq
if ( $chr ne '' ){
download_seq($cache, $amont, $aval, $chr);
$intronStep = 1; #FIXME: useful ???
$intronStep = 1;
@nt_GIs = ("$chr:$amont-$aval", @nt_GIs);
unshift(@nt_GIs, "$chr:$amont-$aval");
}
}
}
}
#Align Nt seq with our protein query seq
my ($POS, $BOJ) = runExonerate($exonerate_exe, $cache, $date, $r, $original_seq[$r], $fasta_header, $equivalent_blast_hits[$qq], $r, @nt_GIs);
# Align Nt seq with our protein query seq
my ($POS, $BOJ) = runExonerate($cache, $date, $r, $original_seq[$r], $fasta_header, @nt_GIs);
my ($POSresult, $bestNucleotide) = prepareResults4CDS($POS, $equivalent_blast_hits[$qq], $original_seq[$r], $fasta_header);
$resultPOS .= $POSresult if ( $bestNucleotide ne '' && $POSresult ne '' && $resultPOS !~ /_G_$bestNucleotide/ );
......@@ -402,7 +401,6 @@ for(my $r=0; $r<=$#original_names; $r++){
$nameLess = $bestNucleotide if ( %$BOJ eq 0 && $intronStep==0 && ($bestNucleotide =~ /^N[CTWZG]_/ || $bestNucleotide =~ /^AC_/) );
my ($BOJresult, $bestGenomic) = prepareResults4BOJ($BOJ, $equivalent_blast_hits[$qq], $original_seq[$r], $fasta_header, $intronLess, $nameLess);
$resultBOJ .= $BOJresult if ( $bestGenomic ne '' && $BOJresult ne '' && $resultBOJ !~ /_G_$bestGenomic[ \-]/ );
# print " $intronStep $POS->{0} [@intronStatus] {$BOJresult} $bestNucleotide - $bestGenomic \t $BOJ->{0}\n";
}
......@@ -429,7 +427,7 @@ for(my $r=0; $r<=$#original_names; $r++){
}
else {
createCDSOutputFile($resultPOS, $original_seq[$r], $fasta_header);
createBOJOutputFile($resultBOJ, ${original_seq[$r]});
createBOJOutputFile($resultBOJ, $original_seq[$r]);
}
......@@ -440,14 +438,14 @@ close $LOG;
unlink "$date.log" if ( -z "$date.log" );
checkAndCleanStderrFiles("$cache/${date}_ExonerateError") if ( -e "$cache/${date}_ExonerateError" );
checkAndCleanStderrFiles("$cache/$date.ExonerateError") if ( -e "$cache/$date.ExonerateError" );
checkOutputFiles();
unlink("$msa") if ( $converted==1 );
unlink 'blast_result.txt';
my @tmpFiles = glob($cache."/${date}*");
if ( exists($tmpFiles[0]) ){
mkdir("$cache/${date}_TMP");
move("$_", "$cache/${date}_TMP") while(<@tmpFiles>);
mkdir("$cache/$date.TMP");
move("$_", "$cache/$date.TMP") while(<@tmpFiles>);
}
......@@ -455,19 +453,19 @@ if ( exists($tmpFiles[0]) ){
#STDOUT Log output for our server
if ( -s "${originalMSA}.cds" || -s "${originalMSA}.out"){
if ( -s "$originalMSA.cds" || -s "$originalMSA.out" ){
print "\n\nTERMINATION STATUS: SUCCESS\n";
print "OUTPUT RESULTS\n";
print " #### File Type= MSA Format= fasta_CDS Name= ${originalMSA}.cds\n"
if ( -s "${originalMSA}.cds");
print " #### File Type= MSA Format= fasta_CDS+Query Name= ${originalMSA}.cdsP\n"
if ( -s "${originalMSA}.cdsP");
print " #### File Type= MSA Format= colored_fasta_CDS Name= ${originalMSA}.cdsP.html\n"
if ( Views::Html("${originalMSA}.cdsP") && -s "${originalMSA}.cdsP.html");
print " #### File Type= MSA Format= Rejected_seq Name= ${originalMSA}.out\n"
if ( -s "${originalMSA}.out");
print " #### File Type= MSA Format= fasta_Exon-Boundaries Name= ${originalMSA}.boj\n"
if ( -s "${originalMSA}.boj" && -s "${originalMSA}.cds" && $hideBOJ==0);
print " #### File Type= MSA Format= fasta_CDS Name= $originalMSA.cds\n"
if ( -s "$originalMSA.cds");
print " #### File Type= MSA Format= fasta_CDS+Query Name= $originalMSA.cdsP\n"
if ( -s "$originalMSA.cdsP");
print " #### File Type= MSA Format= colored_fasta_CDS Name= $originalMSA.cdsP.html\n"
if ( Views::Html("$originalMSA.cdsP") && -s "$originalMSA.cdsP.html");
print " #### File Type= MSA Format= Rejected_seq Name= $originalMSA.out\n"
if ( -s "$originalMSA.out");
print " #### File Type= MSA Format= fasta_Exon-Boundaries Name= $originalMSA.boj\n"
if ( -s "$originalMSA.boj" && -s "$originalMSA.cds" && $hideBOJ==0);
}
else{
failure();
......@@ -642,7 +640,7 @@ sub cacheManagement{
if ( $cache eq 'empty' ){
unlink <$cacheDir/*.fas>;
unlink <$cacheDir/*_ExonerateError>;
unlink <$cacheDir/*.ExonerateError>;
unlink("$cacheDir/webblast.log", "$cacheDir/web_tempo.result", "$cacheDir/debug.tempo");
print "\n\tcache directory [$cacheDir] is empty\n\n";
exit 0;
......@@ -1005,7 +1003,6 @@ sub download_seq{
if ( $pacc2puid !~ /^[NAX][CGTSWZMR]_/ ){ #Not RefSeq acc
#pacc = primary acc NOT prot acc ! #265666 -> S55551
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$pacc2puid = $1;
......@@ -1042,40 +1039,42 @@ sub download_seq{
###################### Exonerate #######################
sub runExonerate{
my ($exonerate_exe, $cache, $date, $order, $original, $fasta_header, $blastHit, $r, @gis) = @_;
my ($cache, $date, $order, $original, $fasta_header, @gis) = @_;
#Prot to align to nucleotidic sequence(s)
open(my $FASTA, '>', "$cache/${date}_${order}.fas");
# Prot to align to nucleotidic sequence(s)
open(my $FASTA, '>', "$cache/$date.$order.fas");
my $seq2aln = $original;
$seq2aln =~ s/-//g;
$seq2aln .= '*'; #To add stop codon triplet, if any, at the end of the protein seq
#* is aligned only if there is one stop codon similar to the last position of protein
#If not, * is excluded from the alignment because it is an external border misalignment
$seq2aln =~ s{-}{}g;
$seq2aln .= '*' if ( $seq2aln !~ /\*$/ ); # Add artificially stop codon triplet at the end of the protein seq
# * is aligned only if there is one stop codon similar to the last position of protein
# If not, * is excluded from the alignment because of external border misalignment
print {$FASTA} ">$order\n$seq2aln\n";
close $FASTA;
#Warning: Abusive threshold between genomic and transcript sequence lengths. Arbitrarily fixed at 5000 bp here !
# To manage short DNA seq, everything pass through --model protein2genome but with --exhaustive for short ones
my %best_pos;
my %bestBOJ;
my %best_boj;
my @bestAln;
RUN_EXONERATE:
for(my $b=0; $b<=$#gis; $b++){
if ( -s "$cache/${gis[$b]}.fas" > 5000 ){
system("$exonerate_exe --showquerygff --showtargetgff --model protein2genome --bestn 1 --percent 75 -q $cache/${date}_${order}.fas -t $cache/${gis[$b]}.fas >$cache/${date}_${order}.exon 2>>$cache/${date}_ExonerateError"); #Exonerate prot - genomic or long RNA
# Warning: Arbitrarily threshold between genomic and transcript sequence lengths, fixed at 5000 bp here
# To manage short DNA seq, everything pass through --model protein2genome but with --exhaustive for short ones
if ( -s "$cache/$gis[$b].fas" > 5000 ){
system("$exonerate_exe --showquerygff --showtargetgff --model protein2genome --bestn 1 --percent 75 -q $cache/$date.$order.fas -t $cache/$gis[$b].fas >$cache/$date.$order.exon 2>>$cache/$date.ExonerateError"); #Exonerate prot - genomic or long RNA
}
else{
system("$exonerate_exe --showquerygff --showtargetgff --model protein2genome --bestn 1 --percent 75 --exhaustive -q $cache/${date}_${order}.fas -t $cache/${gis[$b]}.fas >$cache/${date}_${order}.exon 2>>$cache/${date}_ExonerateError"); #Exonerate prot - short genomic or RNA
system("$exonerate_exe --showquerygff --showtargetgff --model protein2genome --bestn 1 --percent 75 --exhaustive -q $cache/$date.$order.fas -t $cache/$gis[$b].fas >$cache/$date.$order.exon 2>>$cache/$date.ExonerateError"); #Exonerate prot - short genomic or RNA
}
my $targetNT = $gis[$b];
$targetNT = "gi\|".$targetNT."\|" if ( $gis[$b] =~ /^\d+$/ );
print {*STDERR} "\n@@ -> $b ... $targetNT\n";
$targetNT = 'gi|'.$targetNT.'|' if ( $gis[$b] =~ /^\d+$/ );
print "\n@@ -> $b ... $targetNT\n" if ( $debug );
unlink("$cache/Error:") if ( $tmp == 0 || -z "$cache/Error:" ); #Remove exonerate Error file is it fails to align protein and nucleotide sequences
if ( !-e "$cache/${date}_${order}.exon" || (-s "$cache/${date}_${order}.exon") < 520 ){
if ( !-e "$cache/$date.$order.exon" || (-s "$cache/$date.$order.exon") < 520 ){
print "\tProtein-Nucleotide alignment has FAILED for $targetNT with $fasta_header\n";
next RUN_EXONERATE;
}
......@@ -1083,22 +1082,22 @@ sub runExonerate{
#Parse Exonerate output
my ($posiTions, $posBOJ) = Exonerate::parser("$cache/${date}_${order}.exon", $debug);
my ($posiTions, $posBOJ) = Exonerate::parser("$cache/$date.$order.exon", $debug);
%best_pos = testPositions($posiTions, $gis[$b], %best_pos);
%bestBOJ = testBOJ($posBOJ, $gis[$b], %bestBOJ);
%best_boj = testBOJ($posBOJ, $gis[$b], %best_boj);
my $whichGI = $gis[$b];
$whichGI = 'gi|'.$whichGI.'|' if ( $whichGI =~ /^\d+$/ );
print {*STDERR} "\tNo exon-intron structure found for $whichGI with $fasta_header\n" if ( %$posBOJ eq 0 );
}
#Remove temp files from exonerate and query sequence for exonerate
unlink("$cache/${date}_${order}.fas", "$cache/${date}_${order}.exon");# if ($tmp == 0);
unlink("$cache/$date.$order.fas", "$cache/$date.$order.exon");# if ($tmp == 0);
my $refBestPOS = \%best_pos; #Use perl references to send two variables (mem address for the 2 hashes) instead of 2 hashes,
my $refBestBOJ = \%bestBOJ; #and easily, and properly, get them in the main script !
my $refBestBOJ = \%best_boj; #and easily, and properly, get them in the main script !
return($refBestPOS, $refBestBOJ);
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment