Commit 5788c595 authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

Simplify code + start using LWP instead of wget

parent 7364ef64
......@@ -340,6 +340,7 @@ for(my $r=0; $r<=$#original_names; $r++){
@nt_GIs = "My_Seq-$$";
}
}
# Get Nt sequence through prot acc -> prot gi -> nt linked gi -> nt acc
else{
#Prot ACC -> PUID
......@@ -349,7 +350,6 @@ for(my $r=0; $r<=$#original_names; $r++){
@failureStatus = ('PUI_unavailable', $equivalent_blast_hits[$qq]);
next HIT_LINK;
}
# print "\t$protGI\n";
#PUID -> nt GIs & GeneID
my ($ntGIs, $geneID) = protGI2NTGIs($protGI, $equivalent_blast_hits[$qq]);
......@@ -358,7 +358,6 @@ for(my $r=0; $r<=$#original_names; $r++){
@failureStatus = ('No_nt_link', $equivalent_blast_hits[$qq]);
next HIT_LINK;
}
#TODO: remove redundancy here !!!
print " linked with [$ntGIs $geneID]\n";
......@@ -829,25 +828,12 @@ sub blastPAcc2PGI{
my ($blastHit) = @_;
my $protGI = '';
my $count = 0;
GET_PUI:
for(my $rep=0;$rep <= 4; $rep++){
$count++;
# system("wget -q -O $cache/${date}_${blastHit}protGi.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=${blastHit}[pacc]&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
system("wget -q -O $cache/${date}_${blastHit}protGi.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=${blastHit}&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
open(my $GIP, '<', "$cache/${date}_${blastHit}protGi.tmp");
PROT_GI:
while(<$GIP>){
if ( $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
$protGI = $1;
last PROT_GI; #OK if only one PUID
}
}
close $GIP;
$rep = $rep-15 if ( -z "$cache/${date}_${blastHit}protGi.tmp" && $count==1 );
last GET_PUI if ( $protGI =~ /^\d+$/ );
#FIXME: should be ${blastHit}[pacc] but something is broken at NCBI
my $content = get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
die "Problem with NCBI eutils, please try again later/n" unless defined $content;
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$protGI = $1;
}
unlink("$cache/${date}_${blastHit}protGi.tmp") if ( $tmp == 0 || $protGI ne '' );
return($protGI);
}
......@@ -871,18 +857,16 @@ sub protGI2NTGIs{
if ( $_ =~ /<LinkName>protein_nuc[a-z]+<\/LinkName>/ ){ #for nuccleotide and nuccore
$flag = 1;
}
elsif ( $_ =~ /\<LinkName\>protein_gene\<\/LinkName\>/ ){
elsif ( $_ =~ /<LinkName>protein_gene<\/LinkName>/ ){
$flag = 2;
}
elsif ( $flag==1 && $_ =~ /^.*\<Id\>(\d+)\<\/Id\>.*$/ ){
elsif ( $flag==1 && $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
my $match = $1;
$ntGIs .= ",$match" if ( $ntGIs ne '' && $ntGIs !~ /,$match,/ && $ntGIs !~ /^$match,/ );
$ntGIs = $match if ( $ntGIs eq '' );
$ntGIs .= "$match,$match,";
}
elsif ( $flag==2 && $_ =~ /^.*\<Id\>(\d+)\<\/Id\>.*$/ ){
elsif ( $flag==2 && $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
my $match = $1;
$geneID .= ",$match" if ( $geneID ne '' && $geneID !~ /,$match,/ && $geneID !~ /^$match,/ );
$geneID = $match if ( $geneID eq '' );
$geneID .= "$match,$match,";
}
}
close $GIN;
......@@ -891,6 +875,14 @@ sub protGI2NTGIs{
}
unlink("$cache/${date}_${blastHit}nucleo.tmp") if ( $tmp==0 || ($ntGIs ne '' || $geneID ne '') );
#Remove redundancy if any
chop $ntGIs;
my %hash_NT = split(',', $ntGIs);
$ntGIs = join(',', keys(%hash_NT) );
chop $geneID;
my %hash_GE = split(',', $geneID);
$geneID = join(',', keys(%hash_GE) );
return($ntGIs, $geneID);
}
......@@ -1031,29 +1023,29 @@ sub download_seq{
DOWNL_SEQ:
for(my $a=0; $a<=$#acc; $a++){
my $pacc2puid = $acc[$a];
if ( $pacc2puid !~ /^[NAX][CGTSWZMR]_/ ){ #Not RefSeq acc
#pacc = primary acc NOT prot acc ! #265666 -> S55551
system("wget -q -O $cache/${date}_${acc[$a]}.gui 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?&db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch'");
open(my $GUI, '<', "$cache/${date}_${acc[$a]}.gui");
DOWNL:
while(<$GUI>){
if ( $_ =~ /\<Id\>(\d+)\<\/Id\>/ ){
$pacc2puid = $1;
last DOWNL;
}
my $content = get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
die "Problem with NCBI eutils, please try again later/n" unless defined $content;
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$pacc2puid = $1;
}
close $GUI;
unlink("$cache/${date}_${acc[$a]}.gui") if ($tmp == 0 or $pacc2puid =~ /^\d+$/);
}
my $whatNumber = 0;
if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ ){
CHECK_CHR_DOWN:
for(my $rep=0;$rep <= 4; $rep++){
system("wget -q -O $cache/${acc[$a]}:${from}-${to}.fas 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${pacc2puid}&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch'") if ( !-e "$cache/${acc[$a]}:${from}-${to}.fas" || -z "$cache/${acc[$a]}:${from}-${to}.fas" || $cache eq 'none' || ($cache eq 'update' && -M "$cache/${acc[$a]}:${from}-${to}.fas" > $cacheStorageTime ));
open(my $CIBLE, '<', "$cache/${acc[$a]}:${amont}-${aval}.fas");
for(my $rep=0; $rep <= 4; $rep++){
if ( !-e "$cache/$acc[$a]:$from-$to.fas" || -z "$cache/$acc[$a]:$from-$to.fas" ||
$cache eq 'none' || ($cache eq 'update' && -M "$cache/$acc[$a]:$from-$to.fas" > $cacheStorageTime) ){
getstore("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&from=$amont&to=$aval&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a]:$from-$to.fas");
}
my $counter = 0;
my $lines = 0;
open(my $CIBLE, '<', "$cache/$acc[$a]:$amont-$aval.fas");
while(<$CIBLE>){
$lines++ if ( $_ !~ /^>/ );
$counter = $counter+2 if ( $counter==1 && $_ !~ /^\w/ && $lines==1 && $_ !~ /^>/ );
......@@ -1065,17 +1057,21 @@ sub download_seq{
last CHECK_CHR_DOWN if ( $whatNumber==20 );
if ( $counter != 1 ){
$rep = $rep-1;
unlink("$cache/${acc[$a]}:${amont}-${aval}.fas");
unlink("$cache/$acc[$a]:$amont-$aval.fas");
}
}
}
else{
CHECK_OTHER_DOWN:
for(my $rep=0;$rep <= 4; $rep++){
system("wget -q -O $cache/${acc[$a]}.fas 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${pacc2puid}&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch'") if ( !-e "$cache/${acc[$a]}.fas" || -z "$cache/${acc[$a]}.fas" || $cache eq 'none' || ($cache eq 'update' && -M "$cache/${acc[$a]}.fas" > $cacheStorageTime ));
open(my $CIBLE, '<', "$cache/${acc[$a]}.fas");
for(my $rep=0; $rep <= 4; $rep++){
if ( !-e "$cache/$acc[$a].fas" || -z "$cache/$acc[$a].fas" || $cache eq 'none' ||
($cache eq 'update' && -M "$cache/$acc[$a].fas" > $cacheStorageTime) ){
getstore("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=$pacc2puid&rettype=fasta&retmode=text&tool=ProtoGene&email=smoretti\@unil.ch", "$cache/$acc[$a].fas");
}
my $counter = 0;
my $lines = 0;
open(my $CIBLE, '<', "$cache/$acc[$a].fas");
while(<$CIBLE>){
$lines++ if ( $_ !~ /^>/ );
$counter = $counter+2 if ( $counter==1 && $_ !~ /^\w/ && $lines==1 && $_ !~ /^>/ );
......@@ -1087,17 +1083,19 @@ sub download_seq{
last CHECK_OTHER_DOWN if ( $whatNumber==20 );
if ( $counter != 1 ){
$rep = $rep-1;
unlink("$cache/${acc[$a]}.fas");
unlink("$cache/$acc[$a].fas");
}
}
}
#FIXME: Don't remember exactly what all this function does
my $checkSeq = '';
$checkSeq = `grep -v '>' "$cache/${acc[$a]}:${from}-${to}.fas"` if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
$checkSeq = `grep -v '>' "$cache/${acc[$a]}.fas"` if ( $amont !~ /^\d+$/ || $aval !~ /^\d+$/ );
$checkSeq = `grep -v '>' "$cache/$acc[$a]:$from-$to.fas"` if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
$checkSeq = `grep -v '>' "$cache/$acc[$a].fas"` if ( $amont !~ /^\d+$/ || $aval !~ /^\d+$/ );
if ( $checkSeq !~ /^[A-Za-z]/ && $cp==0 ){
# if ( -s "$cache/${acc[$a]}:${from}-${to}.fas" <120 && $cp==0){ #Instead of multiple downloads, only seq name and header
unlink("$cache/${acc[$a]}:${from}-${to}.fas") if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
unlink("$cache/${acc[$a]}.fas") if ( $amont !~ /^\d+$/ || $aval !~ /^\d+$/ );
# if ( -s "$cache/$acc[$a]:$from-$to.fas" <120 && $cp==0){ #Instead of multiple downloads, only seq name and header
unlink("$cache/$acc[$a]:$from-$to.fas") if ( $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
unlink("$cache/$acc[$a].fas") if ( $amont !~ /^\d+$/ || $aval !~ /^\d+$/ );
# $amont = '';
$aval = $aval-5000 if ( $aval =~ /^\d+$/ );
$a = $a-1;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment