Commit 9b8392da authored by Sebastien Moretti
Browse files

General function for fetching XML messages from NCBI

parent 5788c595
......@@ -822,6 +822,17 @@ sub run_BLAST{
##################### NCBI requests #####################
# Fetch the content behind a URL, retrying when the request fails.
#
# Parameters:
#   $url - URL to request; passed unchanged to get().
# Returns:
#   The value returned by get() as soon as one attempt yields a defined value.
# On failure:
#   After 20 unsuccessful attempts, prints an error message on STDERR and
#   terminates the whole program with exit status 4 (it never returns undef).
sub fetch {
    my ($url) = @_;

    my $max_tries = 20;
    for my $try (1 .. $max_tries) {
        my $content = get($url);
        return $content if defined $content;
    }

    # The message must end with the "\n" escape so it is newline-terminated;
    # a literal "/n" would be printed as two visible characters.
    print {*STDERR} "Problem with NCBI eutils, please try again later\n";
    exit(4);
}
#Prot ACC -> PUID
sub blastPAcc2PGI{
......@@ -829,8 +840,7 @@ sub blastPAcc2PGI{
my $protGI = '';
#FIXME: should be ${blastHit}[pacc] but something is broken at NCBI
my $content = get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
die "Problem with NCBI eutils, please try again later/n" unless defined $content;
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=$blastHit&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$protGI = $1;
}
......@@ -844,39 +854,29 @@ sub protGI2NTGIs{
my $ntGIs = '';
my $geneID = '';
my $count = 0;
GET_NTUI:
for(my $rep=0;$rep <= 4; $rep++){
$count++;
#nt GIs
system("wget -q -O $cache/${date}_${blastHit}nucleo.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,nucleotide,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
open(my $GIN, '<', "$cache/${date}_${blastHit}nucleo.tmp");
my $flag = 0;
NT_GI:
while(<$GIN>){
if ( $_ =~ /<LinkName>protein_nuc[a-z]+<\/LinkName>/ ){ #for nuccleotide and nuccore
$flag = 1;
}
elsif ( $_ =~ /<LinkName>protein_gene<\/LinkName>/ ){
$flag = 2;
}
elsif ( $flag==1 && $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
my $match = $1;
$ntGIs .= "$match,$match,";
}
elsif ( $flag==2 && $_ =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
my $match = $1;
$geneID .= "$match,$match,";
}
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=nuccore,nucleotide,gene&id=$protGI&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my @xml = split("\n", $content);
my $flag = 0;
for my $line (@xml){
if ( $line =~ /<LinkName>protein_nuc[a-z]+<\/LinkName>/ ){ #for nuccleotide and nuccore
$flag = 1;
}
elsif ( $line =~ /<LinkName>protein_gene<\/LinkName>/ ){
$flag = 2;
}
elsif ( $flag==1 && $line =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
my $match = $1;
$ntGIs .= "$match,$match,";
}
elsif ( $flag==2 && $line =~ /^.*<Id>(\d+)<\/Id>.*$/ ){
my $match = $1;
$geneID .= "$match,$match,";
}
close $GIN;
$rep = $rep-15 if ( -z "$cache/${date}_${blastHit}nucleo.tmp" && $count==1 );
last GET_NTUI if ( $ntGIs ne '' || $geneID ne '' );
}
unlink("$cache/${date}_${blastHit}nucleo.tmp") if ( $tmp==0 || ($ntGIs ne '' || $geneID ne '') );
#Remove redundancy if any
chop $ntGIs;
chop $ntGIs; #Remove last ','
my %hash_NT = split(',', $ntGIs);
$ntGIs = join(',', keys(%hash_NT) );
chop $geneID;
......@@ -959,6 +959,7 @@ sub geneID2Chr{
sub downloadSeqFromGIs{
my ($cache, $date, $amont, $aval, @acc) = @_;
#FIXME: general fct for fetching fasta seq
GET_SEQ:
for(my $a=0; $a<=$#acc; $a++){
my $whatNumber = 0;
......@@ -1027,8 +1028,7 @@ sub download_seq{
if ( $pacc2puid !~ /^[NAX][CGTSWZMR]_/ ){ #Not RefSeq acc
#pacc = primary acc NOT prot acc ! #265666 -> S55551
my $content = get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
die "Problem with NCBI eutils, please try again later/n" unless defined $content;
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=${pacc2puid}[pacc]&tool=ProtoGene&email=smoretti\@unil.ch");
if ( $content =~ /<Id>(\d+)<\/Id>/ ){
$pacc2puid = $1;
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment