Commit e21fc09f authored by Sebastien Moretti's avatar Sebastien Moretti
Browse files

re-implement geneID. NEED tests

parent 52b1fdad
......@@ -110,16 +110,17 @@ if ( !$test_option_values || ($msa eq '' && $cache ne 'empty' && $cache ne 'old'
\t --db=path_for_a_local_db_blast_formated\n
\t --template to provide your own nucleotidic sequences
\t following the cds file format
\t --revtrans to reverse-translate sequences with no
\t --revtrans reverse-translates sequences with no
\t blast hit, in IUB (IUPAC) depiction code
\t They are removed from the alignement by default
\t --pep to add the original peptide query beneath the
\t --pep adds the original peptide query beneath the
\t back-translated sequence
\t --cache=none, update, use, 'own_PATH_directory', old, empty
\t to select the cache mode
\t default is 'update'\n
\t --version to print version information
\t --help to print a full help message\n\n";
\t --debug prints extra information when running
\t --version prints version information
\t --help prints a full help message\n\n";
exit(1);
}
......@@ -364,19 +365,22 @@ for(my $r=0; $r<=$#original_names; $r++){
#Get transcript and contig seq
@nt_GIs = split(/,/, $ntGIs);
downloadSeqFromGIs($cache, '', '', @nt_GIs);
#GeneID -> Chr
my @geneIDs = split(/,/, $geneID);
while(<@geneIDs>){
my $geneID = $_;
my ($chr, $amont, $aval) = geneID2Chr($geneID, $equivalent_blast_hits[$qq]);
print "\n\tNo gene locus found for $equivalent_blast_hits[$qq] with $geneID in $fasta_header\n\n" if ($chr eq '' and $geneID ne '');
if ( $chr ne '' ){
#Get Chr seq
download_seq($cache, $amont, $aval, $chr);
$intronStep = 1;
@nt_GIs = ("$chr:$amont-$aval", @nt_GIs);
if ( $geneID ne '' ){
my @geneIDs = split(/,/, $geneID);
while(<@geneIDs>){
my $geneID = $_;
my ($chr, $amont, $aval) = geneID2Chr($geneID, $equivalent_blast_hits[$qq]);
print "\n\tNo gene locus found for $equivalent_blast_hits[$qq] with $geneID in $fasta_header\n\n" if ($chr eq '' and $geneID ne '');
if ( $chr ne '' ){
#Get Chr seq
download_seq($cache, $amont, $aval, $chr);
$intronStep = 1;
@nt_GIs = ("$chr:$amont-$aval", @nt_GIs);
}
}
}
}
......@@ -825,9 +829,14 @@ sub run_BLAST{
sub fetch {
my ($url) = @_;
XML:
for (my $tries=0; $tries <20; $tries++ ){
my $content = get($url);
return $content if defined $content;
print "[$content]\n\n" if ($debug);
if ( defined $content ){
next XML if ( $content =~ /<ERROR>/i );
return $content;
}
}
print {*STDERR} "Problem with NCBI eutils, please try again later\n";
exit(40);
......@@ -922,67 +931,46 @@ sub geneID2Chr{
my $chr = '';
my ($amont, $aval) = ('', '');
my $count = 0;
GET_CHR:
for(my $rep=0;$rep <= 8; $rep++){
$count++;
#Gene Acc
# system("wget -q -O $cache/${date}_${blastHit}gene.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
system("wget -q -O $cache/${date}_${blastHit}gene.tmp 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch'");
open(my $GACC, '<', "$cache/${date}_${blastHit}gene.tmp");
my $flag = 0;
GENE_CHR:
while(<$GACC>){
if ( $_ =~ /\<ERROR\>Empty id list \- nothing todo\<\/ERROR\>/ && $flag==0 ){
$rep = $rep-15 if ( $count==1 );
}
# if ( $_ =~ /\<Entrezgene_locus\>/ && $flag==0 ){
if ( $_ =~ /<Item Name="GenomicInfoType" Type="Structure">/ && $flag==0 ){
$flag = 1;
}
# elsif ( $_ =~ /\<Gene-commentary_type value=\"genomic\"\>/ && $flag==1 ){
# $flag = 2;
# }
# elsif ( $_ =~ /\<Gene-commentary_type value=/ && $flag==2 ){
# last GENE_CHR;
# }
# elsif ( $_ =~ /\<Gene-commentary_accession\>([\w\_\-\.]+)\<\/Gene-commentary_accession\>/ && $flag==2 ){
elsif ( $_ =~ /<Item Name="ChrAccVer" Type="String">([^<]+?)\.?\d*<\/Item>/ && $flag==1 ){
$chr = $1;
$flag = 3;
}
# elsif ( $_ =~ /\<Seq-interval_from\>(\d+)\<\/Seq-interval_from\>/ && $flag==3 ){
elsif ( $_ =~ /<Item Name="ChrStart" Type="Integer">(\d+)<\/Item>/ && $flag==3 ){
$amont = $1;
$flag = 4;
}
# elsif ( $_ =~ /\<Seq-interval_to\>(\d+)\<\/Seq-interval_to\>/ && $flag==4 ){
elsif ( $_ =~ /<Item Name="ChrStop" Type="Integer">(\d+)<\/Item>/ && $flag==4 ){
$aval = $1;
last GENE_CHR;
}
# my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my $content = fetch("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=$geneID&retmode=xml&tool=ProtoGene&email=smoretti\@unil.ch");
my @xml = split("\n", $content);
my $flag = 0;
GENE_INFO:
for my $line (@xml){
if ( $flag==0 && $line =~ /<Item Name="GenomicInfoType" Type="Structure">/ ){
$flag = 1;
}
elsif ( $flag==1 && $line =~ /<Item Name="ChrAccVer" Type="String">([^<]+?)\.?\d*<\/Item>/ ){
$chr = $1;
$flag = 3;
}
elsif ( $flag==3 && $line =~ /<Item Name="ChrStart" Type="Integer">(\d+)<\/Item>/ ){
$amont = $1;
$flag = 4;
}
elsif ( $flag==4 && $line =~ /<Item Name="ChrStop" Type="Integer">(\d+)<\/Item>/ ){
$aval = $1;
last GENE_INFO;
}
close $GACC;
$rep = $rep-15 if ( -z "$cache/${date}_${blastHit}gene.tmp" && $count==1 );
last GET_CHR if ( $chr ne '' && $amont =~ /^\d+$/ && $aval =~ /^\d+$/ );
}
unlink("$cache/${date}_${blastHit}gene.tmp") if ( $tmp == 0 || $chr ne '' );
if ( $amont eq '' || $aval eq '' ){
$amont = '';
$aval = '';
$aval = '';
}
elsif ( $amont ne '' && $aval ne '' ){
if ( $amont > $aval ){
$amont = $amont+5000;
$aval = $aval-5000;
$amont = $amont + 5000;
$aval = $aval - 5000;
}
else{
$amont = $amont-5000;
$aval = $aval+5000;
$amont = $amont - 5000;
$aval = $aval + 5000;
}
}
$amont = 1 if ( $amont ne '' && $amont <= 0 );
$aval = 1 if ( $aval ne '' && $aval <= 0 );
$aval = 1 if ( $aval ne '' && $aval <= 0 );
return($chr, $amont, $aval);
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment