Revert "Added mm10 as requested by Mikhail"

This reverts commit 22c807c0.

Revert "Added mm10 as requested by Mikhail"
5696bd00 · Severine Duvaud · 22c807c0 · 5696bd00 · 5696bd00 · 5696bd00
Commit 5696bd00 authored Apr 28, 2017 by Severine Duvaud
--- a/SlidingMenu.qml
+++ b/SlidingMenu.qml
@@ -108,16 +108,6 @@ Rectangle {
                                Utils.reenable_file_selection()
                            }
                        }
-
-                        RadioButton {
-                            id: mm10
-                            text: "mm10"
-                            exclusiveGroup: genome
-                            onClicked:{
-                                genomeName = "mm10"
-                                Utils.reenable_file_selection()
-                            }
-                        }
                    }
                }
            }
@@ -147,7 +137,6 @@ Rectangle {
            /* "normal" content */
            ListView {
                width: 130
-
                anchors { top: parent.top; bottom: parent.bottom; left: parent.left; margins: 10 }
                clip: true;


--- a/component-utils.js
+++ b/component-utils.js
@@ -209,7 +209,6 @@ function reset_genome_versions() {
    hg19.checked = false
    hg18.checked = false
    mm9.checked = false
-    mm10.checked = false
 }

 function update_data_type(type) {

--- a/scripts/merge_reads_chipseq.py
+++ b/scripts/merge_reads_chipseq.py
 """ merge reads """

 import argparse
-import json
-import logging
 import math
 import os
 import re
@@ -19,7 +17,7 @@ def main():

    parser.add_argument('--bed-dir',
                        dest='bed_dir',
-                        default='sorted_bed',
+                        default='./',
                        help='Directory with bed files')
    parser.add_argument('-d',
                        help='directory with read files',
@@ -43,14 +41,10 @@ def main():
                        help="organism id (hg19 or mm9)",
                        dest="organism",
                        required=True)
-    parser.add_argument('--kallisto',
-                        help="Flag indicating that data is from kallisto processing",
-                        dest="kallisto",
-                        action="store_true",
-                        default=False)
+
    args = parser.parse_args()

-    genome_length = {'hg19': 3101788170, 'hg18': 3091592211, 'mm9': 2654911517, 'mm10': 2730871774}
+    genome_length = {'hg19': 3101788170, 'hg18': 3091592211, 'mm9': 2654911517}

    # define left, right window coverage if only one number is given
    if len(args.window) == 1:
@@ -69,20 +63,18 @@ def main():
        bed_file = read_file.rstrip('.expr')
        reads[bed_file] = {}
        bed_files.append(bed_file)
+
        for line in fin:
                entry = line.rstrip()
                if entry == '':
                        continue
                (prom_id, val) = entry.split("\t")
-                val = float(val)
-                if val == 0.:
-                    continue
-                reads[bed_file][prom_id] = val
+                reads[bed_file][prom_id] = float(val)
                proms.add(prom_id)

-    total_counts = get_total_counts(reads, args.bed_dir, args.kallisto)
-    logging.warning("merge_reads_chipseq: total counts %s\n" % str(total_counts))
-    fout = open('counts.tab', 'w')
+    total_counts = get_total_counts(reads, args.bed_dir)
+
+    fout = open(os.path.join(args.read_dir, 'counts.tab'), 'w')
    fout.write("\t".join(bed_files) + "\n")
    fout.write("\t".join([str(total_counts[bed_file])
                          for bed_file in bed_files]) + "\n")
@@ -108,6 +100,7 @@ def main():
                    / genome_length[args.organism])
            if cluster_id not in table:
                table[cluster_id] = []
+	    if expr > 0:
 	        table[cluster_id].append(math.log(expr, 2))

    fout = open(args.expr_file, 'w')
@@ -120,7 +113,7 @@ def main():
    fout.close()

    # write sample indices
-    fout = open('sample_indices', 'w')
+    fout = open(os.path.join(args.read_dir, 'sample_indices'), 'w')
    fout.write("\n".join([re.sub("\.(bam|bed)\.?g?z?$", '', x, flags=2)
                          for x in bed_files]))
    fout.write("\n")
@@ -140,7 +133,7 @@ def get_quantiles(prob, reads):
    return quantiles


-def get_total_counts(reads, bed_dir, is_kallisto):
+def get_total_counts(reads, bed_dir):
    """ Returns dict of total counts for samples"""
    total_count = {}

@@ -153,16 +146,10 @@ def get_total_counts(reads, bed_dir, is_kallisto):
            out, err = p.communicate()
            count = int(out.rstrip())
        elif f.lower().endswith(".bed.gz"):
-            p = subprocess.Popen(["zcat", f], stdout=subprocess.PIPE)
+            p = subprocess.Popen("zcat", stdin=open(f, 'r'), stdout=subprocess.PIPE)
            out = subprocess.check_output(["wc", "-l"], stdin=p.stdout)
            count = int(out.rstrip())
-        elif is_kallisto:
-            if not os.path.exists("%s/run_info.json" % f):
-                raise(BaseException("Could not open file %s/run_info.json" % f))
-            with open("%s/run_info.json" % f) as fin:
-                count = float(json.loads(fin.read())["n_processed"])
        total_count[sample] = count
-
    return total_count



--- a/scripts/parallel_rnaseq.sh
+++ b/scripts/parallel_rnaseq.sh
@@ -14,12 +14,6 @@ format-and-index $FILEPATH $FILE
 # hardcoded number of this step
 json_out "info" 2 "STEP 3: Reading regions" true

-if [ "$genome" = "hg19" -o "$genome" = "mm10" ]; then
-    t_option="$INPUT/$genome/transcript_annotation.gtf.gz"
-else 
-    t_option="$INPUT/$genome/refSeqAli.gz $INPUT/$genome/all_mrna.gz"
-fi
-
-readRegion_Cmd1="$IPATH/readRegion.py -t $t_option -p $INPUT/$genome/clusters_150_gene_associations.gz -i $TMP/processing/$FILE --expr-dir $TMP/expression"
+readRegion_Cmd1="$IPATH/readRegion.py -t $INPUT/$genome/refSeqAli.gz $INPUT/$genome/all_mrna.gz -p $INPUT/$genome/clusters_150_gene_associations.gz -i $TMP/processing/$FILE --expr-dir $TMP/expression"
 python $readRegion_Cmd1 || json_out "error" 1 "Script $0 Line $LINENO: Problem with: python $readRegion_Cmd1 Error: $( { python $readRegion_Cmd1; } )" true $?

--- a/scripts/readRegion.py
+++ b/scripts/readRegion.py
@@ -54,8 +54,7 @@ def main():
    use_numbers = False
    if args.input_file.lower().endswith('bam') or args.input_file.lower().endswith('sam') or args.input_file.lower().endswith('.sam.gz'):
        use_numbers = have_numbers(readfile)
-    elif args.input_file.lower().endswith('.bed') or args.input_file.lower().endswith('.bed.gz'):
-        use_numbers = have_numbers_bed(readfile)
+
    counter = 0
    log.warning("starting read processing")
    for chrom in proms.trCluster:
@@ -145,7 +144,7 @@ def processReads(reads):
        return blocks

    # process reads depending on type BAM or BED
-    if isinstance(reads[0], pysam.AlignedRead):
+    if isinstance(reads[0], pysam.AlignedRead) or isinstance(reads[0], pysam.calignmentfile.AlignedSegment):
        for read in reads:
            if read.is_paired:
                if read.is_proper_pair:
@@ -241,7 +240,6 @@ class Promoters:
    exonTree = {}
    trCluster = {}
    tr_count = 0
-    transcripts = {}

    def __init__(self, prom_file=None, tr_files=None):
        """ Init promoters """
@@ -250,10 +248,6 @@ class Promoters:

        if tr_files:
            for tr_file in tr_files:
-                if tr_file.lower().endswith(".gff") or tr_file.lower().endswith(".gtf") \
-                   or tr_file.lower().endswith(".gff.gz") or tr_file.lower().endswith(".gtf.gz"):
-                    self.read_gtf_transcripts(tr_file)
-                else:
                self.read_all_mrna(tr_file)

    def read_promoters(self, prom_file):
@@ -317,63 +311,6 @@ class Promoters:

        return transcript

-
-    def read_gtf_transcripts(self, transcript_file):
-        """ Read and process gtf/gff file (for gtf files from gencode project) """
-        fin = multi_open(transcript_file)
-
-        for line in fin:
-            if line.strip().startswith('#'):
-                continue
-            data = line.rstrip().split('\t')
-            if data[2] == 'transcript':
-                chrom = data[0]
-                strand = data[6]
-                (tr_start, tr_end) = sorted([int(data[3]), int(data[4])])
-                #extract transcript ID
-                transcript_id = None
-                for pair in data[8].split(";"):
-                    if pair.strip().startswith("transcript_id"):
-                        transcript_id = pair.split()[1].strip('"')
-                if transcript_id is None:
-                    logging.warning("No transcript id found in line %s" % line)
-                    continue
-                if transcript_id not in self.transcripts:
-                    self.transcripts[transcript_id] = {(chrom, strand, tr_start, tr_end):[]}
-                else:
-                    self.transcripts[transcript_id][(chrom, strand, tr_start, tr_end)] = []
-                    logging.warning("Transcript %s is seen multiple times!" % transcript_id)
-            if data[2] == "exon":
-                chrom = data[0]
-                strand = data[6]
-                (ex_start, ex_end) = sorted([int(data[3]), int(data[4])])
-                #extract transcript ID
-                transcript_id = None
-                for pair in data[8].split(";"):
-                    if pair.strip().startswith("transcript_id"):
-                        transcript_id = pair.split()[1].strip('"')
-                if transcript_id is None:
-                    logging.warning("No transcript id found in line %s" % line)
-                    continue
-                if transcript_id not in self.transcripts:
-                    logging.warning("transcript id %s no in annotation! Skipping exon %s"
-                                    % (transcript_id, line))
-                flag = True
-                for coord in self.transcripts[transcript_id]:
-                    if chrom == coord[0] and strand == coord[1] and ex_start >= coord[2] and ex_end <=coord[3]:
-                        self.transcripts[transcript_id][coord].append((ex_start, ex_end))
-                        flag = False
-                if flag:
-                    logging.warning("Exon %s is not assigned to any transcripts" % line)
-        for transcript_id in self.transcripts:
-            # check if we need this transcript
-            if transcript_id not in self.tr2proms:
-                continue
-            for coord in self.transcripts[transcript_id]:
-                transcript = (transcript_id, coord[0], coord[1], coord[2], coord[3], self.transcripts[transcript_id][coord])
-                self.add_transcript(transcript)
-
-    
    def add_transcript(self, transcript):
        """ Create refseq transcript from line of refGene file
        and add it to clusters """
@@ -515,12 +452,6 @@ def have_numbers(fin):
    return len(chroms) == 0


-def have_numbers_bed(fin):
-    """ Checks if reference sequence are in form of numbers or strings in BED """
-    chroms = [x for x in fin.contigs if x.startswith('chr')]
-    return len(chroms) == 0
-
-
 ###################

 if __name__ == '__main__':