Merge pull request #85 from dieterich-lab/devel

tjakobi · web-flow · commit afd5cdd6c47e · 2020-08-12T14:48:27.000+02:00
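Go for it: port the DCC package to Python 3 (print statements -> print() calls, implicit -> explicit relative imports, dict.iteritems()/itervalues() -> items()/values(), dict key views wrapped in list() before iteration).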
diff --git a/DCC/Circ_nonCirc_Exon_Match.py b/DCC/Circ_nonCirc_Exon_Match.py
@@ -5,7 +5,7 @@
 
 import HTSeq
 
-from IntervalTree import IntervalTree
+from .IntervalTree import IntervalTree
 
 
 class CircNonCircExon(object):
@@ -194,7 +194,7 @@ def printuniq(self, Infile):
         for lin in f:
             lin_split = lin.split('\t')
             if keys.count(lin_split[0] + '\t' + lin_split[1] + '\t' + lin_split[2]) == 1:
-                print lin.strip('\n')
+                print(lin.strip('\n'))
 
     def readgtf(self, gtf_file):
         # store nonCircExons based on transcript_id and exon_number with all its annotations from different transcripts
@@ -275,7 +275,7 @@ def readHTSeqCount(self, HTSeqCount, exon_id2custom_exon_id):
     def findcircAdjacent(self, circExons, Custom_exon_id2Iv, Iv2Custom_exon_id, start=True):
         circAdjacentExons = {}
         circAdjacentExonsIv = {}
-        for key in circExons.keys():
+        for key in list(circExons.keys()):
             for ids in circExons[key]:
                 try:
                     interval = Custom_exon_id2Iv[self.getAdjacent(ids, start=start)]
@@ -292,7 +292,7 @@ def printCounts(self, Exons, Count_custom_exon_id, Custom_exon_id2Length):
         # Print the counts of circexons and adjacentexons
         # Exons: dictionaries with intervals as key, custom_exon_id as values
         ExonCounts = {}
-        for key in Exons.keys():
+        for key in list(Exons.keys()):
             counts = []
             for ids in Exons[key]:  # If for circAdjacentExons, ids here is a list
                 try:
@@ -397,7 +397,7 @@ def readSJ_out_tab(self, SJ_out_tab):
                                   strand] = lin_split[6]
             sj.close()
         except IOError:
-            print 'Do you have SJ.out.tab files in your sample folder? DCC cannot find it.'
+            print('Do you have SJ.out.tab files in your sample folder? DCC cannot find it.')
         return junctionReadCount
 
     def getskipjunctionCount(self, exonskipjunctions, junctionReadCount):
diff --git a/DCC/CombineCounts.py b/DCC/CombineCounts.py
@@ -35,9 +35,9 @@ def comb_coor(self, circfiles, strand=True):
             onefile.close()
 
         if strand:
-            coors = ['\t'.join(key.split('\t')[:-1]) + value for key, value in coorsDict.iteritems()]
+            coors = ['\t'.join(key.split('\t')[:-1]) + value for key, value in coorsDict.items()]
         else:
-            coors = ['{}{}'.format(key, value) for key, value in coorsDict.iteritems()]
+            coors = ['{}{}'.format(key, value) for key, value in coorsDict.items()]
 
         coorsSorted = self.sortBed(coors, retList=True)
         for itm in coorsSorted:
diff --git a/DCC/IntervalTree.py b/DCC/IntervalTree.py
@@ -36,7 +36,7 @@ def intersect(self, interval, report_func):
             # use the intersect method of IntervalNode class, need make this function aware of strand
 
     def traverse(self, func):
-        for item in self.chroms.itervalues():
+        for item in self.chroms.values():
             item.traverse(func)
 
 
diff --git a/DCC/__init__.py b/DCC/__init__.py
@@ -1,9 +1,9 @@
 # Import modules
-from findcircRNA import Findcirc
-from circFilter import Circfilter
-from circAnnotate import CircAnnotate
-from genecount import Genecount
-from CombineCounts import Combine
-from Circ_nonCirc_Exon_Match import CircNonCircExon
-from IntervalTree import IntervalTree
-from main import main
+from .findcircRNA import Findcirc
+from .circFilter import Circfilter
+from .circAnnotate import CircAnnotate
+from .genecount import Genecount
+from .CombineCounts import Combine
+from .Circ_nonCirc_Exon_Match import CircNonCircExon
+from .IntervalTree import IntervalTree
+from .main import main
diff --git a/DCC/circAnnotate.py b/DCC/circAnnotate.py
@@ -9,7 +9,7 @@
 
 import HTSeq
 
-from IntervalTree import IntervalTree
+from .IntervalTree import IntervalTree
 
 
 class CircAnnotate(object):
diff --git a/DCC/circFilter.py b/DCC/circFilter.py
@@ -4,7 +4,7 @@
 
 import HTSeq
 
-from IntervalTree import IntervalTree
+from .IntervalTree import IntervalTree
 
 
 ##########################
@@ -63,7 +63,7 @@ def readcirc(self, countfile, coordinates):
 
     # Do filtering
     def filtercount(self, count, indx):
-        print 'Filtering by read counts'
+        print('Filtering by read counts')
         sel = []  # store the passed filtering rows
         for itm in range(len(count)):
             if indx[itm][4] == '0':
@@ -117,7 +117,7 @@ def dummy_filter(self, indx0, count0):
         np.savetxt(self.tmp_dir + 'tmp_unsortedWithChrM', nonrep, delimiter='\t', newline='\n', fmt='%s')
 
     def removeChrM(self, withChrM):
-        print 'Remove ChrM'
+        print('Remove ChrM')
         unremoved = open(withChrM, 'r').readlines()
         removed = []
         for lines in unremoved:
diff --git a/DCC/findcircRNA.py b/DCC/findcircRNA.py
@@ -78,7 +78,7 @@ def sepDuplicates(self, Chim_junc, duplicates, nonduplicates):
             if reads.count(read) == 2:
                 dup.write(lines[indx])
             elif reads.count(read) > 2:
-                print 'Read %s has more than 2 count.' % read
+                print('Read %s has more than 2 count.' % read)
                 try:
                     logging.warning('Read %s has more than 2 count.' % read)
                 except NameError:
@@ -159,8 +159,8 @@ def findcirc(self, Chim_junc, output, strand=True):
             linecnt = linecnt + 1
 
             if len(L) < 14:
-                print ("WARNING: File " + str(Chim_junc) + ", line " + str(linecnt) + " does not contain all features.")
-                print ("WARNING: " + str(Chim_junc) + " is probably corrupt.")
+                print(("WARNING: File " + str(Chim_junc) + ", line " + str(linecnt) + " does not contain all features."))
+                print(("WARNING: " + str(Chim_junc) + " is probably corrupt."))
             if L[0] == "chr_donorA":
                continue
             if int(L[6]) >= 0 and L[0] == L[3] and L[2] == L[5] and (
@@ -217,7 +217,7 @@ def count(self, sortedlist, strand=True):
             elif not strand:
                 circs = (itm[0], itm[1], itm[2])
             else:
-                print "Please specify correct strand information."
+                print("Please specify correct strand information.")
             cnt[circs] += 1
             itm.append(str(cnt[circs]))
             # tmp_count.append( [itm[0],itm[1],itm[2],itm[3],itm[7],itm[4],itm[5],itm[6]] )
diff --git a/DCC/fix2chimera.py b/DCC/fix2chimera.py
@@ -55,10 +55,10 @@ def modify_junctiontype(junctiontype):
                 continue
             # check if the row has all fields
             if len(line_split) < 14:
-                print ("WARNING: File " + str(chimeric_junction_mate2) + ", line " + str(linecnt)
-                       + " does not contain all features.")
-                print ("WARNING: " + str(chimeric_junction_mate2) + " is probably corrupt.")
-                print ("WARNING: Offending line: " + str(line))
+                print(("WARNING: File " + str(chimeric_junction_mate2) + ", line " + str(linecnt)
+                       + " does not contain all features."))
+                print(("WARNING: " + str(chimeric_junction_mate2) + " is probably corrupt."))
+                print(("WARNING: Offending line: " + str(line)))
 
             linecnt += 1
 
@@ -110,7 +110,7 @@ def printduplicates(self, merged, duplicates, field=10):
         if not os.path.isfile(merged):
             sys.exit("ERROR: File " + str(merged) + " is missing!")
         elif os.stat(merged).st_size == 0:
-            print ("WARNING: File " + str(merged) + " is empty!")
+            print(("WARNING: File " + str(merged) + " is empty!"))
         else:
             try:
                 inputfile = open(merged, 'r')
diff --git a/DCC/genecount.py b/DCC/genecount.py
@@ -99,33 +99,33 @@ def genecount(self, circ_coordinates, bamfile, ref, tid):
         start_coordinates.close()
         end_coordinates.close()
 
-        print ('Started linear gene expression counting for %s' % bamfile)
+        print(('Started linear gene expression counting for %s' % bamfile))
 
         start = time.time()
         # mpileup get the read counts of the start and end positions
-        print ("\t=> running mpileup for start positions [%s]" % bamfile)
+        print(("\t=> running mpileup for start positions [%s]" % bamfile))
         mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_start_coordinates_' + tid)
         end = time.time() - start
-        print ("\t=> mpileup for start positions for %s took %d seconds" % (bamfile, end))
+        print(("\t=> mpileup for start positions for %s took %d seconds" % (bamfile, end)))
 
         start = time.time()
         # mpileup get the read counts of the start and end positions
-        print ("\t=> running mpileup for end positions [%s]" % bamfile)
+        print(("\t=> running mpileup for end positions [%s]" % bamfile))
         mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_end_coordinates_' + tid)
         end = time.time() - start
-        print ("\t=> mpileup for end positions for %s took %d seconds" % (bamfile, end))
+        print(("\t=> mpileup for end positions for %s took %d seconds" % (bamfile, end)))
 
-        print "\t=> gathering read counts for start positions [%s]" % bamfile
+        print("\t=> gathering read counts for start positions [%s]" % bamfile)
         startcount = self.getreadscount(mpileup_start, countmapped=True)
 
-        print "\t=> gathering read counts for end positions [%s]" % bamfile
+        print("\t=> gathering read counts for end positions [%s]" % bamfile)
         endcount = self.getreadscount(mpileup_end, countmapped=True)
 
         # remove tmp files
         # os.remove(self.tmp_dir + 'tmp_start_coordinates_' + tid)
         # os.remove(self.tmp_dir + 'tmp_end_coordinates_' + tid)
 
-        print 'Finished linear gene expression counting for %s' % bamfile
+        print('Finished linear gene expression counting for %s' % bamfile)
 
         return startcount, endcount
 
@@ -194,29 +194,29 @@ def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
         start_coor_1.close()
         end_coor.close()
         end_coor_1.close()
-        print ('Started linear spliced read counting for %s' % bamfile)
+        print(('Started linear spliced read counting for %s' % bamfile))
 
         # mpileup get the number of spliced reads at circle start position and (start-1) position.
 
-        print ("\t=> running mpileup 1 for start positions [%s]" % bamfile)
+        print(("\t=> running mpileup 1 for start positions [%s]" % bamfile))
         mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_start_coor_1')
 
-        print ("\t=> running mpileup 2 for start positions [%s]" % bamfile)
+        print(("\t=> running mpileup 2 for start positions [%s]" % bamfile))
         mpileup_start_1 = pysam.mpileup(bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_start_coor_2')
 
         # mpileup get the number of spliced reads at circle end position and (end+1) position.
-        print ("\t=> running mpileup 1 for end positions [%s]" % bamfile)
+        print(("\t=> running mpileup 1 for end positions [%s]" % bamfile))
         mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_end_coor_1')
 
-        print ("\t=> running mpileup 2 for end positions [%s]" % bamfile)
+        print(("\t=> running mpileup 2 for end positions [%s]" % bamfile))
         mpileup_end_1 = pysam.mpileup(bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_end_coor_2')
 
         # get count
 
-        print "\t=> gathering read counts for start positions [%s]" % bamfile
+        print("\t=> gathering read counts for start positions [%s]" % bamfile)
         startcount = self.submpileup(self.getreadscount(mpileup_start_1), self.getreadscount(mpileup_start))
 
-        print "\t=> gathering read counts for end positions [%s]" % bamfile
+        print("\t=> gathering read counts for end positions [%s]" % bamfile)
         endcount = self.submpileup(self.getreadscount(mpileup_end), self.getreadscount(mpileup_end_1), left=False)
 
         # remove tmp files
@@ -225,7 +225,7 @@ def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
         # os.remove(self.tmp_dir + 'tmp_end_coor')
         # os.remove(self.tmp_dir + 'tmp_end_coor_1')
 
-        print 'Finished linear spliced read counting for %s' % bamfile
+        print('Finished linear spliced read counting for %s' % bamfile)
 
         return startcount, endcount
 
@@ -266,7 +266,7 @@ def comb_gen_count(self, circ_coor, bamfile, ref, output, countlinearsplicedread
             # call genecount to get the start and end positon read counts
             tmp_start, tmp_end = self.genecount(circ_coor, bamfile, ref, tid)
 
-        print 'Ended linear gene expression counting %s' % bamfile
+        print('Ended linear gene expression counting %s' % bamfile)
         logging.info('Ended linear gene expression counting %s' % bamfile)
 
         for line in tmp_start:
@@ -314,6 +314,6 @@ def comb_gen_count(self, circ_coor, bamfile, ref, output, countlinearsplicedread
         # tmp_end.close()
         count_table.close()
 
-        print 'Ended post processing %s' % bamfile
+        print('Ended post processing %s' % bamfile)
         logging.info('Ended post processing %s' % bamfile)
         return tid
diff --git a/DCC/main.py b/DCC/main.py
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py