Skip to content

Commit 485be30

Browse files
authored
Merge pull request #1360 from deeptools/mbs
Mbs
2 parents 0a05ede + b38f760 commit 485be30

File tree

8 files changed

+539
-12
lines changed

8 files changed

+539
-12
lines changed

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,6 @@ bigtools = "0.5.3"
1717
tokio = "*"
1818
flate2 = "*"
1919
tempfile = "*"
20-
ndarray = "0.16.1"
20+
ndarray = "0.16.1"
21+
npyz = "*"
22+
zip = "*"
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
import os
5+
import sys
6+
import argparse
7+
import numpy as np
8+
9+
import deeptools.countReadsPerBin as countR
10+
from deeptools import parserCommon
11+
from deeptools.utilities import smartLabels
12+
from importlib.metadata import version
13+
old_settings = np.seterr(all='ignore')
14+
15+
from deeptools.hp import r_mbams
16+
17+
def parse_arguments(args=None):
18+
parser = \
19+
argparse.ArgumentParser(
20+
formatter_class=argparse.RawDescriptionHelpFormatter,
21+
description="""
22+
23+
``multiBamSummary`` computes the read coverages for genomic regions for typically two or more BAM files.
24+
The analysis can be performed for the entire genome by running the program in 'bins' mode.
25+
If you want to count the read coverage for specific regions only, use the ``BED-file`` mode instead.
26+
The standard output of ``multiBamSummary`` is a compressed numpy array (``.npz``).
27+
It can be directly used to calculate and visualize pairwise correlation values between the read coverages using the tool 'plotCorrelation'.
28+
Similarly, ``plotPCA`` can be used for principal component analysis of the read coverages using the .npz file.
29+
Note that using a single bigWig file is only recommended if you want to produce a bedGraph file (i.e., with the ``--outRawCounts`` option; the default output file cannot be used by ANY deepTools program if only a single file was supplied!).
30+
31+
A detailed sub-commands help is available by typing:
32+
33+
multiBamSummary bins -h
34+
35+
multiBamSummary BED-file -h
36+
37+
38+
""",
39+
epilog='example usages:\n'
40+
'multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz \n\n'
41+
'multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam \n'
42+
'-o results.npz'
43+
' \n\n',
44+
conflict_handler='resolve')
45+
46+
parser.add_argument('--version', action='version',
47+
version='%(prog)s {}'.format(version('deeptools')))
48+
subparsers = parser.add_subparsers(
49+
title="commands",
50+
dest='command',
51+
description='subcommands',
52+
help='subcommands',
53+
metavar='')
54+
55+
parent_parser = parserCommon.getParentArgParse(binSize=False)
56+
read_options_parser = parserCommon.read_options()
57+
58+
# bins mode options
59+
subparsers.add_parser(
60+
'bins',
61+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
62+
parents=[bamcorrelate_args(case='bins'),
63+
parent_parser, read_options_parser,
64+
parserCommon.gtf_options(suppress=True)
65+
],
66+
help="The coverage calculation is done for consecutive bins of equal "
67+
"size (10 kilobases by default). This mode is useful to assess the "
68+
"genome-wide similarity of BAM files. The bin size and "
69+
"distance between bins can be adjusted.",
70+
add_help=False,
71+
usage='%(prog)s '
72+
'--bamfiles file1.bam file2.bam '
73+
'-o results.npz \n'
74+
'help: multiBamSummary bins -h / multiBamSummary bins --help\n')
75+
76+
# BED file arguments
77+
subparsers.add_parser(
78+
'BED-file',
79+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
80+
parents=[bamcorrelate_args(case='BED-file'),
81+
parent_parser, read_options_parser,
82+
parserCommon.gtf_options()
83+
],
84+
help="The user provides a BED file that contains all regions "
85+
"that should be considered for the coverage analysis. A "
86+
"common use is to compare ChIP-seq coverages between two "
87+
"different samples for a set of peak regions.",
88+
usage='%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz\n'
89+
'help: multiBamSummary BED-file -h / multiBamSummary bins --help\n',
90+
add_help=False)
91+
92+
return parser
93+
94+
95+
def bamcorrelate_args(case='bins'):
96+
parser = argparse.ArgumentParser(add_help=False)
97+
required = parser.add_argument_group('Required arguments')
98+
99+
# define the arguments
100+
required.add_argument('--bamfiles', '-b',
101+
metavar='FILE1 FILE2',
102+
help='List of indexed bam files separated by spaces.',
103+
nargs='+',
104+
required=True)
105+
106+
required.add_argument('--outFileName', '-out', '-o',
107+
help='File name to save the coverage matrix. This matrix '
108+
'can be subsequently plotted using plotCorrelation or '
109+
'or plotPCA.',
110+
type=parserCommon.writableFile)
111+
112+
optional = parser.add_argument_group('Optional arguments')
113+
114+
optional.add_argument("--help", "-h", action="help",
115+
help="show this help message and exit")
116+
optional.add_argument('--labels', '-l',
117+
metavar='sample1 sample2',
118+
help='User defined labels instead of default labels from '
119+
'file names. '
120+
'Multiple labels have to be separated by a space, e.g. '
121+
'--labels sample1 sample2 sample3',
122+
nargs='+')
123+
optional.add_argument('--smartLabels',
124+
action='store_true',
125+
help='Instead of manually specifying labels for the input '
126+
'BAM files, this causes deepTools to use the file name '
127+
'after removing the path and extension.')
128+
129+
optional.add_argument('--genomeChunkSize',
130+
type=int,
131+
default=None,
132+
help='Manually specify the size of the genome provided to each processor. '
133+
'The default value of None specifies that this is determined by read '
134+
'density of the BAM file.')
135+
136+
if case == 'bins':
137+
optional.add_argument('--binSize', '-bs',
138+
metavar='INT',
139+
help='Length in bases of the window used '
140+
'to sample the genome. (Default: %(default)s)',
141+
default=10000,
142+
type=int)
143+
144+
optional.add_argument('--distanceBetweenBins', '-n',
145+
metavar='INT',
146+
help='By default, multiBamSummary considers consecutive '
147+
'bins of the specified --binSize. However, to '
148+
'reduce the computation time, a larger distance '
149+
'between bins can by given. Larger distances '
150+
'result in fewer bins considered. (Default: %(default)s)',
151+
default=0,
152+
type=int)
153+
154+
required.add_argument('--BED',
155+
help=argparse.SUPPRESS,
156+
default=None)
157+
else:
158+
optional.add_argument('--binSize', '-bs',
159+
help=argparse.SUPPRESS,
160+
default=10000,
161+
type=int)
162+
163+
optional.add_argument('--distanceBetweenBins', '-n',
164+
help=argparse.SUPPRESS,
165+
metavar='INT',
166+
default=0,
167+
type=int)
168+
169+
required.add_argument('--BED',
170+
help='Limits the coverage analysis to '
171+
'the regions specified in these files.',
172+
metavar='FILE1.bed FILE2.bed',
173+
nargs='+',
174+
required=True)
175+
176+
group = parser.add_argument_group('Output optional options')
177+
178+
group.add_argument('--outRawCounts',
179+
help='Save the counts per region to a tab-delimited file.',
180+
type=parserCommon.writableFile,
181+
metavar='FILE')
182+
183+
group.add_argument('--scalingFactors',
184+
help='Compute scaling factors (in the DESeq2 manner) '
185+
'compatible for use with bamCoverage and write them to a '
186+
'file. The file has tab-separated columns "sample" and '
187+
'"scalingFactor".',
188+
type=parserCommon.writableFile,
189+
metavar='FILE')
190+
191+
return parser
192+
193+
194+
def process_args(args=None):
195+
args = parse_arguments().parse_args(args)
196+
197+
if len(sys.argv) == 1:
198+
parse_arguments().print_help()
199+
sys.exit()
200+
201+
if args.labels and len(args.bamfiles) != len(args.labels):
202+
print("The number of labels does not match the number of bam files.")
203+
exit(0)
204+
if not args.labels:
205+
if args.smartLabels:
206+
args.labels = smartLabels(args.bamfiles)
207+
else:
208+
args.labels = [os.path.basename(x) for x in args.bamfiles]
209+
210+
if not args.BED:
211+
args.BED = "None"
212+
if not args.region:
213+
args.region = []
214+
if not args.blackListFileName:
215+
args.blackListFileName = "None"
216+
if not args.outRawCounts:
217+
args.outRawCounts = "None"
218+
if not args.scalingFactors:
219+
args.scalingFactors = "None"
220+
# defaults for the filtering options
221+
if not args.samFlagInclude:
222+
args.samFlagInclude = 0
223+
if not args.samFlagExclude:
224+
args.samFlagExclude = 0
225+
if not args.minFragmentLength:
226+
args.minFragmentLength = 0
227+
if not args.maxFragmentLength:
228+
args.maxFragmentLength = 0
229+
if not args.minMappingQuality:
230+
args.minMappingQuality = 0
231+
return args
232+
233+
234+
def main(args=None):
235+
"""
236+
1. get read counts at different positions either
237+
all of same length or from genomic regions from the BED file
238+
239+
2. save data for further plotting
240+
241+
"""
242+
args = process_args(args)
243+
print(f"args = {args}")
244+
print("running r_mbams")
245+
r_mbams(
246+
args.command,
247+
args.bamfiles,
248+
args.outFileName,
249+
args.outRawCounts,
250+
args.scalingFactors,
251+
args.labels,
252+
args.binSize,
253+
args.distanceBetweenBins,
254+
args.numberOfProcessors,
255+
args.BED,
256+
args.region,
257+
args.blackListFileName,
258+
args.verbose,
259+
args.extendReads,
260+
args.centerReads,
261+
args.samFlagInclude,
262+
args.samFlagExclude,
263+
args.minFragmentLength,
264+
args.maxFragmentLength,
265+
args.minMappingQuality,
266+
args.keepExons,
267+
args.transcriptID,
268+
args.exonID,
269+
args.transcript_id_designator
270+
)
271+
272+
# if 'BED' in args:
273+
# bed_regions = args.BED
274+
# else:
275+
# bed_regions = None
276+
277+
# if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors):
278+
# sys.stderr.write("You've input a single BAM file and not specified "
279+
# "--outRawCounts or --scalingFactors. The resulting output will NOT be "
280+
# "useful with any deepTools program!\n")
281+
282+
# stepsize = args.binSize + args.distanceBetweenBins
283+
# c = countR.CountReadsPerBin(
284+
# args.bamfiles,
285+
# args.binSize,
286+
# numberOfSamples=None,
287+
# genomeChunkSize=args.genomeChunkSize,
288+
# numberOfProcessors=args.numberOfProcessors,
289+
# verbose=args.verbose,
290+
# region=args.region,
291+
# bedFile=bed_regions,
292+
# blackListFileName=args.blackListFileName,
293+
# extendReads=args.extendReads,
294+
# minMappingQuality=args.minMappingQuality,
295+
# ignoreDuplicates=args.ignoreDuplicates,
296+
# center_read=args.centerReads,
297+
# samFlag_include=args.samFlagInclude,
298+
# samFlag_exclude=args.samFlagExclude,
299+
# minFragmentLength=args.minFragmentLength,
300+
# maxFragmentLength=args.maxFragmentLength,
301+
# stepSize=stepsize,
302+
# zerosToNans=False,
303+
# out_file_for_raw_data=args.outRawCounts)
304+
305+
# num_reads_per_bin = c.run(allArgs=args)
306+
307+
# sys.stderr.write("Number of bins "
308+
# "found: {}\n".format(num_reads_per_bin.shape[0]))
309+
310+
# if num_reads_per_bin.shape[0] < 2:
311+
# exit("ERROR: too few non zero bins found.\n"
312+
# "If using --region please check that this "
313+
# "region is covered by reads.\n")
314+
315+
# # numpy will append .npz to the file name if we don't do this...
316+
# if args.outFileName:
317+
# f = open(args.outFileName, "wb")
318+
# np.savez_compressed(f,
319+
# matrix=num_reads_per_bin,
320+
# labels=args.labels)
321+
# f.close()
322+
323+
# if args.scalingFactors:
324+
# f = open(args.scalingFactors, 'w')
325+
# f.write("sample\tscalingFactor\n")
326+
# scalingFactors = countR.estimateSizeFactors(num_reads_per_bin)
327+
# for sample, scalingFactor in zip(args.labels, scalingFactors):
328+
# f.write("{}\t{:6.4f}\n".format(sample, scalingFactor))
329+
# f.close()
330+
331+
# if args.outRawCounts:
332+
# # append to the generated file the
333+
# # labels
334+
# header = "#'chr'\t'start'\t'end'\t"
335+
# header += "'" + "'\t'".join(args.labels) + "'\n"
336+
# f = open(args.outRawCounts, 'r+')
337+
# content = f.read()
338+
# f.seek(0, 0)
339+
# f.write(header + content)
340+
# f.close()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,4 @@ bamCoverage2 = "deeptools.bamCoverage2:main"
8888
bamCompare2 = "deeptools.bamCompare2:main"
8989
computeMatrix2 = "deeptools.computeMatrix2:main"
9090
alignmentSieve2 = "deeptools.alignmentSieve2:main"
91+
multiBamSummary2 = "deeptools.multiBamSummary2:main"

src/alignmentsieve.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ pub fn r_alignmentsieve(
2727
_blacklist: &str, // blacklist file name.
2828
min_fragment_length: u32, // minimum fragment length.
2929
max_fragment_length: u32, // maximum fragment length.
30+
extend_reads: u32,
31+
center_reads: bool,
32+
3033
) -> PyResult<()> {
3134
// Input bam file
3235
let mut bam = Reader::from_path(bamifile).unwrap();

src/bamcompare.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,12 @@ pub fn r_bamcompare(
143143
Ok(())
144144
}
145145

146-
struct ParsedBamFile<'a> {
147-
bamfile: &'a str,
148-
ispe: bool,
149-
bg: Vec<TempPath>,
150-
mapped: u32,
151-
unmapped: u32,
152-
readlen: f32,
153-
fraglen: f32
146+
pub struct ParsedBamFile<'a> {
147+
pub bamfile: &'a str,
148+
pub ispe: bool,
149+
pub bg: Vec<TempPath>,
150+
pub mapped: u32,
151+
pub unmapped: u32,
152+
pub readlen: f32,
153+
pub fraglen: f32
154154
}

src/calc.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,4 +155,4 @@ pub fn calc_ratio(
155155
return (num / den).log2();
156156
}
157157
}
158-
}
158+
}

0 commit comments

Comments
 (0)