|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import os |
| 5 | +import sys |
| 6 | +import argparse |
| 7 | +import numpy as np |
| 8 | + |
| 9 | +import deeptools.countReadsPerBin as countR |
| 10 | +from deeptools import parserCommon |
| 11 | +from deeptools.utilities import smartLabels |
| 12 | +from importlib.metadata import version |
| 13 | +old_settings = np.seterr(all='ignore') |
| 14 | + |
| 15 | +from deeptools.hp import r_mbams |
| 16 | + |
| 17 | +def parse_arguments(args=None): |
| 18 | + parser = \ |
| 19 | + argparse.ArgumentParser( |
| 20 | + formatter_class=argparse.RawDescriptionHelpFormatter, |
| 21 | + description=""" |
| 22 | +
|
| 23 | +``multiBamSummary`` computes the read coverages for genomic regions for typically two or more BAM files. |
| 24 | +The analysis can be performed for the entire genome by running the program in 'bins' mode. |
| 25 | +If you want to count the read coverage for specific regions only, use the ``BED-file`` mode instead. |
| 26 | +The standard output of ``multiBamSummary`` is a compressed numpy array (``.npz``). |
| 27 | +It can be directly used to calculate and visualize pairwise correlation values between the read coverages using the tool 'plotCorrelation'. |
| 28 | +Similarly, ``plotPCA`` can be used for principal component analysis of the read coverages using the .npz file. |
| 29 | +Note that using a single bigWig file is only recommended if you want to produce a bedGraph file (i.e., with the ``--outRawCounts`` option; the default output file cannot be used by ANY deepTools program if only a single file was supplied!). |
| 30 | +
|
| 31 | +A detailed sub-commands help is available by typing: |
| 32 | +
|
| 33 | + multiBamSummary bins -h |
| 34 | +
|
| 35 | + multiBamSummary BED-file -h |
| 36 | +
|
| 37 | +
|
| 38 | +""", |
| 39 | + epilog='example usages:\n' |
| 40 | + 'multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz \n\n' |
| 41 | + 'multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam \n' |
| 42 | + '-o results.npz' |
| 43 | + ' \n\n', |
| 44 | + conflict_handler='resolve') |
| 45 | + |
| 46 | + parser.add_argument('--version', action='version', |
| 47 | + version='%(prog)s {}'.format(version('deeptools'))) |
| 48 | + subparsers = parser.add_subparsers( |
| 49 | + title="commands", |
| 50 | + dest='command', |
| 51 | + description='subcommands', |
| 52 | + help='subcommands', |
| 53 | + metavar='') |
| 54 | + |
| 55 | + parent_parser = parserCommon.getParentArgParse(binSize=False) |
| 56 | + read_options_parser = parserCommon.read_options() |
| 57 | + |
| 58 | + # bins mode options |
| 59 | + subparsers.add_parser( |
| 60 | + 'bins', |
| 61 | + formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
| 62 | + parents=[bamcorrelate_args(case='bins'), |
| 63 | + parent_parser, read_options_parser, |
| 64 | + parserCommon.gtf_options(suppress=True) |
| 65 | + ], |
| 66 | + help="The coverage calculation is done for consecutive bins of equal " |
| 67 | + "size (10 kilobases by default). This mode is useful to assess the " |
| 68 | + "genome-wide similarity of BAM files. The bin size and " |
| 69 | + "distance between bins can be adjusted.", |
| 70 | + add_help=False, |
| 71 | + usage='%(prog)s ' |
| 72 | + '--bamfiles file1.bam file2.bam ' |
| 73 | + '-o results.npz \n' |
| 74 | + 'help: multiBamSummary bins -h / multiBamSummary bins --help\n') |
| 75 | + |
| 76 | + # BED file arguments |
| 77 | + subparsers.add_parser( |
| 78 | + 'BED-file', |
| 79 | + formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
| 80 | + parents=[bamcorrelate_args(case='BED-file'), |
| 81 | + parent_parser, read_options_parser, |
| 82 | + parserCommon.gtf_options() |
| 83 | + ], |
| 84 | + help="The user provides a BED file that contains all regions " |
| 85 | + "that should be considered for the coverage analysis. A " |
| 86 | + "common use is to compare ChIP-seq coverages between two " |
| 87 | + "different samples for a set of peak regions.", |
| 88 | + usage='%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz\n' |
| 89 | + 'help: multiBamSummary BED-file -h / multiBamSummary bins --help\n', |
| 90 | + add_help=False) |
| 91 | + |
| 92 | + return parser |
| 93 | + |
| 94 | + |
| 95 | +def bamcorrelate_args(case='bins'): |
| 96 | + parser = argparse.ArgumentParser(add_help=False) |
| 97 | + required = parser.add_argument_group('Required arguments') |
| 98 | + |
| 99 | + # define the arguments |
| 100 | + required.add_argument('--bamfiles', '-b', |
| 101 | + metavar='FILE1 FILE2', |
| 102 | + help='List of indexed bam files separated by spaces.', |
| 103 | + nargs='+', |
| 104 | + required=True) |
| 105 | + |
| 106 | + required.add_argument('--outFileName', '-out', '-o', |
| 107 | + help='File name to save the coverage matrix. This matrix ' |
| 108 | + 'can be subsequently plotted using plotCorrelation or ' |
| 109 | + 'or plotPCA.', |
| 110 | + type=parserCommon.writableFile) |
| 111 | + |
| 112 | + optional = parser.add_argument_group('Optional arguments') |
| 113 | + |
| 114 | + optional.add_argument("--help", "-h", action="help", |
| 115 | + help="show this help message and exit") |
| 116 | + optional.add_argument('--labels', '-l', |
| 117 | + metavar='sample1 sample2', |
| 118 | + help='User defined labels instead of default labels from ' |
| 119 | + 'file names. ' |
| 120 | + 'Multiple labels have to be separated by a space, e.g. ' |
| 121 | + '--labels sample1 sample2 sample3', |
| 122 | + nargs='+') |
| 123 | + optional.add_argument('--smartLabels', |
| 124 | + action='store_true', |
| 125 | + help='Instead of manually specifying labels for the input ' |
| 126 | + 'BAM files, this causes deepTools to use the file name ' |
| 127 | + 'after removing the path and extension.') |
| 128 | + |
| 129 | + optional.add_argument('--genomeChunkSize', |
| 130 | + type=int, |
| 131 | + default=None, |
| 132 | + help='Manually specify the size of the genome provided to each processor. ' |
| 133 | + 'The default value of None specifies that this is determined by read ' |
| 134 | + 'density of the BAM file.') |
| 135 | + |
| 136 | + if case == 'bins': |
| 137 | + optional.add_argument('--binSize', '-bs', |
| 138 | + metavar='INT', |
| 139 | + help='Length in bases of the window used ' |
| 140 | + 'to sample the genome. (Default: %(default)s)', |
| 141 | + default=10000, |
| 142 | + type=int) |
| 143 | + |
| 144 | + optional.add_argument('--distanceBetweenBins', '-n', |
| 145 | + metavar='INT', |
| 146 | + help='By default, multiBamSummary considers consecutive ' |
| 147 | + 'bins of the specified --binSize. However, to ' |
| 148 | + 'reduce the computation time, a larger distance ' |
| 149 | + 'between bins can by given. Larger distances ' |
| 150 | + 'result in fewer bins considered. (Default: %(default)s)', |
| 151 | + default=0, |
| 152 | + type=int) |
| 153 | + |
| 154 | + required.add_argument('--BED', |
| 155 | + help=argparse.SUPPRESS, |
| 156 | + default=None) |
| 157 | + else: |
| 158 | + optional.add_argument('--binSize', '-bs', |
| 159 | + help=argparse.SUPPRESS, |
| 160 | + default=10000, |
| 161 | + type=int) |
| 162 | + |
| 163 | + optional.add_argument('--distanceBetweenBins', '-n', |
| 164 | + help=argparse.SUPPRESS, |
| 165 | + metavar='INT', |
| 166 | + default=0, |
| 167 | + type=int) |
| 168 | + |
| 169 | + required.add_argument('--BED', |
| 170 | + help='Limits the coverage analysis to ' |
| 171 | + 'the regions specified in these files.', |
| 172 | + metavar='FILE1.bed FILE2.bed', |
| 173 | + nargs='+', |
| 174 | + required=True) |
| 175 | + |
| 176 | + group = parser.add_argument_group('Output optional options') |
| 177 | + |
| 178 | + group.add_argument('--outRawCounts', |
| 179 | + help='Save the counts per region to a tab-delimited file.', |
| 180 | + type=parserCommon.writableFile, |
| 181 | + metavar='FILE') |
| 182 | + |
| 183 | + group.add_argument('--scalingFactors', |
| 184 | + help='Compute scaling factors (in the DESeq2 manner) ' |
| 185 | + 'compatible for use with bamCoverage and write them to a ' |
| 186 | + 'file. The file has tab-separated columns "sample" and ' |
| 187 | + '"scalingFactor".', |
| 188 | + type=parserCommon.writableFile, |
| 189 | + metavar='FILE') |
| 190 | + |
| 191 | + return parser |
| 192 | + |
| 193 | + |
| 194 | +def process_args(args=None): |
| 195 | + args = parse_arguments().parse_args(args) |
| 196 | + |
| 197 | + if len(sys.argv) == 1: |
| 198 | + parse_arguments().print_help() |
| 199 | + sys.exit() |
| 200 | + |
| 201 | + if args.labels and len(args.bamfiles) != len(args.labels): |
| 202 | + print("The number of labels does not match the number of bam files.") |
| 203 | + exit(0) |
| 204 | + if not args.labels: |
| 205 | + if args.smartLabels: |
| 206 | + args.labels = smartLabels(args.bamfiles) |
| 207 | + else: |
| 208 | + args.labels = [os.path.basename(x) for x in args.bamfiles] |
| 209 | + |
| 210 | + if not args.BED: |
| 211 | + args.BED = "None" |
| 212 | + if not args.region: |
| 213 | + args.region = [] |
| 214 | + if not args.blackListFileName: |
| 215 | + args.blackListFileName = "None" |
| 216 | + if not args.outRawCounts: |
| 217 | + args.outRawCounts = "None" |
| 218 | + if not args.scalingFactors: |
| 219 | + args.scalingFactors = "None" |
| 220 | + # defaults for the filtering options |
| 221 | + if not args.samFlagInclude: |
| 222 | + args.samFlagInclude = 0 |
| 223 | + if not args.samFlagExclude: |
| 224 | + args.samFlagExclude = 0 |
| 225 | + if not args.minFragmentLength: |
| 226 | + args.minFragmentLength = 0 |
| 227 | + if not args.maxFragmentLength: |
| 228 | + args.maxFragmentLength = 0 |
| 229 | + if not args.minMappingQuality: |
| 230 | + args.minMappingQuality = 0 |
| 231 | + return args |
| 232 | + |
| 233 | + |
| 234 | +def main(args=None): |
| 235 | + """ |
| 236 | + 1. get read counts at different positions either |
| 237 | + all of same length or from genomic regions from the BED file |
| 238 | +
|
| 239 | + 2. save data for further plotting |
| 240 | +
|
| 241 | + """ |
| 242 | + args = process_args(args) |
| 243 | + print(f"args = {args}") |
| 244 | + print("running r_mbams") |
| 245 | + r_mbams( |
| 246 | + args.command, |
| 247 | + args.bamfiles, |
| 248 | + args.outFileName, |
| 249 | + args.outRawCounts, |
| 250 | + args.scalingFactors, |
| 251 | + args.labels, |
| 252 | + args.binSize, |
| 253 | + args.distanceBetweenBins, |
| 254 | + args.numberOfProcessors, |
| 255 | + args.BED, |
| 256 | + args.region, |
| 257 | + args.blackListFileName, |
| 258 | + args.verbose, |
| 259 | + args.extendReads, |
| 260 | + args.centerReads, |
| 261 | + args.samFlagInclude, |
| 262 | + args.samFlagExclude, |
| 263 | + args.minFragmentLength, |
| 264 | + args.maxFragmentLength, |
| 265 | + args.minMappingQuality, |
| 266 | + args.keepExons, |
| 267 | + args.transcriptID, |
| 268 | + args.exonID, |
| 269 | + args.transcript_id_designator |
| 270 | + ) |
| 271 | + |
| 272 | + # if 'BED' in args: |
| 273 | + # bed_regions = args.BED |
| 274 | + # else: |
| 275 | + # bed_regions = None |
| 276 | + |
| 277 | + # if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors): |
| 278 | + # sys.stderr.write("You've input a single BAM file and not specified " |
| 279 | + # "--outRawCounts or --scalingFactors. The resulting output will NOT be " |
| 280 | + # "useful with any deepTools program!\n") |
| 281 | + |
| 282 | + # stepsize = args.binSize + args.distanceBetweenBins |
| 283 | + # c = countR.CountReadsPerBin( |
| 284 | + # args.bamfiles, |
| 285 | + # args.binSize, |
| 286 | + # numberOfSamples=None, |
| 287 | + # genomeChunkSize=args.genomeChunkSize, |
| 288 | + # numberOfProcessors=args.numberOfProcessors, |
| 289 | + # verbose=args.verbose, |
| 290 | + # region=args.region, |
| 291 | + # bedFile=bed_regions, |
| 292 | + # blackListFileName=args.blackListFileName, |
| 293 | + # extendReads=args.extendReads, |
| 294 | + # minMappingQuality=args.minMappingQuality, |
| 295 | + # ignoreDuplicates=args.ignoreDuplicates, |
| 296 | + # center_read=args.centerReads, |
| 297 | + # samFlag_include=args.samFlagInclude, |
| 298 | + # samFlag_exclude=args.samFlagExclude, |
| 299 | + # minFragmentLength=args.minFragmentLength, |
| 300 | + # maxFragmentLength=args.maxFragmentLength, |
| 301 | + # stepSize=stepsize, |
| 302 | + # zerosToNans=False, |
| 303 | + # out_file_for_raw_data=args.outRawCounts) |
| 304 | + |
| 305 | + # num_reads_per_bin = c.run(allArgs=args) |
| 306 | + |
| 307 | + # sys.stderr.write("Number of bins " |
| 308 | + # "found: {}\n".format(num_reads_per_bin.shape[0])) |
| 309 | + |
| 310 | + # if num_reads_per_bin.shape[0] < 2: |
| 311 | + # exit("ERROR: too few non zero bins found.\n" |
| 312 | + # "If using --region please check that this " |
| 313 | + # "region is covered by reads.\n") |
| 314 | + |
| 315 | + # # numpy will append .npz to the file name if we don't do this... |
| 316 | + # if args.outFileName: |
| 317 | + # f = open(args.outFileName, "wb") |
| 318 | + # np.savez_compressed(f, |
| 319 | + # matrix=num_reads_per_bin, |
| 320 | + # labels=args.labels) |
| 321 | + # f.close() |
| 322 | + |
| 323 | + # if args.scalingFactors: |
| 324 | + # f = open(args.scalingFactors, 'w') |
| 325 | + # f.write("sample\tscalingFactor\n") |
| 326 | + # scalingFactors = countR.estimateSizeFactors(num_reads_per_bin) |
| 327 | + # for sample, scalingFactor in zip(args.labels, scalingFactors): |
| 328 | + # f.write("{}\t{:6.4f}\n".format(sample, scalingFactor)) |
| 329 | + # f.close() |
| 330 | + |
| 331 | + # if args.outRawCounts: |
| 332 | + # # append to the generated file the |
| 333 | + # # labels |
| 334 | + # header = "#'chr'\t'start'\t'end'\t" |
| 335 | + # header += "'" + "'\t'".join(args.labels) + "'\n" |
| 336 | + # f = open(args.outRawCounts, 'r+') |
| 337 | + # content = f.read() |
| 338 | + # f.seek(0, 0) |
| 339 | + # f.write(header + content) |
| 340 | + # f.close() |
0 commit comments