From bbc204a1522a049006063404849c36034f5b4225 Mon Sep 17 00:00:00 2001 From: James Ashmore Date: Wed, 14 Feb 2024 14:25:17 +0000 Subject: [PATCH 1/3] Initial commit --- CITATIONS.md | 4 + README.md | 2 + bin/trimRRBSdiversityAdaptCustomers.py | 485 +++++++++++++++++++++++++ conf/modules.config | 18 +- docs/output.md | 17 + modules/local/trimdiversity/main.nf | 67 ++++ modules/local/trimdiversity/meta.yml | 48 +++ nextflow.config | 1 + nextflow_schema.json | 10 +- workflows/methylseq.nf | 19 +- 10 files changed, 666 insertions(+), 5 deletions(-) create mode 100755 bin/trimRRBSdiversityAdaptCustomers.py create mode 100644 modules/local/trimdiversity/main.nf create mode 100644 modules/local/trimdiversity/meta.yml diff --git a/CITATIONS.md b/CITATIONS.md index 87c02036c..97ec05b63 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -29,6 +29,10 @@ +- [NuMetRRBS](https://github.com/nugentechnologies/NuMetRRBS) + + > + - [Bismark](https://doi.org/10.1093/bioinformatics/btr167) > Felix Krueger, Simon R. Andrews, Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications, Bioinformatics, Volume 27, Issue 11, 1 June 2011, Pages 1571–1572, doi: [10.1093/bioinformatics/btr167](https://doi.org/10.1093/bioinformatics/btr167) diff --git a/README.md b/README.md index 0602909ee..3eedaefc0 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Choose between workflows by using `--aligner bismark` (default, uses bowtie2 for | Merge re-sequenced FastQ files | cat | cat | | Raw data QC | FastQC | FastQC | | Adapter sequence trimming | Trim Galore! | Trim Galore! | +| Diversity sequence trimming | NuMetRRBS | NuMetRRBS | | Align Reads | Bismark | bwa-meth | | Deduplicate Alignments | Bismark | Picard MarkDuplicates | | Extract methylation calls | Bismark | MethylDackel | @@ -95,6 +96,7 @@ These scripts were originally written for use at the [National Genomics Infrastr - Rickard Hammarén ([@Hammarn](https://github.com/Hammarn/)) - Alexander Peltzer ([@apeltzer](https://github.com/apeltzer/)) - Patrick Hüther ([@phue](https://github.com/phue/)) + - James Ashmore ([@jma1991](https://github.com/jma1991/)) ## Contributions and Support diff --git a/bin/trimRRBSdiversityAdaptCustomers.py b/bin/trimRRBSdiversityAdaptCustomers.py new file mode 100755 index 000000000..4efd0cc02 --- /dev/null +++ b/bin/trimRRBSdiversityAdaptCustomers.py @@ -0,0 +1,485 @@ +#!/usr/bin/env python + +from getopt import getopt +import sys +import os +from glob import glob +import gzip +from types import StringTypes + + +HELP_STRING = """Given a set of paired end read files, performs the trimming for RRBS with +diversity adapters. Make sure the pattern names are in quotations. + + -1 pattern for forward read files (for example, "R1_1_edit3_6bp_BC??.fq") + -2 pattern for reverse read files (for example, "R2_1_edit3_6bp_BC??.fq") + +If a -2 parameter is not provided then it will be assumed that the data is single ended. +The trimmed files will be named for the original files + "_trimmed.fq". + + + +VERSION: $Revision: 1.11 $ + +""" + +# DEFAULTS +BASE_POSITIONS = ["A", "C", "G", "T", "N"] +NUM_BASES = len(BASE_POSITIONS) +SAVE_UNDIGESTED = False +SHOW_STATS = False # When this is set to False, no report of the diversity will be printed. This needs to be toggled in the code if you want to turn it on. +USED_DGG = False # When this is set to False, the new version using RGG will be used for the analysis. This version of RGG is more specific than the previously used DGG. This needs to be toggled in the code if you want to turn it on. + +# FUNCTION HELPERS +def getEmptyStats(): + d = {"filename":0, + "nrBothRE":0, + "nrFwdRE":0, + "nrRevRE":0, + "nrNoRE":0, + "nrFwdREisC":0, + "nrFwdREisT":0, + "bdFwd4thBase":[0,0,0,0,0], + "nrD0":0, + "nrCGGCGG":0, + "nrTGGTGG":0, + "nrCGGTGG":0, + "nrTGGCGG":0, + "bdD1base1":[0,0,0,0,0], + "bdD2base1":[0,0,0,0,0], + "bdD2base2":[0,0,0,0,0], + "bdD3base1":[0,0,0,0,0], + "bdD3base2":[0,0,0,0,0], + "bdD3base3":[0,0,0,0,0], + "bdRev4thBase":[0,0,0,0,0], + "nrRevD0":0, + "bdRevD1base1":[0,0,0,0,0], + "bdRevD2base1":[0,0,0,0,0], + "bdRevD2base2":[0,0,0,0,0], + "bdRevD3base1":[0,0,0,0,0], + "bdRevD3base2":[0,0,0,0,0], + "bdRevD3base3":[0,0,0,0,0] + } + + return d + +def FastqIterator(fh): + """return an iterator of Records found in file handle, fh. + """ + def readTotitle(fh, titleChar): + """returns a tuple ([lines before the next title line], next tile line) + """ + preLines = [] + while True: + l = fh.readline().strip() + if l.startswith(titleChar): + return (preLines,l) + elif l == '': + return preLines,None + else: + preLines.append(l) + + if type(fh) in StringTypes: + if (fh.endswith(".gz")): + fh = gzip.open(fh,'r') + else: + fh = file(fh) + + preLines,nextTitleLine =readTotitle(fh,'@') + + while nextTitleLine != None: + seqTitle = nextTitleLine[1:].rstrip() + preLines,nextTitleLine=readTotitle(fh,'+') + qualTitle = nextTitleLine[1:].rstrip() + if len(qualTitle.strip()) > 0 and seqTitle != qualTitle: + print seqTitle + print preLines + print qualTitle + raise Exception("Error in parsing: @title sequence entry must be immediately followed by corresponding +title quality entry.") + seqLines = preLines + qualLines = [] + for i in range(len(seqLines)): # Quality characters should be the same length as the sequence + qualLines.append( fh.readline().strip() ) + + preLines,nextTitleLine=readTotitle(fh,'@') + + yield (seqTitle, ''.join(seqLines), ''.join(qualLines)) + +def trimOneRecord(fwdTitle, fwdSeq, fwdQual, revTitle, revSeq, revQual, outFwd, outRev, stats): + + + if msp1 == True: + fwdmPos = fwdSeq[:6].find("CGG") + fwdnmPos = fwdSeq[:6].find("TGG") + elif taq1 == True: + fwdmPos = fwdSeq[:6].find("TGA") + fwdnmPos = fwdSeq[:6].find("CGA") + elif both_taq1_msp1 == True: + fwdmPos = fwdSeq[:6].find("CGG") + fwdnmPos = fwdSeq[:6].find("TGG") + if fwdmPos == -1: + fwdmPos = fwdSeq[:6].find("TGA") + if fwdnmPos == -1: + fwdnmPos = fwdSeq[:6].find("CGA") + + # figure out CGR position + if revSeq == None: + cgaPos = -1 + elif revSeq: + cgaPos = revSeq[:6].find("CGA") + if cgaPos == -1: + if msp1 == True or both_taq1_msp1 == True: + cgaPos = revSeq[:6].find("CGG") + + # if a read doesn't have a fwd and rev MSP site, end early without writing to output + if fwdmPos == -1 and fwdnmPos == -1 and cgaPos == -1: + stats["nrNoRE"] += 1 + if not SAVE_UNDIGESTED: + return + elif cgaPos == -1 and revSeq != None: + stats["nrFwdRE"] += 1 + if not SAVE_UNDIGESTED: + return + elif cgaPos == -1 and revSeq == None: ##except if it's a single read + stats["nrFwdRE"] += 1 + elif fwdmPos == -1 and fwdnmPos == -1: + stats["nrRevRE"] += 1 + if not SAVE_UNDIGESTED: + return + else: + stats["nrBothRE"] += 1 + + ## FORWARD READ + # figure out ygg position + s = fwdSeq[:6].upper() + if s == "CGGCGG": + yggPos = 0 + stats["nrCGGCGG"] += 1 + elif s == "TGGTGG": + if USED_DGG: + yggPos = 3 + else: + yggPos = 0 + stats["nrTGGTGG"] += 1 + elif s == "CGGTGG": + yggPos = 0 + stats["nrCGGTGG"] += 1 + elif s == "TGGCGG": + if USED_DGG: + yggPos = 3 + else: + yggPos = 0 + stats["nrTGGCGG"] += 1 + else: + yggPos = max(fwdmPos, fwdnmPos) + + if yggPos == 0: + stats["nrD0"] += 1 + elif yggPos == 1: + if fwdSeq[0] not in BASE_POSITIONS: + print "Hmm... odd letter %s in %s at fwdSeq[0] when yggPos == 1" % (fwdSeq[1], fwdTitle) + for i in range(NUM_BASES): + if fwdSeq[0] == BASE_POSITIONS[i]: + stats["bdD1base1"][i] += 1 + elif yggPos == 2: + if fwdSeq[0] not in BASE_POSITIONS: + print "Hmm... odd letter %s in %s at fwdSeq[0] when yggPos == 2" % (fwdSeq[1], fwdTitle) + if fwdSeq[1] not in BASE_POSITIONS: + print "Hmm... odd letter %s in %s at fwdSeq[1] when yggPos == 2" % (fwdSeq[1], fwdTitle) + for i in range(NUM_BASES): + if fwdSeq[0] == BASE_POSITIONS[i]: + stats["bdD2base1"][i] += 1 + for i in range(NUM_BASES): + if fwdSeq[1] == BASE_POSITIONS[i]: + stats["bdD2base2"][i] += 1 + elif yggPos == 3: + if fwdSeq[0] not in BASE_POSITIONS: + print "Hmm... odd letter %s in %s at fwdSeq[0] when yggPos == 3" % (fwdSeq[0], fwdTitle) + if fwdSeq[1] not in BASE_POSITIONS: + print "Hmm... odd letter %s in %s at fwdSeq[1] when yggPos == 3" % (fwdSeq[1], fwdTitle) + if fwdSeq[2] not in BASE_POSITIONS: + print "Hmm... odd letter %s in %s at fwdSeq[2] when yggPos == 3" % (fwdSeq[2], fwdTitle) + for i in range(NUM_BASES): + if fwdSeq[0] == BASE_POSITIONS[i]: + stats["bdD3base1"][i] += 1 + for i in range(NUM_BASES): + if fwdSeq[1] == BASE_POSITIONS[i]: + stats["bdD3base2"][i] += 1 + for i in range(NUM_BASES): + if fwdSeq[2] == BASE_POSITIONS[i]: + stats["bdD3base3"][i] += 1 + elif yggPos == -1: + pass + else: + raise Exception("Impossible location for YGG: %s in %s" % (yggPos, fwdSeq)) + + if yggPos >= 0: + # track C vs. T in YGG + if fwdSeq[yggPos] == "C": + stats["nrFwdREisC"] += 1 + else: + stats["nrFwdREisT"] += 1 + + # track the stats on the 4th position + for i in range(NUM_BASES): + if fwdSeq[yggPos+3] == BASE_POSITIONS[i]: + stats["bdFwd4thBase"][i] += 1 + + if yggPos >= 0: # if YGG position is found + if revSeq == None: # if this is a single read, trim only 5 bases from 3' end + outFwd.write("@%s\n%s\n+\n%s\n" % (fwdTitle, fwdSeq[yggPos:-5], fwdQual[yggPos:-5])) # trim off the diversity before YGG site and 5 bases from 3' end + elif revSeq != None: # if this is a paired end read, trim 1 extra base from the 3' end, total of 6 bases so that alignment will be better for short fragments. + outFwd.write("@%s\n%s\n+\n%s\n" % (fwdTitle, fwdSeq[yggPos:-6], fwdQual[yggPos:-6])) # trim off the diversity before YGG site and 6 bases from the 3' end + else: # if no YGG site is found + if revSeq == None: # if this is a single read, trim only 5 bases from 3' end + outFwd.write("@%s\n%s\n+\n%s\n" % (fwdTitle, fwdSeq[:-5], fwdQual[:-5])) + elif revSeq != None: # if this is a paired end read, trim 1 extra base from the 3' end, total of 6 bases so that alignment will be better for short fragments. + outFwd.write("@%s\n%s\n+\n%s\n" % (fwdTitle, fwdSeq[:-6], fwdQual[:-6])) + + + ##REVERSE READ + if not revSeq: + return + + # track the D0, D1, D2, D3 stats and the base composition of the trimmed bases + if cgaPos == 0: + stats["nrRevD0"] += 1 + elif cgaPos == 1: + for i in range(NUM_BASES): + if revSeq[0] == BASE_POSITIONS[i]: + stats["bdRevD1base1"][i] += 1 + elif cgaPos == 2: + for i in range(NUM_BASES): + if revSeq[0] == BASE_POSITIONS[i]: + stats["bdRevD2base1"][i] += 1 + for i in range(NUM_BASES): + if revSeq[1] == BASE_POSITIONS[i]: + stats["bdRevD2base2"][i] += 1 + elif cgaPos == 3: + for i in range(NUM_BASES): + if revSeq[0] == BASE_POSITIONS[i]: + stats["bdRevD3base1"][i] += 1 + for i in range(NUM_BASES): + if revSeq[1] == BASE_POSITIONS[i]: + stats["bdRevD3base2"][i] += 1 + for i in range(NUM_BASES): + if revSeq[2] == BASE_POSITIONS[i]: + stats["bdRevD3base3"][i] += 1 + elif cgaPos == -1: + # this will only happen if we keep going with reads not full digested + pass + else: + raise Exception("Impossible location for CGA: %s in %s" % (cgaPos, revSeq)) + + # track the stats on the 4th position + if cgaPos >= 0: + for i in range(NUM_BASES): + if revSeq[cgaPos+3] == BASE_POSITIONS[i]: + stats["bdRev4thBase"][i] += 1 + + if cgaPos >= 0: # if cga position is found + outRev.write("@%s\n%s\n+\n%s\n" % (revTitle, revSeq[cgaPos+2:-6], revQual[cgaPos+2:-6])) # trim the cg from the 5' end and 6 bases from the 3' end + else: # if no cga site is found + outRev.write("@%s\n%s\n+\n%s\n" % (revTitle, revSeq[:-6], revQual[:-6])) # trim the cg from the 5' end and 6 bases from the 3' end + + + + +# retrieve the user parameters +fwdFilenames = None +revFilenames = None +statsFilename = None +usedDGG = USED_DGG +taq1 = False +both_taq1_msp1 = False +msp1 = True + +try: + optlist, args = getopt(sys.argv[1:], "h1:2:o:y:tb") +except: + print "Error retrieving options" + print "" + print HELP_STRING + sys.exit(1) + +for (opt, opt_arg) in optlist: + if opt == "-h": + print "" + print HELP_STRING + sys.exit(1) + elif opt == "-1": + fwdFilenames = opt_arg + elif opt == "-2": + revFilenames = opt_arg + elif opt == "-o": + statsFilename = opt_arg + elif opt == "-t": + taq1 = True + msp1 = False + elif opt == "-b": + both_taq1_msp1 = True + msp1 = False + +# check required parameters exist +if taq1 == True and both_taq1_msp1 == True: + print "\nYou can only choose one restriction enzyme option. If you want to select both enzymes, please enter the -b option only." + print + print HELP_STRING + sys.exit(1) + + +if fwdFilenames == None: + print "\nYou must provide both a fwd fastq filename or both fwd & rev filenames." + print + print HELP_STRING + sys.exit(1) + +if SHOW_STATS: + if statsFilename == None: + print "\nYou must provide an output filename" + print + print HELP_STRING + sys.exit(1) + +# do the actual work +fwdFiles = glob(fwdFilenames) +if revFilenames: + revFiles = glob(revFilenames) +else: + revFiles = [] + +if len(fwdFiles) == 0: + print "There are no files that fit the fwd pattern '%s'" % (fwdFilenames) + print + print HELP_STRING + sys.exit(1) + +if len(revFiles) == 0: + print "A file pattern wasn't provided for read 2. Assuming the run is single ended." + +print "Your files are:" +print "Fwd files:" +for f in fwdFiles: + print "\t%s" % (f) + +if revFilenames: + print "Rev files:" + for f in revFiles: + print "\t%s" % (f) + +if revFilenames: + if len(fwdFiles) != len(revFiles): + print "The fwd and rev files must have the same number of files." + print + print HELP_STRING + sys.exit(1) + +if SHOW_STATS: + statsOut = open(statsFilename, "w") + statsOut.write("\t".join(["Filename", "Both_RE", "Only_Fwd_RE", "Only_Rev_RE", + "No_RE", "Fwd_YG(G/A)_is_C", "Fwd_YG(G/A)_is_T", + "Fwd_4th_is_A", "Fwd_4th_is_C", "Fwd_4th_is_G", "Fwd_4th_is_T", "Fwd_4th_is_N", + "Rev_4th_is_A", "Rev_4th_is_C", "Rev_4th_is_G", "Rev_4th_is_T", "Rev_4th_is_N", + "Fwd_CGGCGG", "Fwd_TGGTGG", "Fwd_CGGTGG", "Fwd_TGGCGG", + "D0_reads", + "D1_base1_is_A", "D1_base1_is_C", "D1_base1_is_G", "D1_base1_is_T", "D1_base1_is_N", + "D2_base1_is_A", "D2_base1_is_C", "D2_base1_is_G", "D2_base1_is_T", "D2_base1_is_N", + "D2_base2_is_A", "D2_base2_is_C", "D2_base2_is_G", "D2_base2_is_T", "D2_base2_is_N", + "D3_base1_is_A", "D3_base1_is_C", "D3_base1_is_G", "D3_base1_is_T", "D3_base1_is_N", + "D3_base2_is_A", "D3_base2_is_C", "D3_base2_is_G", "D3_base2_is_T", "D3_base2_is_N", + "D3_base3_is_A", "D3_base3_is_C", "D3_base3_is_G", "D3_base3_is_T", "D3_base3_is_N", + "Rev_D0_reads", + "Rev_D1_base1_is_A", "Rev_D1_base1_is_C", "Rev_D1_base1_is_G", "Rev_D1_base1_is_T", "Rev_D1_base1_is_N", + "Rev_D2_base1_is_A", "Rev_D2_base1_is_C", "Rev_D2_base1_is_G", "Rev_D2_base1_is_T", "Rev_D2_base1_is_N", + "Rev_D2_base2_is_A", "Rev_D2_base2_is_C", "Rev_D2_base2_is_G", "Rev_D2_base2_is_T", "Rev_D2_base2_is_N", + "Rev_D3_base1_is_A", "Rev_D3_base1_is_C", "Rev_D3_base1_is_G", "Rev_D3_base1_is_T", "Rev_D3_base1_is_N", + "Rev_D3_base2_is_A", "Rev_D3_base2_is_C", "Rev_D3_base2_is_G", "Rev_D3_base2_is_T", "Rev_D3_base2_is_N", + "Rev_D3_base3_is_A", "Rev_D3_base3_is_C", "Rev_D3_base3_is_G", "Rev_D3_base3_is_T", "Rev_D3_base3_is_N"])) + statsOut.write("\n") + +for i in range(len(fwdFiles)): + ##print "Working on pair fwd='%s' and rev='%s'" % (fwdFiles[i], revFiles[i]) + recCount = 1 + fwdIt = FastqIterator(fwdFiles[i]) + (fwdTitle, fwdSeq, fwdQual) = fwdIt.next() + if revFilenames: + revIt = FastqIterator(revFiles[i]) + (revTitle, revSeq, revQual) = revIt.next() + revRoot = os.path.splitext(revFiles[i])[0] + if (revFiles[i].endswith(".gz")): + outRev = gzip.open(revRoot + "_trimmed.fq.gz", "wb") + else: + outRev = open(revRoot + "_trimmed.fq", "w") + else: + revIt = None + revTitle = revSeq = revQual = None + outRev = None + + if revFilenames and fwdTitle.split()[0] != revTitle.split()[0]: + print "The fwd title and rev title don't match" + print "fwd title: '%s'" % (fwdTitle.split()[0]) + print "rev title: '%s'" % (revTitle.split()[0]) + raise Exception("fwd and rev titles don't match. Not paired end sequence.") + + fwdRoot = os.path.splitext(fwdFiles[i])[0] + if (fwdFiles[i].endswith(".gz")): + outFwd = gzip.open(fwdRoot + "_trimmed.fq.gz", "wb") + else: + outFwd = open(fwdRoot + "_trimmed.fq", "w") + stats = getEmptyStats() + stats["filename"] = fwdRoot + statsKeys = stats.keys() + statsKeys.sort() + + + while True: + + # trim one record and add to output + trimOneRecord(fwdTitle, fwdSeq, fwdQual, revTitle, revSeq, revQual, outFwd, outRev, stats) + + try: + (fwdTitle, fwdSeq, fwdQual) = fwdIt.next() + if revFilenames: + (revTitle, revSeq, revQual) = revIt.next() + except StopIteration: + break + + recCount += 1 + + statsList = [ stats["filename"], stats["nrBothRE"], stats["nrFwdRE"], + stats["nrRevRE"], stats["nrNoRE"], stats["nrFwdREisC"], + stats["nrFwdREisT"] ] + statsList.extend(stats["bdFwd4thBase"]) + statsList.extend(stats["bdRev4thBase"]) + statsList.append(stats["nrCGGCGG"]) + statsList.append(stats["nrTGGTGG"]) + statsList.append(stats["nrCGGTGG"]) + statsList.append(stats["nrTGGCGG"]) + statsList.append(stats["nrD0"]) + statsList.extend(stats["bdD1base1"]) + statsList.extend(stats["bdD2base1"]) + statsList.extend(stats["bdD2base2"]) + statsList.extend(stats["bdD3base1"]) + statsList.extend(stats["bdD3base2"]) + statsList.extend(stats["bdD3base3"]) + statsList.append(stats["nrRevD0"]) + statsList.extend(stats["bdRevD1base1"]) + statsList.extend(stats["bdRevD2base1"]) + statsList.extend(stats["bdRevD2base2"]) + statsList.extend(stats["bdRevD3base1"]) + statsList.extend(stats["bdRevD3base2"]) + statsList.extend(stats["bdRevD3base3"]) + if SHOW_STATS: + statsOut.write("\t".join([str(x) for x in statsList])) + statsOut.write("\n") + + print "\tDone with this pair. there were %s records" % (recCount) + d1 = sum(stats["bdD1base1"]) + d2 = sum(stats["bdD2base1"]) + d3 = sum(stats["bdD3base1"]) + if revFilenames: + other = stats["nrNoRE"] + stats["nrRevRE"] + stats["nrFwdRE"] + else: + other = stats["nrNoRE"] + print "\tFwd: D0:%s D1:%s D2:%s D3:%s other=%s (total=%s)" % (stats["nrD0"], + d1, d2, d3, other, stats["nrD0"]+d1+d2+d3+other) diff --git a/conf/modules.config b/conf/modules.config index 5c55e8521..470aec269 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -56,7 +56,7 @@ process { '--fastqc', // Special flags - params.rrbs ? '--rrbs' : '', + params.rrbs ? (params.ovation ? '' : '--rrbs') : '', params.nextseq_trim > 0 ? "--nextseq ${params.nextseq_trim}" : '', // Trimming - R1 @@ -138,6 +138,22 @@ process { ] } + withName: TRIMDIVERSITY { + publishDir = [ + [ + path: { "${params.outdir}/trimdiversity" }, + mode: params.publish_dir_mode, + pattern: "*_trimmed.fastq.gz" + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/trimdiversity/logs" }, + mode: params.publish_dir_mode, + pattern: "*_trimmed.log" + ] + ] + } + withName: BISMARK_GENOMEPREPARATION { ext.args = [ params.aligner == 'bismark_hisat' ? ' --hisat2' : ' --bowtie2', diff --git a/docs/output.md b/docs/output.md index 4a60d259e..89515ebf4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -14,6 +14,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [FastQC](#fastqc) - Raw read QC - [TrimGalore](#trimgalore) - Adapter trimming +- [NuMetRRBS](#numetrrbs) - Diversity trimming - [Alignment](#alignment) - Aligning reads to reference genome - [Deduplication](#deduplication) - Deduplicating reads - [Methylation Extraction](#methylation-extraction) - Calling cytosine methylation steps @@ -68,6 +69,22 @@ Contains FastQ files with quality and adapter trimmed reads for each sample, alo Single-end data will have slightly different file names and only one FastQ file per sample. +### NuMetRRBS + +The Ovation RRBS Methyl-Seq System produces libraries that are compatible with Illumina sequencing platforms. To prepare these libraries for alignment, it's necessary to first process the data by sample index, then perform trimming to eliminate adaptor sequences, low-quality reads, and diversity bases. Before proceeding to alignment, it's crucial to remove the extra sequences introduced by diversity adaptors. This specific trimming step is carried out using a custom Python script named trimRRBSdiversityAdaptCustomers.py, which NuGEN provides in [this](https://github.com/nugentechnologies/NuMetRRBS) repository. + +**Output directory: `results/trimdiversity`** + +Contains FastQ files with diversity sequence trimmed reads for each sample, along with a log file describing the trimming. + +- `sample_1_trimmed.fastq.gz`, `sample_2_trimmed.fastq.gz` + - Trimmed FastQ data, reads 1 and 2. + - **NB:** Only saved if `--save_trimmed` has been specified. +- `logs/sample_trimmed.log` + - Trimming report (describes which parameters that were used) + +Single-end data will have slightly different file names and only one FastQ file per sample. + ### Alignment Bismark and bwa-meth convert all Cytosines contained within the sequenced reads to Thymine _in-silico_ and then align against a three-letter reference genome. This method avoids methylation-specific alignment bias. The alignment produces a BAM file of genomic alignments. diff --git a/modules/local/trimdiversity/main.nf b/modules/local/trimdiversity/main.nf new file mode 100644 index 000000000..f62604c22 --- /dev/null +++ b/modules/local/trimdiversity/main.nf @@ -0,0 +1,67 @@ +process TRIMDIVERSITY { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::python=2.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:2.7' : + 'biocontainers/python:2.7' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_trimmed.fastq.gz"), emit: reads + tuple val(meta), path("*_trimmed.log") , emit: log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def args_list = args.split("\\s(?=--)").toList() + args_list.removeAll { it.toLowerCase().contains('_r2 ') } + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + + trimRRBSdiversityAdaptCustomers.py -1 ${prefix}.fastq.gz + + cp .command.log ${prefix}_trimmed.log + + if [ -f ${prefix}.fastq_trimmed.fq.gz ]; then + mv ${prefix}.fastq_trimmed.fq.gz ${prefix}_trimmed.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + + trimRRBSdiversityAdaptCustomers.py -1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz + + cp .command.log ${prefix}_trimmed.log + + if [ -f ${prefix}_1.fastq_trimmed.fq.gz ]; then + mv ${prefix}_1.fastq_trimmed.fq.gz ${prefix}_1_trimmed.fastq.gz + fi + if [ -f ${prefix}_2.fastq_trimmed.fq.gz ]; then + mv ${prefix}_2.fastq_trimmed.fq.gz ${prefix}_2_trimmed.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + } +} diff --git a/modules/local/trimdiversity/meta.yml b/modules/local/trimdiversity/meta.yml new file mode 100644 index 000000000..45f69d560 --- /dev/null +++ b/modules/local/trimdiversity/meta.yml @@ -0,0 +1,48 @@ +name: trimdiversity +description: Diversity trimming and filtering with NuGEN's diversity trimming scripts. +keywords: + - trimming + - adapters + - sequencing adapters + - fastq +tools: + - trimgalore: + description: | + Following adaptor and quality trimming and prior to alignment, the additional sequence added by the diversity adaptors must be removed from the data. This trimming is performed by a custom python script trimRRBSdiversityAdaptCustomers.py provided by NuGEN in this repository. The script removes any reads that do not contain an MspI site signature YGG at the 5’ end. For paired end data an MspI site signature is required at the 5’ end of both sequences. The script accepts as input one or two fastq file strings, given either as complete filenames or as a pattern in quotes. When a pattern is given, the script will find all the filenames matching a specified pattern according to the rules used by the Unix shell (*,?). You may access the help option of this script for more details -h. + homepage: https://github.com/nugentechnologies/NuMetRRBS + documentation: https://github.com/nugentechnologies/NuMetRRBS/blob/master/README.md + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input adapter trimmed FastQ files of size 1 and 2 for + single-end and paired-end data, respectively. + pattern: "*_trimmed.fastq.gz" + - log: + type: file + description: | + Log file containing trimming statistics. + pattern: "*_trimmed.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jma1991" diff --git a/nextflow.config b/nextflow.config index 673cf2e34..9beb58752 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,6 +40,7 @@ params { rrbs = false slamseq = false em_seq = false + ovation = false single_cell = false accel = false cegx = false diff --git a/nextflow_schema.json b/nextflow_schema.json index a87bcb957..1fdf1aa95 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -195,6 +195,12 @@ "description": "Preset for EM-seq libraries.", "help_text": "Equivalent to `--clip_r1 10` `--clip_r2 10` `--three_prime_clip_r1 10` `--three_prime_clip_r2 10`.\n\nAlso sets the `--maxins` flag to `1000` for Bismark." }, + "ovation": { + "type": "boolean", + "description": "Preset for NuGen Ovation libraries.", + "fa_icon": "fas fa-cubes", + "help_text": "Use this parameter when processing NuGen Ovation RRBS data, that has been digested with MspI. This will run TrimGalore! using its default configuration, followed by forwarding the reads to the trimRRBSdiversityAdaptCustomers.py script for the removal of diversity bases." + }, "single_cell": { "type": "boolean", "fa_icon": "fas fa-cut", @@ -669,13 +675,13 @@ "$ref": "#/definitions/skip_pipeline_steps" }, { - "$ref": "#/definitions/generic_options" + "$ref": "#/definitions/institutional_config_options" }, { "$ref": "#/definitions/max_job_request_options" }, { - "$ref": "#/definitions/institutional_config_options" + "$ref": "#/definitions/generic_options" } ] } diff --git a/workflows/methylseq.nf b/workflows/methylseq.nf index b49d895d4..6230de302 100644 --- a/workflows/methylseq.nf +++ b/workflows/methylseq.nf @@ -52,6 +52,11 @@ else if ( params.aligner == 'bwameth' ){ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// +// MODULE: Installed directly from modules/local +// +include { TRIMDIVERSITY } from '../modules/local/trimdiversity/main' + // // MODULE: Installed directly from nf-core/modules // @@ -135,7 +140,7 @@ workflow METHYLSEQ { /* * MODULE: Run TrimGalore! */ - if (!params.skip_trimming) { + if (!params.skip_trimming || params.ovation) { TRIMGALORE(ch_cat_fastq) reads = TRIMGALORE.out.reads ch_versions = ch_versions.mix(TRIMGALORE.out.versions.first()) @@ -143,7 +148,14 @@ workflow METHYLSEQ { reads = ch_cat_fastq } - + /* + * MODULE: Run NuMetRRBS/trimRRBSdiversityAdaptCustomers.py + */ + if (params.ovation) { + TRIMDIVERSITY(reads) + reads = TRIMDIVERSITY.out.reads + ch_versions = ch_versions.mix(TRIMDIVERSITY.out.versions.first()) + } /* * SUBWORKFLOW: Align reads, deduplicate and extract methylation with Bismark @@ -223,6 +235,9 @@ workflow METHYLSEQ { if (!params.skip_trimming) { ch_multiqc_files = ch_multiqc_files.mix(TRIMGALORE.out.log.collect{ it[1] }) } + if (params.ovation) { + ch_multiqc_files = ch_multiqc_files.mix(TRIMDIVERSITY.out.log.collect{ it[1] }) + } ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{ it[1] }.ifEmpty([])) MULTIQC ( From 4bdd1259cf29f639c782fa61f18535b25932b5b7 Mon Sep 17 00:00:00 2001 From: James Ashmore Date: Wed, 14 Feb 2024 15:00:05 +0000 Subject: [PATCH 2/3] Fixed left-padding --- bin/trimRRBSdiversityAdaptCustomers.py | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/bin/trimRRBSdiversityAdaptCustomers.py b/bin/trimRRBSdiversityAdaptCustomers.py index 4efd0cc02..c8af90187 100755 --- a/bin/trimRRBSdiversityAdaptCustomers.py +++ b/bin/trimRRBSdiversityAdaptCustomers.py @@ -377,24 +377,24 @@ def trimOneRecord(fwdTitle, fwdSeq, fwdQual, revTitle, revSeq, revQual, outFwd, if SHOW_STATS: statsOut = open(statsFilename, "w") statsOut.write("\t".join(["Filename", "Both_RE", "Only_Fwd_RE", "Only_Rev_RE", - "No_RE", "Fwd_YG(G/A)_is_C", "Fwd_YG(G/A)_is_T", - "Fwd_4th_is_A", "Fwd_4th_is_C", "Fwd_4th_is_G", "Fwd_4th_is_T", "Fwd_4th_is_N", - "Rev_4th_is_A", "Rev_4th_is_C", "Rev_4th_is_G", "Rev_4th_is_T", "Rev_4th_is_N", - "Fwd_CGGCGG", "Fwd_TGGTGG", "Fwd_CGGTGG", "Fwd_TGGCGG", - "D0_reads", - "D1_base1_is_A", "D1_base1_is_C", "D1_base1_is_G", "D1_base1_is_T", "D1_base1_is_N", - "D2_base1_is_A", "D2_base1_is_C", "D2_base1_is_G", "D2_base1_is_T", "D2_base1_is_N", - "D2_base2_is_A", "D2_base2_is_C", "D2_base2_is_G", "D2_base2_is_T", "D2_base2_is_N", - "D3_base1_is_A", "D3_base1_is_C", "D3_base1_is_G", "D3_base1_is_T", "D3_base1_is_N", - "D3_base2_is_A", "D3_base2_is_C", "D3_base2_is_G", "D3_base2_is_T", "D3_base2_is_N", - "D3_base3_is_A", "D3_base3_is_C", "D3_base3_is_G", "D3_base3_is_T", "D3_base3_is_N", - "Rev_D0_reads", - "Rev_D1_base1_is_A", "Rev_D1_base1_is_C", "Rev_D1_base1_is_G", "Rev_D1_base1_is_T", "Rev_D1_base1_is_N", - "Rev_D2_base1_is_A", "Rev_D2_base1_is_C", "Rev_D2_base1_is_G", "Rev_D2_base1_is_T", "Rev_D2_base1_is_N", - "Rev_D2_base2_is_A", "Rev_D2_base2_is_C", "Rev_D2_base2_is_G", "Rev_D2_base2_is_T", "Rev_D2_base2_is_N", - "Rev_D3_base1_is_A", "Rev_D3_base1_is_C", "Rev_D3_base1_is_G", "Rev_D3_base1_is_T", "Rev_D3_base1_is_N", - "Rev_D3_base2_is_A", "Rev_D3_base2_is_C", "Rev_D3_base2_is_G", "Rev_D3_base2_is_T", "Rev_D3_base2_is_N", - "Rev_D3_base3_is_A", "Rev_D3_base3_is_C", "Rev_D3_base3_is_G", "Rev_D3_base3_is_T", "Rev_D3_base3_is_N"])) + "No_RE", "Fwd_YG(G/A)_is_C", "Fwd_YG(G/A)_is_T", + "Fwd_4th_is_A", "Fwd_4th_is_C", "Fwd_4th_is_G", "Fwd_4th_is_T", "Fwd_4th_is_N", + "Rev_4th_is_A", "Rev_4th_is_C", "Rev_4th_is_G", "Rev_4th_is_T", "Rev_4th_is_N", + "Fwd_CGGCGG", "Fwd_TGGTGG", "Fwd_CGGTGG", "Fwd_TGGCGG", + "D0_reads", + "D1_base1_is_A", "D1_base1_is_C", "D1_base1_is_G", "D1_base1_is_T", "D1_base1_is_N", + "D2_base1_is_A", "D2_base1_is_C", "D2_base1_is_G", "D2_base1_is_T", "D2_base1_is_N", + "D2_base2_is_A", "D2_base2_is_C", "D2_base2_is_G", "D2_base2_is_T", "D2_base2_is_N", + "D3_base1_is_A", "D3_base1_is_C", "D3_base1_is_G", "D3_base1_is_T", "D3_base1_is_N", + "D3_base2_is_A", "D3_base2_is_C", "D3_base2_is_G", "D3_base2_is_T", "D3_base2_is_N", + "D3_base3_is_A", "D3_base3_is_C", "D3_base3_is_G", "D3_base3_is_T", "D3_base3_is_N", + "Rev_D0_reads", + "Rev_D1_base1_is_A", "Rev_D1_base1_is_C", "Rev_D1_base1_is_G", "Rev_D1_base1_is_T", "Rev_D1_base1_is_N", + "Rev_D2_base1_is_A", "Rev_D2_base1_is_C", "Rev_D2_base1_is_G", "Rev_D2_base1_is_T", "Rev_D2_base1_is_N", + "Rev_D2_base2_is_A", "Rev_D2_base2_is_C", "Rev_D2_base2_is_G", "Rev_D2_base2_is_T", "Rev_D2_base2_is_N", + "Rev_D3_base1_is_A", "Rev_D3_base1_is_C", "Rev_D3_base1_is_G", "Rev_D3_base1_is_T", "Rev_D3_base1_is_N", + "Rev_D3_base2_is_A", "Rev_D3_base2_is_C", "Rev_D3_base2_is_G", "Rev_D3_base2_is_T", "Rev_D3_base2_is_N", + "Rev_D3_base3_is_A", "Rev_D3_base3_is_C", "Rev_D3_base3_is_G", "Rev_D3_base3_is_T", "Rev_D3_base3_is_N"])) statsOut.write("\n") for i in range(len(fwdFiles)): From 0b3f6bdfc21f0fd0e861ec3aae2e4057482eeb55 Mon Sep 17 00:00:00 2001 From: James Ashmore Date: Wed, 14 Feb 2024 15:03:11 +0000 Subject: [PATCH 3/3] Fixed left-padding --- bin/trimRRBSdiversityAdaptCustomers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/trimRRBSdiversityAdaptCustomers.py b/bin/trimRRBSdiversityAdaptCustomers.py index c8af90187..f655cfdac 100755 --- a/bin/trimRRBSdiversityAdaptCustomers.py +++ b/bin/trimRRBSdiversityAdaptCustomers.py @@ -447,8 +447,8 @@ def trimOneRecord(fwdTitle, fwdSeq, fwdQual, revTitle, revSeq, revQual, outFwd, recCount += 1 statsList = [ stats["filename"], stats["nrBothRE"], stats["nrFwdRE"], - stats["nrRevRE"], stats["nrNoRE"], stats["nrFwdREisC"], - stats["nrFwdREisT"] ] + stats["nrRevRE"], stats["nrNoRE"], stats["nrFwdREisC"], + stats["nrFwdREisT"] ] statsList.extend(stats["bdFwd4thBase"]) statsList.extend(stats["bdRev4thBase"]) statsList.append(stats["nrCGGCGG"])