diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 0000000..44f150a --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,24 @@ +name: Draft PDF +on: + push: + paths: + - paper/** + - .github/workflows/draft-pdf.yml + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + paper-path: paper/paper.md # Path to the paper within your repo + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + path: paper/paper.pdf # Output path where Pandoc writes the compiled PDF \ No newline at end of file diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..652ea32 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,432 @@ + +@article{delahaye_sequencing_2021, + title = {Sequencing {DNA} with nanopores: {Troubles} and biases}, + volume = {16}, + issn = {1932-6203}, + shorttitle = {Sequencing {DNA} with nanopores}, + url = {https://dx.plos.org/10.1371/journal.pone.0257521}, + doi = {10.1371/journal.pone.0257521}, + abstract = {Oxford Nanopore Technologies’ (ONT) long read sequencers offer access to longer DNA fragments than previous sequencer generations, at the cost of a higher error rate. While many papers have studied read correction methods, few have addressed the detailed characterization of observed errors, a task complicated by frequent changes in chemistry and software in ONT technology. The MinION sequencer is now more stable and this paper proposes an up-to-date view of its error landscape, using the most mature flowcell and basecaller. We studied Nanopore sequencing error biases on both bacterial and human DNA reads. We found that, although Nanopore sequencing is expected not to suffer from GC bias, it is a crucial parameter with respect to errors. 
In particular, low-GC reads have fewer errors than high-GC reads (about 6\% and 8\% respectively). The error profile for homopolymeric regions or regions with short repeats, the source of about half of all sequencing errors, also depends on the GC rate and mainly shows deletions, although there are some reads with long insertions. Another interesting finding is that the quality measure, although over-estimated, offers valuable information to predict the error rate as well as the abundance of reads. We supplemented this study with an analysis of a rapeseed RNA read set and shown a higher level of errors with a higher level of deletion in these data. Finally, we have implemented an open source pipeline for long-term monitoring of the error profile, which enables users to easily compute various analysis presented in this work, including for future developments of the sequencing device. Overall, we hope this work will provide a basis for the design of better error-correction methods.}, + language = {en}, + number = {10}, + urldate = {2025-04-08}, + journal = {PLOS ONE}, + author = {Delahaye, Clara and Nicolas, Jacques}, + editor = {Andrés-León, Eduardo}, + month = oct, + year = {2021}, + pages = {e0257521}, + annote = {An up-to-date view of Nanopore sequencing error biases on both bacterial and human DNA reads is proposed and an open source pipeline for long-term monitoring of the error profile is implemented, which enables users to easily compute various analysis presented in this work. 
+ +}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/QBXQJMVW/Delahaye and Nicolas - 2021 - Sequencing DNA with nanopores Troubles and biases.pdf:application/pdf}, +} + +@article{dong_long_2021, + title = {The long and the short of it: unlocking nanopore long-read {RNA} sequencing data with short-read differential expression analysis tools}, + volume = {3}, + copyright = {http://creativecommons.org/licenses/by/4.0/}, + issn = {2631-9268}, + shorttitle = {The long and the short of it}, + url = {https://academic.oup.com/nargab/article/doi/10.1093/nargab/lqab028/6253498}, + doi = {10.1093/nargab/lqab028}, + abstract = {Abstract + Application of Oxford Nanopore Technologies’ long-read sequencing platform to transcriptomic analysis is increasing in popularity. However, such analysis can be challenging due to the high sequence error and small library sizes, which decreases quantification accuracy and reduces power for statistical testing. Here, we report the analysis of two nanopore RNA-seq datasets with the goal of obtaining gene- and isoform-level differential expression information. A dataset of synthetic, spliced, spike-in RNAs (‘sequins’) as well as a mouse neural stem cell dataset from samples with a null mutation of the epigenetic regulator Smchd1 was analysed using a mix of long-read specific tools for preprocessing together with established short-read RNA-seq methods for downstream analysis. We used limma-voom to perform differential gene expression analysis, and the novel FLAMES pipeline to perform isoform identification and quantification, followed by DRIMSeq and limma-diffSplice (with stageR) to perform differential transcript usage analysis. We compared results from the sequins dataset to the ground truth, and results of the mouse dataset to a previous short-read study on equivalent samples. 
Overall, our work shows that transcriptomic analysis of long-read nanopore data using long-read specific preprocessing methods together with short-read differential expression methods and software that are already in wide use can yield meaningful results.}, + language = {en}, + number = {2}, + urldate = {2025-04-08}, + journal = {NAR Genomics and Bioinformatics}, + author = {Dong, Xueyi and Tian, Luyi and Gouil, Quentin and Kariyawasam, Hasaru and Su, Shian and De Paoli-Iseppi, Ricardo and Prawer, Yair David Joseph and Clark, Michael B and Breslin, Kelsey and Iminitoff, Megan and Blewitt, Marnie E and Law, Charity W and Ritchie, Matthew E}, + month = apr, + year = {2021}, + pages = {lqab028}, + annote = {This work shows that transcriptomic analysis of long-read nanopore data using long- read specific preprocessing methods together with short-read differential expression methods and software that are already in wide use can yield meaningful results. + +}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/76BCB677/Dong et al. - 2021 - The long and the short of it unlocking nanopore l.pdf:application/pdf}, +} + +@article{tang_full-length_2020-1, + title = {Full-length transcript characterization of {SF3B1} mutation in chronic lymphocytic leukemia reveals downregulation of retained introns}, + volume = {11}, + copyright = {2020 The Author(s)}, + issn = {2041-1723}, + url = {https://www.nature.com/articles/s41467-020-15171-6}, + doi = {10.1038/s41467-020-15171-6}, + abstract = {While splicing changes caused by somatic mutations in SF3B1 are known, identifying full-length isoform changes may better elucidate the functional consequences of these mutations. We report nanopore sequencing of full-length cDNA from CLL samples with and without SF3B1 mutation, as well as normal B cell samples, giving a total of 149 million pass reads. 
We present FLAIR (Full-Length Alternative Isoform analysis of RNA), a computational workflow to identify high-confidence transcripts, perform differential splicing event analysis, and differential isoform analysis. Using nanopore reads, we demonstrate differential 3’ splice site changes associated with SF3B1 mutation, agreeing with previous studies. We also observe a strong downregulation of intron retention events associated with SF3B1 mutation. Full-length transcript analysis links multiple alternative splicing events together and allows for better estimates of the abundance of productive versus unproductive isoforms. Our work demonstrates the potential utility of nanopore sequencing for cancer and splicing research.}, + language = {en}, + number = {1}, + urldate = {2024-11-10}, + journal = {Nature Communications}, + author = {Tang, Alison D. and Soulette, Cameron M. and van Baren, Marijke J. and Hart, Kevyn and Hrabeta-Robinson, Eva and Wu, Catherine J. and Brooks, Angela N.}, + month = mar, + year = {2020}, + note = {Number: 1 +Publisher: Nature Publishing Group}, + keywords = {Genome informatics, Cancer genomics, High-throughput screening, RNA splicing}, + pages = {1438}, +} + +@article{altschul_basic_1990, + title = {Basic local alignment search tool}, + volume = {215}, + copyright = {https://www.elsevier.com/tdm/userlicense/1.0/}, + issn = {00222836}, + url = {https://linkinghub.elsevier.com/retrieve/pii/S0022283605803602}, + doi = {10.1016/S0022-2836(05)80360-2}, + language = {en}, + number = {3}, + urldate = {2025-04-08}, + journal = {Journal of Molecular Biology}, + author = {Altschul, Stephen F. and Gish, Warren and Miller, Webb and Myers, Eugene W. 
and Lipman, David J.}, + month = oct, + year = {1990}, + pages = {403--410}, +} + +@article{camacho_blast_2009, + title = {{BLAST}+: architecture and applications}, + volume = {10}, + issn = {1471-2105}, + shorttitle = {{BLAST}+}, + url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-10-421}, + doi = {10.1186/1471-2105-10-421}, + abstract = {Abstract + + Background + Sequence similarity searching is a very important bioinformatics task. While Basic Local Alignment Search Tool (BLAST) outperforms exact methods through its use of heuristics, the speed of the current BLAST software is suboptimal for very long queries or database sequences. There are also some shortcomings in the user-interface of the current command-line applications. + + + Results + We describe features and improvements of rewritten BLAST software and introduce new command-line applications. Long query sequences are broken into chunks for processing, in some cases leading to dramatically shorter run times. For long database sequences, it is possible to retrieve only the relevant parts of the sequence, reducing CPU time and memory usage for searches of short queries against databases of contigs or chromosomes. The program can now retrieve masking information for database sequences from the BLAST databases. A new modular software library can now access subject sequence data from arbitrary data sources. We introduce several new features, including strategy files that allow a user to save and reuse their favorite set of options. The strategy files can be uploaded to and downloaded from the NCBI BLAST web site. + + + Conclusion + The new BLAST command-line applications, compared to the current BLAST tools, demonstrate substantial speed improvements for long queries as well as chromosome length database sequences. 
We have also improved the user interface of the command-line applications.}, + language = {en}, + number = {1}, + urldate = {2025-04-08}, + journal = {BMC Bioinformatics}, + author = {Camacho, Christiam and Coulouris, George and Avagyan, Vahram and Ma, Ning and Papadopoulos, Jason and Bealer, Kevin and Madden, Thomas L}, + month = dec, + year = {2009}, + pages = {421}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/HUHRNLUM/Camacho et al. - 2009 - BLAST+ architecture and applications.pdf:application/pdf}, +} + +@ARTICLE{Hauswedell2024-ph, + title = "Lambda3: homology search for protein, nucleotide, and + bisulfite-converted sequences", + author = "Hauswedell, Hannes and Hetzel, Sara and Gottlieb, Simon G and + Kretzmer, Helene and Meissner, Alexander and Reinert, Knut", + abstract = "MOTIVATION: Local alignments of query sequences in large + databases represent a core part of metagenomic studies and + facilitate homology search. Following the development of NCBI + Blast, many applications aimed to provide faster and equally + sensitive local alignment frameworks. Most applications focus on + protein alignments, while only few also facilitate DNA-based + searches. None of the established programs allow searching DNA + sequences from bisulfite sequencing experiments commonly used for + DNA methylation profiling, for which specific alignment + strategies need to be implemented. RESULTS: Here, we introduce + Lambda3, a new version of the local alignment application Lambda. + Lambda3 is the first solution that enables the search of protein, + nucleotide as well as bisulfite-converted nucleotide query + sequences. Its protein mode achieves comparable performance to + that of the highly optimized protein alignment application + Diamond, while the nucleotide mode consistently outperforms + established local nucleotide aligners. 
Combined, Lambda3 presents + a universal local alignment framework that enables fast and + sensitive homology searches for a wide range of use-cases. + AVAILABILITY AND IMPLEMENTATION: Lambda3 is free and open-source + software publicly available at https://github.com/seqan/lambda/.", + journal = "Bioinformatics", + volume = 40, + number = 3, + month = mar, + year = 2024, + language = "en" +} + +@article{molder_sustainable_2021, + title = {Sustainable data analysis with {Snakemake}}, + volume = {10}, + issn = {2046-1402}, + url = {https://f1000research.com/articles/10-33/v2}, + doi = {10.12688/f1000research.29032.2}, + abstract = {Data analysis often entails a multitude of heterogeneous steps, from the application of various command line tools to the usage of scripting languages like R or Python for the generation of plots and tables. It is widely recognized that data analyses should ideally be conducted in a reproducible way. Reproducibility enables technical validation and regeneration of results on the original or even new data. However, reproducibility alone is by no means sufficient to deliver an analysis that is of lasting impact (i.e., sustainable) for the field, or even just one research group. We postulate that it is equally important to ensure adaptability and transparency. The former describes the ability to modify the analysis to answer extended or slightly different research questions. The latter describes the ability to understand the analysis in order to judge whether it is not only technically, but methodologically valid. Here, we analyze the properties needed for a data analysis to become reproducible, adaptable, and transparent. 
We show how the popular workflow management system Snakemake can be used to guarantee this, and how it enables an ergonomic, combined, unified representation of all steps involved in data analysis, ranging from raw data processing, to quality control and fine-grained, interactive exploration and plotting of final results.}, + language = {en}, + urldate = {2024-05-07}, + journal = {F1000Research}, + author = {Mölder, Felix and Jablonski, Kim Philipp and Letcher, Brice and Hall, Michael B. and Tomkins-Tinch, Christopher H. and Sochat, Vanessa and Forster, Jan and Lee, Soohyun and Twardziok, Sven O. and Kanitz, Alexander and Wilm, Andreas and Holtgrewe, Manuel and Rahmann, Sven and Nahnsen, Sven and Köster, Johannes}, + month = apr, + year = {2021}, + pages = {33}, +} + +@article{oleary_exploring_2024, + title = {Exploring and retrieving sequence and metadata for species across the tree of life with {NCBI} {Datasets}}, + volume = {11}, + issn = {2052-4463}, + url = {https://www.nature.com/articles/s41597-024-03571-y}, + doi = {10.1038/s41597-024-03571-y}, + abstract = {Abstract + To explore complex biological questions, it is often necessary to access various data types from public data repositories. As the volume and complexity of biological sequence data grow, public repositories face significant challenges in ensuring that the data is easily discoverable and usable by the biological research community. To address these challenges, the National Center for Biotechnology Information (NCBI) has created NCBI Datasets. This resource provides straightforward, comprehensive, and scalable access to biological sequences, annotations, and metadata for a wide range of taxa. Following the FAIR (Findable, Accessible, Interoperable, and Reusable) data management principles, NCBI Datasets offers user-friendly web interfaces, command-line tools, and documented APIs, empowering researchers to access NCBI data seamlessly. 
The data is delivered as packages of sequences and metadata, thus facilitating improved data retrieval, sharing, and usability in research. Moreover, this data delivery method fosters effective data attribution and promotes its further reuse. This paper outlines the current scope of data accessible through NCBI Datasets and explains various options for exploring and downloading the data.}, + language = {en}, + number = {1}, + urldate = {2025-04-08}, + journal = {Scientific Data}, + author = {O’Leary, Nuala A. and Cox, Eric and Holmes, J. Bradley and Anderson, W. Ray and Falk, Robert and Hem, Vichet and Tsuchiya, Mirian T. N. and Schuler, Gregory D. and Zhang, Xuan and Torcivia, John and Ketter, Anne and Breen, Laurie and Cothran, Jonathan and Bajwa, Hena and Tinne, Jovany and Meric, Peter A. and Hlavina, Wratko and Schneider, Valerie A.}, + month = jul, + year = {2024}, + pages = {732}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/4FFX2589/O’Leary et al. - 2024 - Exploring and retrieving sequence and metadata for.pdf:application/pdf}, +} + +@article{cock_biopython_2009, + title = {Biopython: freely available {Python} tools for computational molecular biology and bioinformatics}, + volume = {25}, + issn = {1367-4803}, + shorttitle = {Biopython}, + url = {https://doi.org/10.1093/bioinformatics/btp163}, + doi = {10.1093/bioinformatics/btp163}, + abstract = {Summary: The Biopython project is a mature open source international collaboration of volunteer developers, providing Python libraries for a wide range of bioinformatics problems. 
Biopython includes modules for reading and writing different sequence file formats and multiple sequence alignments, dealing with 3D macromolecular structures, interacting with common tools such as BLAST, ClustalW and EMBOSS, accessing key online databases, as well as providing numerical methods for statistical learning. Availability: Biopython is freely available, with documentation and source code at www.biopython.org under the Biopython license. Contact: All queries should be directed to the Biopython mailing lists, see www.biopython.org/wiki/\_Mailing\_lists; peter.cock@scri.ac.uk.}, + number = {11}, + urldate = {2024-11-12}, + journal = {Bioinformatics}, + author = {Cock, Peter J. A. and Antao, Tiago and Chang, Jeffrey T. and Chapman, Brad A. and Cox, Cymon J. and Dalke, Andrew and Friedberg, Iddo and Hamelryck, Thomas and Kauff, Frank and Wilczynski, Bartek and de Hoon, Michiel J. L.}, + month = jun, + year = {2009}, + note = {Number: 11}, + pages = {1422--1423}, +} + +@article{de_coster_nanopack_2018, + title = {{NanoPack}: visualizing and processing long-read sequencing data}, + volume = {34}, + copyright = {http://creativecommons.org/licenses/by/4.0/}, + issn = {1367-4803, 1367-4811}, + shorttitle = {{NanoPack}}, + url = {https://academic.oup.com/bioinformatics/article/34/15/2666/4934939}, + doi = {10.1093/bioinformatics/bty149}, + abstract = {Abstract + + Summary + Here we describe NanoPack, a set of tools developed for visualization and processing of long-read sequencing data from Oxford Nanopore Technologies and Pacific Biosciences. + + + Availability and implementation + The NanoPack tools are written in Python3 and released under the GNU GPL3.0 License. The source code can be found at https://github.com/wdecoster/nanopack, together with links to separate scripts and their documentation. 
The scripts are compatible with Linux, Mac OS and the MS Windows 10 subsystem for Linux and are available as a graphical user interface, a web service at http://nanoplot.bioinf.be and command line tools. + + + Supplementary information + Supplementary data are available at Bioinformatics online.}, + language = {en}, + number = {15}, + urldate = {2025-04-08}, + journal = {Bioinformatics}, + author = {De Coster, Wouter and D’Hert, Svenn and Schultz, Darrin T and Cruts, Marc and Van Broeckhoven, Christine}, + editor = {Berger, Bonnie}, + month = aug, + year = {2018}, + pages = {2666--2669}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/ZWVHB4RC/De Coster et al. - 2018 - NanoPack visualizing and processing long-read seq.pdf:application/pdf}, +} + +@article{li_minimap2_2018, + title = {Minimap2: pairwise alignment for nucleotide sequences}, + volume = {34}, + copyright = {https://academic.oup.com/journals/pages/open\_access/funder\_policies/chorus/standard\_publication\_model}, + issn = {1367-4803, 1367-4811}, + shorttitle = {Minimap2}, + url = {https://academic.oup.com/bioinformatics/article/34/18/3094/4994778}, + doi = {10.1093/bioinformatics/bty191}, + abstract = {Abstract + + Motivation + Recent advances in sequencing technologies promise ultra-long reads of ∼100 kb in average, full-length mRNA or cDNA reads in high throughput and genomic contigs over 100 Mb in length. Existing alignment programs are unable or inefficient to process such data at scale, which presses for the development of new alignment algorithms. + + + Results + Minimap2 is a general-purpose alignment program to map DNA or long mRNA sequences against a large reference database. It works with accurate short reads of ≥100 bp in length, ≥1 kb genomic reads at error rate ∼15\%, full-length noisy Direct RNA or cDNA reads and assembly contigs or closely related full chromosomes of hundreds of megabases in length. 
Minimap2 does split-read alignment, employs concave gap cost for long insertions and deletions and introduces new heuristics to reduce spurious alignments. It is 3–4 times as fast as mainstream short-read mappers at comparable accuracy, and is ≥30 times faster than long-read genomic or cDNA mappers at higher accuracy, surpassing most aligners specialized in one type of alignment. + + + Availability and implementation + https://github.com/lh3/minimap2 + + + Supplementary information + Supplementary data are available at Bioinformatics online.}, + language = {en}, + number = {18}, + urldate = {2025-04-08}, + journal = {Bioinformatics}, + author = {Li, Heng}, + editor = {Birol, Inanc}, + month = sep, + year = {2018}, + pages = {3094--3100}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/VAAMZVYK/Li - 2018 - Minimap2 pairwise alignment for nucleotide sequen.pdf:application/pdf}, +} + +@article{zhu_heavy-tailed_2019, + title = {Heavy-tailed prior distributions for sequence count data: removing the noise and preserving large differences}, + volume = {35}, + copyright = {http://creativecommons.org/licenses/by-nc/4.0/}, + issn = {1367-4803, 1367-4811}, + shorttitle = {Heavy-tailed prior distributions for sequence count data}, + url = {https://academic.oup.com/bioinformatics/article/35/12/2084/5159452}, + doi = {10.1093/bioinformatics/bty895}, + abstract = {Abstract + + Motivation + In RNA-seq differential expression analysis, investigators aim to detect those genes with changes in expression level across conditions, despite technical and biological variability in the observations. A common task is to accurately estimate the effect size, often in terms of a logarithmic fold change (LFC). + + + Results + When the read counts are low or highly variable, the maximum likelihood estimates for the LFCs has high variance, leading to large estimates not representative of true differences, and poor ranking of genes by effect size. 
One approach is to introduce filtering thresholds and pseudocounts to exclude or moderate estimated LFCs. Filtering may result in a loss of genes from the analysis with true differences in expression, while pseudocounts provide a limited solution that must be adapted per dataset. Here, we propose the use of a heavy-tailed Cauchy prior distribution for effect sizes, which avoids the use of filter thresholds or pseudocounts. The proposed method, Approximate Posterior Estimation for generalized linear model, apeglm, has lower bias than previously proposed shrinkage estimators, while still reducing variance for those genes with little information for statistical inference. + + + Availability and implementation + The apeglm package is available as an R/Bioconductor package at https://bioconductor.org/packages/apeglm, and the methods can be called from within the DESeq2 software. + + + Supplementary information + Supplementary data are available at Bioinformatics online.}, + language = {en}, + number = {12}, + urldate = {2025-04-08}, + journal = {Bioinformatics}, + author = {Zhu, Anqi and Ibrahim, Joseph G and Love, Michael I}, + editor = {Stegle, Oliver}, + month = jun, + year = {2019}, + pages = {2084--2092}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/Y3R2C7BG/Zhu et al. - 2019 - Heavy-tailed prior distributions for sequence coun.pdf:application/pdf}, +} + +@article{love_moderated_2014, + title = {Moderated estimation of fold change and dispersion for {RNA}-seq data with {DESeq2}}, + volume = {15}, + issn = {1474-760X}, + url = {https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8}, + doi = {10.1186/s13059-014-0550-8}, + abstract = {Abstract + + In comparative high-throughput sequencing assays, a fundamental task is the analysis of count data, such as read counts per gene in RNA-seq, for evidence of systematic changes across experimental conditions. 
Small replicate numbers, discreteness, large dynamic range and the presence of outliers require a suitable statistical approach. We present + DESeq2 + , a method for differential analysis of count data, using shrinkage estimation for dispersions and fold changes to improve stability and interpretability of estimates. This enables a more quantitative analysis focused on the strength rather than the mere presence of differential expression. The + DESeq2 + package is available at + http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html + .}, + language = {en}, + number = {12}, + urldate = {2025-04-08}, + journal = {Genome Biology}, + author = {Love, Michael I and Huber, Wolfgang and Anders, Simon}, + month = dec, + year = {2014}, + pages = {550}, + file = {Full Text:/uni-mainz.de/homes/meesters/Zotero/storage/KM87QE3P/Love et al. - 2014 - Moderated estimation of fold change and dispersion.pdf:application/pdf}, +} + +@ARTICLE{Chen2025-ev, + title = "A systematic benchmark of Nanopore long-read {RNA} sequencing + for transcript-level analysis in human cell lines", + author = "Chen, Ying and Davidson, Nadia M and Wan, Yuk Kei and Yao, Fei + and Su, Yan and Gamaarachchi, Hasindu and Sim, Andre and Patel, + Harshil and Low, Hwee Meng and Hendra, Christopher and Wratten, + Laura and Hakkaart, Christopher and Sawyer, Chelsea and + Iakovleva, Viktoriia and Lee, Puay Leng and Xin, Lixia and Ng, + Hui En Vanessa and Loo, Jia Min and Ong, Xuewen and Ng, Hui Qi + Amanda and Wang, Jiaxu and Koh, Wei Qian Casslynn and Poon, Suk + Yeah Polly and Stanojevic, Dominik and Tran, Hoang-Dai and Lim, + Kok Hao Edwin and Toh, Shen Yon and Ewels, Philip Andrew and Ng, + Huck-Hui and Iyer, N Gopalakrishna and Thiery, Alexandre and + Chng, Wee Joo and Chen, Leilei and DasGupta, Ramanuj and Sikic, + Mile and Chan, Yun-Shen and Tan, Boon Ooi Patrick and Wan, Yue + and Tam, Wai Leong and Yu, Qiang and Khor, Chiea Chuan and + W{\"u}stefeld, Torsten and Lezhava, Alexander and 
Pratanwanich, + Ploy N and Love, Michael I and Goh, Wee Siong Sho and Ng, Sarah + B and Oshlack, Alicia and {SG-NEx consortium} and G{\"o}ke, + Jonathan", + abstract = "The human genome contains instructions to transcribe more than + 200,000 RNAs. However, many RNA transcripts are generated from + the same gene, resulting in alternative isoforms that are highly + similar and that remain difficult to quantify. To evaluate the + ability to study RNA transcript expression, we profiled seven + human cell lines with five different RNA-sequencing protocols, + including short-read cDNA, Nanopore long-read direct RNA, + amplification-free direct cDNA and PCR-amplified cDNA + sequencing, and PacBio IsoSeq, with multiple spike-in controls, + and additional transcriptome-wide N6-methyladenosine profiling + data. We describe differences in read length, coverage, + throughput and transcript expression, reporting that long-read + RNA sequencing more robustly identifies major isoforms. We + illustrate the value of the SG-NEx data to identify alternative + isoforms, novel transcripts, fusion transcripts and + N6-methyladenosine RNA modifications. Together, the SG-NEx data + provide a comprehensive resource enabling the development and + benchmarking of computational methods for profiling complex + transcriptional events at isoform-level resolution.", + journal = "Nat. 
Methods", + publisher = "Springer Science and Business Media LLC", + volume = 22, + number = 4, + pages = "801--812", + month = apr, + year = 2025, + copyright = "https://creativecommons.org/licenses/by/4.0", + language = "en" +} + +@ARTICLE{Chen2025-yk, + title = "A systematic benchmark of Nanopore long-read {RNA} sequencing + for transcript-level analysis in human cell lines", + author = "Chen, Ying and Davidson, Nadia M and Wan, Yuk Kei and Yao, Fei + and Su, Yan and Gamaarachchi, Hasindu and Sim, Andre and Patel, + Harshil and Low, Hwee Meng and Hendra, Christopher and Wratten, + Laura and Hakkaart, Christopher and Sawyer, Chelsea and + Iakovleva, Viktoriia and Lee, Puay Leng and Xin, Lixia and Ng, + Hui En Vanessa and Loo, Jia Min and Ong, Xuewen and Ng, Hui Qi + Amanda and Wang, Jiaxu and Koh, Wei Qian Casslynn and Poon, Suk + Yeah Polly and Stanojevic, Dominik and Tran, Hoang-Dai and Lim, + Kok Hao Edwin and Toh, Shen Yon and Ewels, Philip Andrew and Ng, + Huck-Hui and Iyer, N Gopalakrishna and Thiery, Alexandre and + Chng, Wee Joo and Chen, Leilei and DasGupta, Ramanuj and Sikic, + Mile and Chan, Yun-Shen and Tan, Boon Ooi Patrick and Wan, Yue + and Tam, Wai Leong and Yu, Qiang and Khor, Chiea Chuan and + W{\"u}stefeld, Torsten and Lezhava, Alexander and Pratanwanich, + Ploy N and Love, Michael I and Goh, Wee Siong Sho and Ng, Sarah + B and Oshlack, Alicia and {SG-NEx consortium} and G{\"o}ke, + Jonathan", + abstract = "The human genome contains instructions to transcribe more than + 200,000 RNAs. However, many RNA transcripts are generated from + the same gene, resulting in alternative isoforms that are highly + similar and that remain difficult to quantify. 
To evaluate the + ability to study RNA transcript expression, we profiled seven + human cell lines with five different RNA-sequencing protocols, + including short-read cDNA, Nanopore long-read direct RNA, + amplification-free direct cDNA and PCR-amplified cDNA + sequencing, and PacBio IsoSeq, with multiple spike-in controls, + and additional transcriptome-wide N6-methyladenosine profiling + data. We describe differences in read length, coverage, + throughput and transcript expression, reporting that long-read + RNA sequencing more robustly identifies major isoforms. We + illustrate the value of the SG-NEx data to identify alternative + isoforms, novel transcripts, fusion transcripts and + N6-methyladenosine RNA modifications. Together, the SG-NEx data + provide a comprehensive resource enabling the development and + benchmarking of computational methods for profiling complex + transcriptional events at isoform-level resolution.", + journal = "Nat. Methods", + publisher = "Springer Science and Business Media LLC", + volume = 22, + number = 4, + pages = "801--812", + month = apr, + year = 2025, + copyright = "https://creativecommons.org/licenses/by/4.0", + language = "en" +} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..38ca5da --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,136 @@ +--- +title: 'A Snakemake workflow for differential expression analysis with alternative splicing detection using long-read data' +tags: + - Snakemake + - Nanopore + - HPC + - differential gene expression + - alternative splicing detection +authors: + - name: Yannic Eising + orcid: 0009-0003-9103-5689 + affiliation: [1, 2] + - name: Sören Lukas Hellmann + orcid: 0000-0003-4958-1419 + affiliation: 1 + - name: Christiane Krämer + orcid: 0009-0000-6220-076X + affiliation: 1 + - name: Christian Meesters + corresponding: true + orcid: 0000-0003-2408-7588 + affiliation: 2 +affiliations: + - name: Nucleic Acids Core Facility, Johannes 
Gutenberg-University Mainz, Germany + index: 1 + - name: NHR-SouthWest / High Performance Computing Group, Johannes Gutenberg-University Mainz, Germany + index: 2 + +date: 04 April 2025 # TODO: update before submission +bibliography: paper.bib + +--- + +# Summary + +Long-read RNA sequencing technologies enable the characterization of full-length transcripts and complex splicing patterns. While offering new opportunities for transcriptomic analysis, these data come with substantial computational demands, especially when scaling to multiple samples, replicates, and experimental conditions. + +We present a modular, reproducible workflow tailored for differential expression and alternative splicing analysis from long-read RNA sequencing data. +The workflow is designed for use on high-performance computing (HPC) clusters or cloud systems, enabling efficient parallel execution of computationally intensive steps such as read alignment, quantification, and isoform detection. + +It supports quality filtering, statistical analysis of gene expression across conditions, and isoform-level splicing analysis. For poorly annotated or novel genomes, it includes an optional annotation step based on local similarity searches to assign putative gene functions. +Reference data can be supplied via local files or retrieved automatically using NCBI accession numbers. + +The workflow is well suited for researchers working with large datasets and complex experimental designs who require transparent, reproducible, and HPC-compatible analysis workflows. + +# Statement of Need + +Long-read sequencing technologies, such as those developed by Oxford Nanopore Technologies (ONT), have revolutionized transcriptomic studies by enabling direct detection of full-length RNA molecules [@delahaye_sequencing_2021]. +This advancement facilitates more accurate analyses of differential gene expression [@dong_long_2021] and alternative splicing events, both of which are essential for understanding transcriptomic complexity and functional genomics. 
+However, analyzing long-read transcriptomic data remains technically challenging due to the intricacies of read preprocessing and isoform-level quantification, and the need for reproducible and scalable computational workflows. + +Several existing tools, such as FLAIR [@tang_full-length_2020-1], provide frameworks for isoform-level analysis of long-read transcriptomic data. +While these tools offer powerful features, they often rely on manual configuration, may not fully support reproducible execution across computing environments, and frequently lack integration with high-performance computing (HPC) infrastructure. + +To address these gaps, we present a Snakemake-based workflow that automates the analysis of Nanopore long-read sequencing data with a focus on differential gene expression and alternative splicing detection. +While other workflows exist that support either differential expression analysis or isoform-level analysis, our workflow integrates both in a modular and reproducible pipeline designed for scalability across local machines, HPC clusters, and cloud environments. + +A distinctive feature of our workflow is its capability to operate on poorly annotated or completely unannotated genomes. +To support these cases, the workflow includes optional local alignments using tools such as BLAST [@altschul_basic_1990;@camacho_blast_2009] or lambda [@Hauswedell2024-ph], enabling the functional annotation of transcripts by identifying putative gene functions. +This enhances interpretability in non-model organisms and supports exploratory analyses in less-characterized transcriptomes. + +By leveraging Snakemake’s robust workflow management capabilities [@molder_sustainable_2021], our workflow offers transparent provenance tracking, efficient resource handling, and reproducible execution. 
+It provides a flexible foundation for advanced long-read transcriptomic analyses and fills a critical gap in the ecosystem of accessible, reproducible, and extensible workflows for Nanopore RNA sequencing data. + +# Implementation + +## Input Data and Reference Handling + +The workflow accepts raw ONT reads in FASTQ format, along with either user-specified or automatically downloaded reference data. Reference transcriptomes and genome assemblies can be provided as file paths, or alternatively, specified using NCBI accession numbers, in which case the required data are retrieved using `ncbi-datasets` [@oleary_exploring_2024]. +This allows users to flexibly apply the workflow to well-characterized model organisms or newly sequenced, poorly annotated species. + +## Quality Filtering and Assessment + +Prior to downstream analysis, reads undergo a configurable quality control step. Users can specify a read length threshold; this length filter is implemented with the Biopython library [@cock_biopython_2009]. To ensure sufficient quality, we rely on the ONT basecaller for filtering out low-quality reads. Sample quality statistics and read length distributions are assessed using NanoPlot [@de_coster_nanopack_2018], which generates interactive and publication-ready QC plots. These are included in the workflow report and ensure high-confidence input for downstream expression and splicing analysis. + +## Transcriptome Alignment and Differential Expression Analysis + +Reads passing quality filters are aligned to the reference transcriptome with `minimap2` [@li_minimap2_2018]. Following alignment, read counts per transcript are computed and used for differential expression analysis using PyDESeq2 [@zhu_heavy-tailed_2019;@love_moderated_2014], a Python-native implementation of the DESeq2 method. + +This enables statistical analysis of gene expression changes across experimental conditions while staying within a Python-based workflow ecosystem. 
+ +## Alternative Splicing Analysis + +For isoform-level analysis, the workflow integrates the FLAIR toolkit [@tang_full-length_2020-1]. We adapted the FLAIR plotting script to improve Snakemake compatibility and enable automated per-gene isoform visualization. Isoforms are collapsed, quantified, and categorized to identify splicing patterns and events across conditions. + +## Optional Functional Annotation via Local Alignment + +When reference data are incomplete, unannotated, or of uncertain quality, the workflow offers optional functional annotation. Transcripts or isoforms can be locally aligned against curated UniRef protein databases using BLAST or lambda. This provides putative gene product functions that support biological interpretation in non-model organisms or exploratory studies. + +# Workflow at a Glance + +![The Directed Acyclic Graph (DAG) of the complete long-read RNA-Seq workflow. The graph was generated by the Snakemake command: 'snakemake --rulegraph | dot -Tsvg > rulegraph.svg'.](rulegraph.svg) + +TODO: Annotate the workflow with boxes per feature set. Requires the latest updates and cleanups to be incorporated. + +## Example Dataset + +A complete example run using six cDNA Nanopore sequencing datasets from human H9 and HCT 116 cell lines is provided below. The data originate from the SG-NEx project [@Chen2025-yk] and were accessed on 31 March 2025 from [registry.opendata.aws/sg-nex-data](https://registry.opendata.aws/sg-nex-data). This example illustrates the workflow's capabilities in isoform analysis and functional annotation. + +[]() # Add report HTML + +# Usage + +### Configuration +The workflow uses three configuration files to enable its full functionality: +1. **`samples.csv`** +Contains sample metadata, including sample IDs and experimental conditions. +2. **`config/config.yaml`** +Defines general workflow behavior, rule-specific options, and paths to input data. +3. 
**`profile/config.yaml`** +Stores cluster-specific variables, enabling resource allocation settings tailored to your computing environment. + + +### Running the workflow +The workflow can be started by executing the following Snakemake command: + +``` +snakemake -j unlimited \ # Run snakemake with no job limit +--workflow-profile <profile_directory> \ # Use a specific Snakemake profile +--configfile ../config/config.yaml \ # Specify the main config file +--directory <working_directory> \ # Set working directory +--sdm conda \ # Enable conda software deployment +--conda-prefix <conda_prefix_directory> # Custom prefix for conda environments +``` + +The trailing `#` comments are explanatory only and must be removed before executing the command, because a line continuation (`\`) has to be the last character on its line. Here, we assume that the workflow directory is the current working directory. When working from a different directory, the workflow must be specified with `--snakefile <path_to_Snakefile>`. + +The `profile_directory` contains a template configuration for the cluster "Mogon NHR" in Mainz, Germany. We encourage users to contribute their profiles to the repository. + +For detailed information about these and additional options, see the [Snakemake command-line options](https://snakemake.readthedocs.io/en/stable/executing/cli.html#all-options). + +# Acknowledgements + +Any? 
+ +# References diff --git a/paper/rulegraph.svg b/paper/rulegraph.svg new file mode 100644 index 0000000..2e8c064 --- /dev/null +++ b/paper/rulegraph.svg @@ -0,0 +1,511 @@ + + + + + + +snakemake_dag + + + +0 + +all + + + +1 + +dump_versions + + + +1->0 + + + + + +2 + +sample_qa_plot + + + +2->0 + + + + + +3 + +total_sample_qa_plot + + + +3->0 + + + + + +4 + +bam_stats + + + +4->0 + + + + + +5 + +sam_to_bam + + + +5->4 + + + + + +15 + +bam_sort + + + +5->15 + + + + + +16 + +count_reads + + + +5->16 + + + + + +6 + +map_reads + + + +6->5 + + + + + +7 + +build_minimap_index + + + +7->6 + + + + + +8 + +genome_to_transcriptome + + + +8->7 + + + + + +8->16 + + + + + +32 + +generate_gene_query + + + +8->32 + + + + + +9 + +get_genome + + + +9->8 + + + + + +22 + +flair_collapse + + + +9->22 + + + + + +23 + +flair_correct + + + +9->23 + + + + + +24 + +flair_align + + + +9->24 + + + + + +25 + +build_flair_genome_index + + + +9->25 + + + + + +10 + +standardize_gff + + + +10->8 + + + + + +26 + +gff_to_gtf + + + +10->26 + + + + + +11 + +get_annotation + + + +11->10 + + + + + +12 + +filter_reads + + + +12->6 + + + + + +12->22 + + + + + +12->24 + + + + + +13 + +alignment_qa_report + + + +13->0 + + + + + +14 + +alignment_qa + + + +14->13 + + + + + +15->14 + + + + + +16->0 + + + + + +17 + +merge_read_counts + + + +16->17 + + + + + +17->0 + + + + + +18 + +diffexp_analysis + + + +17->18 + + + + + +18->0 + + + + + +18->32 + + + + + +19 + +flair_diffexp + + + +19->0 + + + + + +28 + +flair_plot_isoforms + + + +19->28 + + + + + +20 + +flair_quantify + + + +20->19 + + + + + +20->28 + + + + + +21 + +reads_manifest + + + +21->20 + + + + + +22->20 + + + + + +22->28 + + + + + +23->22 + + + + + +24->23 + + + + + +25->24 + + + + + +26->22 + + + + + +26->23 + + + + + +27 + +iso_analysis_report + + + +27->0 + + + + + +28->27 + + + + + +29 + +get_protein_names + + + +29->0 + + + + + +30 + +lambda_gene_annotation + + + +30->29 + + + + + +31 + +get_indexed_protein_db + + + +31->30 + + + + + +32->30 + 
+ + + +