diff --git a/.travis.yml b/.travis.yml index 8be247e5..119ea07b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,71 @@ -sudo: true +sudo: true -language: python +language: python +cache: + directories: + - $HOME/virtualenv/ + - $HOME/.R + - $HOME/Library -os: - - linux -env: - - TRAVISBUILD="True" +matrix: + include: + # Use the built in venv for linux builds + - os: linux + sudo: required + python: "3.4" + dist: trusty + before_install: + - sudo apt-get install libssl1.0.0 + - sudo sh -c 'echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list' + - gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 + - gpg -a --export E084DAB9 | sudo apt-key add - + - sudo apt-get update -qq + - sudo apt-get -y --no-install-recommends install r-base build-essential gcc g++ gfortran libblas-dev liblapack-dev libncurses5-dev libreadline-dev libjpeg-dev libpcre3-dev libpng-dev zlib1g-dev libbz2-dev liblzma-dev libgit2-dev libssh2-1-dev -python: - - "3.4" - - "3.5" - - "3.6" + - os: linux + sudo: required + python: "3.5" + dist: trusty + before_install: + - sudo apt-get install libssl1.0.0 + - sudo sh -c 'echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list' + - gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 + - gpg -a --export E084DAB9 | sudo apt-key add - + - sudo apt-get update -qq + - sudo apt-get -y --no-install-recommends install r-base build-essential gcc g++ gfortran libblas-dev liblapack-dev libncurses5-dev libreadline-dev libjpeg-dev libpcre3-dev libpng-dev zlib1g-dev libbz2-dev liblzma-dev libgit2-dev libssh2-1-dev -before_install: - - sudo apt-get install bedtools + - os: linux + sudo: required + python: "3.6" + dist: trusty + before_install: + - sudo apt-get install libssl1.0.0 + - sudo sh -c 'echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list' + - gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 + - gpg -a --export E084DAB9 | sudo apt-key add - + - 
sudo apt-get update -qq + - sudo apt-get -y --no-install-recommends install r-base build-essential gcc g++ gfortran libblas-dev liblapack-dev libncurses5-dev libreadline-dev libjpeg-dev libpcre3-dev libpng-dev zlib1g-dev libbz2-dev liblzma-dev libgit2-dev libssh2-1-dev + + - os: osx + language: generic + python: "3.6" + before_install: + # below the formula for python 3.6 + #- brew upgrade https://raw.githubusercontent.com/Homebrew/homebrew-core/f2a764ef944b1080be64bd88dca9a1d80130c558/Formula/python.rb + - brew install python@2 + - brew cask uninstall oclint + - brew install R + - brew install md5sha1sum -install: - - python setup.py install +env: + - TRAVISBUILD="True", INSTALL_R_PACKAGES=True + +install: + - pip3 install . --upgrade --force -v script: - - circtools --help + - circtools --version +# - ./tests/01_test_dcc.sh +# - ./tests/02_test_quickcheck.sh \ No newline at end of file diff --git a/circtools/circtools.py b/circtools/circtools.py index 366c2b07..da5f6417 100755 --- a/circtools/circtools.py +++ b/circtools/circtools.py @@ -20,8 +20,8 @@ import os.path # global settings -version = "1.1.0-beta" -program_name = "circtest" +version = "1.1.0.4" +program_name = "circtools" # samtools/git like parsing from http://chase-seibert.github.io/blog/2014/03/21/python-multilevel-argparse.html @@ -46,7 +46,7 @@ def __init__(self): description="circtools: a modular, python-based framework for circRNA-related tools that unifies " "several functions in single command line driven software.", usage="""circtools [-V] [] - + Available commands: enrich: circular RNA RBP enrichment scan @@ -332,17 +332,85 @@ def primex(): def detect(): parser = argparse.ArgumentParser( description="circular RNA detection") - # NOT prefixing the argument with -- means it"s not optional - parser.add_argument("-C", - "--params", - dest="cli_params", - help="Defines the input parameters for DCC", - default="--help" - ) - args = parser.parse_args(sys.argv[2:]) + + 
parser.add_argument("--version", action="version", version=version) + parser.add_argument("Input", metavar="Input", nargs="+", + help="Input of the Chimeric.out.junction file from STAR. Alternatively, a sample sheet " + "specifying where your chimeric.out.junction files are, each sample per line, " + "provide with @ prefix (e.g. @samplesheet)") + parser.add_argument("-k", "--keep-temp", dest="temp", action="store_true", default=False, + help="Temporary files will not be deleted [default: False]") + parser.add_argument("-T", "--threads", dest="cpu_threads", type=int, default=2, + help="Number of CPU threads used for computation [default: 2]") + parser.add_argument("-O", "--output", dest="out_dir", default="./", + help="DCC output directory [default: .]") + parser.add_argument("-t", "--temp", dest="tmp_dir", default="_tmp_DCC/", + help="DCC temporary directory [default: _tmp_DCC/]") + + group = parser.add_argument_group("Find circRNA Options", "Options to find circRNAs from STAR output") + group.add_argument("-D", "--detect", action="store_true", dest="detect", default=False, + help="Enable circRNA detection from Chimeric.out.junction files [default: False]") + group.add_argument("-ss", action="store_true", dest="secondstrand", default=False, + help="Must be enabled for stranded libraries, aka 'fr-secondstrand' [default: False]") + group.add_argument("-N", "--nonstrand", action="store_false", dest="strand", default=True, + help="The library is non-stranded [default stranded]") + group.add_argument("-E", "--endTol", dest="endTol", type=int, default=5, choices=range(0, 10), + help="Maximum base pair tolerance of reads extending over junction sites [default: 5]") + group.add_argument("-m", "--maximum", dest="max", type=int, default=1000000, + help="The maximum length of candidate circRNAs (including introns) [default: 1000000]") + group.add_argument("-n", "--minimum", dest="min", type=int, default=30, + help="The minimum length of candidate circRNAs (including introns) 
[default 30]") + group.add_argument("-an", "--annotation", dest="annotate", + help="Gene annotation file in GTF/GFF3 format, to annotate " + "circRNAs by their host gene name/identifier") + + group.add_argument("-Pi", "--PE-independent", action="store_true", dest="pairedendindependent", default=False, + help="Has to be specified if the paired end mates have also been mapped separately." + "If specified, -mt1 and -mt2 must also be provided [default: False]") + group.add_argument("-mt1", "--mate1", dest="mate1", nargs="+", + help="For paired end data, Chimeric.out.junction files from mate1 independent mapping result") + group.add_argument("-mt2", "--mate2", dest="mate2", nargs="+", + help="For paired end data, Chimeric.out.junction files from mate2 independent mapping result") + parser.add_argument_group(group) + + group = parser.add_argument_group("Filtering Options", "Options to filter the circRNA candidates") + group.add_argument("-F", "--filter", action="store_true", dest="filter", default=False, + help="If specified, the program will perform a recommended filter step on the detection results") + group.add_argument("-f", "--filter-only", dest="filteronly", nargs=2, + help="If specified, the program will only filter based on two files provided: " + "1) a coordinates file [BED6 format] and 2) a count file. 
E.g.: -f example.bed counts.txt") + group.add_argument("-M", "--chrM", action="store_true", dest="chrM", default=False, + help="If specified, circRNA candidates located on the mitochondrial chromosome will be removed") + group.add_argument("-R", "--rep_file", dest="rep_file", + help="Custom repetitive region file in GTF format to filter out " + "circRNA candidates in repetitive regions") + group.add_argument("-L", "--Ln", dest="length", type=int, default=50, + help="Minimum length in base pairs to check for repetitive regions [default 50]") + group.add_argument("-Nr", nargs=2, type=int, metavar=("countthreshold", "replicatethreshold"), default=[2, 5], + help="countthreshold replicatethreshold [default: 2,5]") + group.add_argument("-fg", "--filterbygene", action="store_true", dest="filterbygene", default=False, + help="If specified, filter also by gene annotation (candidates are not allowed to span" + " more than one gene) default: False") + parser.add_argument_group(group) + + group = parser.add_argument_group("Host gene count Options", "Options to count host gene expression") + group.add_argument("-G", "--gene", action="store_true", dest="gene", default=False, + help="If specified, the program will count host gene expression given circRNA coordinates " + "[default: False]") + group.add_argument("-C", "--circ", dest="circ", + help="User specified circRNA coordinates, any tab delimited file with first three " + "columns as circRNA coordinates: chr\tstart\tend, which DCC will use to count " + "host gene expression") + group.add_argument("-B", "--bam", dest="bam", nargs="+", + help="A file specifying the mapped BAM files from which host gene expression is computed; " + "must have the same order as input chimeric junction files") + group.add_argument("-A", "--refseq", dest="refseq", + help="Reference sequence FASTA file") + + parser.add_argument_group(group) import os - os.system("DCC " + args.cli_params) + os.system("DCC " + " ".join(sys.argv[2:])) @staticmethod def 
circtest(): @@ -751,17 +819,56 @@ def reconstruct(): parser = argparse.ArgumentParser( description="circular RNA reconstruction") # NOT prefixing the argument with -- means it"s not optional - parser.add_argument("-C", - "--params", - dest="cli_params", - help="Defines the input parameters for DCC", - default="--help" - ) - args = parser.parse_args(sys.argv[2:]) + + # input + parser.add_argument('-C', '--circIDs', dest='circlefile', default='none', + help='Tab-separated file chr:start_end(tab)read1,read2,read3.') + parser.add_argument('-D', '--DCC', dest='CircRNACount', default='none', + help='If you mapped with STAR and are using step1 you need to provide a list' + ' of circle ids (CircRNACount or CircCoordinates from DCC)' + 'You must supply either -C or -DCC') + parser.add_argument('-J', '--chimericJunctions', dest='chimeric_junction', default='none', + help='If you mapped with STAR and are using step1 you need to provide the paired end Chimeric.junction.out file here') + parser.add_argument('-F', '--mate1', dest='mate1', default='none', + help='If you mapped with STAR and are using step1 you need to provide the mate1.Chimeric.junction.out file here (optional if ends were mapped separately)') + parser.add_argument('-R', '--mate2', dest='mate2', default='none', + help='If you mapped with STAR and are using step1 you need to provide the mate2.Chimeric.junction.out file here (optional if ends were mapped separately)') + parser.add_argument('-B', '--bamfile', dest='bamfile', required=True, + help='BAM file containing chimeric reads, linear reads may be in it but are not required.') + parser.add_argument('-A', '--annotation', dest='bedfile', required=True, + help='bed formatted feature file including exons.') + # output + parser.add_argument('-O', '--outFolder', dest='out_folder', default='.', + help='Output folder. 
There will be a sub folder for the sample containing a BAM file ' + 'for each circle.') + parser.add_argument('-N', '--sampleName', dest='sample', required=True, + help='sample name to title every thing.') + + # options + parser.add_argument('-r', '--thresholdReads', dest='reads', default=5, type=int, + help='Circle has to have at least reads to be analysed.') + + # TODO: default: no multi map + parser.add_argument('-q', '--thresholdMapq', dest='mapq', default=3, type=int, + help='MAPQ cutoff, only reads passing this threshold will be written to circle BAM file.') + # TODO: add 0 based info + parser.add_argument('-c', '--splitCharacter', dest='split_character', default='_', + help='feature name separator.') + parser.add_argument('-e', '--exonIndex', dest='exon_index', default=3, type=int, + help='Field indicating the exon number after splitting feature name by split_character (for the annotation file).') + parser.add_argument('-p', '--annotationFormat', dest='ref_platform', default='refseq', + help='Specifies the annotation platform which was used (refseq or ensembl)') + parser.add_argument('-s', '--skipSteps', dest='skipped_steps', default='none', + help='Comma separated list of steps that should be skipped (e.g. 
step3,step4,step6)') + parser.add_argument('-T', '--tmp', dest='tmp_folder', default='/tmp/', + help='Folder to store temporary files generated by pybedtools.') + + parser.add_argument('-P', '--cpus', dest='num_cpus', default=4, type=int, + help='Number of CPUs used.') import os - os.system("FUCHS " + args.cli_params) + os.system("FUCHS " + " ".join(sys.argv[2:])) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/circtools/exon_usage/exon_usage.py b/circtools/exon_usage/exon_usage.py index bedf1a07..ddfb23a4 100755 --- a/circtools/exon_usage/exon_usage.py +++ b/circtools/exon_usage/exon_usage.py @@ -124,7 +124,7 @@ def run_module(self): # ------------------------------------ need to call the correct R script here ----------------------- # need to define path top R wrapper - exon_script = 'circtools_exon' + exon_script = 'circtools_exon_wrapper.R' # Variable number of args in a list args = [ @@ -143,4 +143,4 @@ def run_module(self): # ------------------------------------ run script and check output ----------------------- import os - os.system(exon_script + " " + ' '.join(str(e) for e in args)) + os.system(exon_script + " " + ' '.join(str(e) for e in args)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e5fc3c6f..b544ed8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.14.5 pybedtools>=0.7.10 -biopython >= 1.71 \ No newline at end of file +biopython >= 1.71 +scipy>=0.19.0 \ No newline at end of file diff --git a/scripts/circtools_exon b/scripts/circtools_exon deleted file mode 120000 index 36200682..00000000 --- a/scripts/circtools_exon +++ /dev/null @@ -1 +0,0 @@ -exon_usage_circtools_wrapper.R \ No newline at end of file diff --git a/scripts/exon_usage_circtools_wrapper.R b/scripts/circtools_exon_wrapper.R similarity index 99% rename from scripts/exon_usage_circtools_wrapper.R rename to scripts/circtools_exon_wrapper.R index 6f0b4594..78e92586 100755 --- 
a/scripts/exon_usage_circtools_wrapper.R +++ b/scripts/circtools_exon_wrapper.R @@ -429,7 +429,6 @@ colnames(RNAse_RenrichedCircTest) <- c( "Gene", "Start.End.Region", "OverallRegion", "sig_p", - "NA", "GeneID", "NExons", "P.Value", @@ -482,8 +481,7 @@ colnames(circTestSummary) <- c( "Chr", "Strand", "Start.End.Region", "OverallRegion", - "sig_p", - "NA" + "sig_p" ) addWorksheet(wb, sheetName = "Other BSJ FDR 1%") diff --git a/scripts/circtools_quickcheck_wrapper.R b/scripts/circtools_quickcheck_wrapper.R index 24b18134..bf44c816 100755 --- a/scripts/circtools_quickcheck_wrapper.R +++ b/scripts/circtools_quickcheck_wrapper.R @@ -152,10 +152,10 @@ star_runs <- star_runs[endsWith(star_runs, arg_starfolder_suffix)] star_runs <- star_runs[!grepl("*mate*", star_runs)] # check columns to remove -if (arg_remove_columns != "0" && length(arg_remove_columns) > 0) { - tmp <- unlist(lapply(arg_remove_columns, function(x){x-3})) - star_runs <- star_runs[-tmp] -} +# if (arg_remove_columns != "0" && length(arg_remove_columns) > 0) { +# tmp <- unlist(lapply(arg_remove_columns, function(x){x-3})) +# star_runs <- star_runs[-tmp] +# } # new empty list uniquely_mapped_reads <- numeric(); diff --git a/scripts/install_R_dependencies.R b/scripts/install_R_dependencies.R index ab4ef26e..deb486d6 100755 --- a/scripts/install_R_dependencies.R +++ b/scripts/install_R_dependencies.R @@ -17,8 +17,8 @@ # set mirrors -source("https://bioconductor.statistik.tu-dortmund.de/biocLite.R") -options(repos = c(CRAN = "https://cran.uni-muenster.de/")) +source("http://bioconductor.org/biocLite.R") +options(repos = c(CRAN = "http://cran.uni-muenster.de/")) biocLite() pkgs <- c( diff --git a/scripts/install_external.sh b/scripts/install_external.sh index e3d12607..cb20d25e 100755 --- a/scripts/install_external.sh +++ b/scripts/install_external.sh @@ -15,6 +15,18 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+function detect_os { + unameOut="$(uname -s)" + case "${unameOut}" in + Linux*) machine=Linux;; + Darwin*) machine=Mac;; + CYGWIN*) machine=Cygwin;; + MINGW*) machine=MinGw;; + *) machine="UNKNOWN:${unameOut}" + esac + echo ${machine} +} + function install_bedtools { cd /tmp/ wget https://github.com/arq5x/bedtools2/releases/download/v2.27.1/bedtools-2.27.1.tar.gz @@ -30,12 +42,24 @@ function install_bedtools { # install statsmodels first, does not work in setup.py due to # https://github.com/dieterich-lab/circtools/issues/55 -pip3 install statsmodels +# pip3 install statsmodels +pip install pysam==0.13.0 # install dependencies for R first -Rscript scripts/install_R_dependencies.R +if [ "$TRAVISBUILD" ]; then + if [ "$INSTALL_R_PACKAGES" ]; then + sudo Rscript scripts/install_R_dependencies.R + fi +else + Rscript scripts/install_R_dependencies.R +fi BEDTOOLS=`which bedtools` +OS=`detect_os` + +if [ "$OS" = "Mac" ]; then + brew install libgit2 +fi if [ $BEDTOOLS ]; then @@ -47,23 +71,60 @@ if [ $BEDTOOLS ]; then install_bedtools fi else - install_bedtools + echo "install_bedtools" fi +if [ "$OS" = "Mac" ]; then + + echo "checking for libgit2" + if ! [[ `brew ls --versions libgit2` ]]; then + brew install libgit2 + fi + + echo "checking for R" + if ! [[ `brew ls --versions R` ]]; then + brew install R + fi + + echo "checking for python3" + if ! [[ `brew ls --versions python@3` ]]; then + # this is the formula for python 3.6 + # python 3.7 currently does not work with pysam + brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f2a764ef944b1080be64bd88dca9a1d80130c558/Formula/python.rb + fi + + echo "checking for python2" + if ! 
[[ `brew ls --versions python@2` ]]; then + brew install python@2 + fi + +fi + +echo "VENV: $VIRTUAL_ENV" + # install DCC cd /tmp/ git clone https://github.com/dieterich-lab/DCC.git -cd DCC -python2 setup.py install --user +#cd DCC +if [ "$OS" = "Mac" ]; then + #python2 setup.py install --force + #echo "python2 setup.py install" + pip2 install DCC/ +else + pip2 install DCC/ --user +fi # install FUCHS -cd .. -git clone https://github.com/dieterich-lab/FUCHS.git -cd FUCHS -python2 setup.py install --user +# cd .. + git clone https://github.com/dieterich-lab/FUCHS.git + #cd FUCHS + if [ "$OS" = "Mac" ]; then + pip2 install FUCHS/ + else + pip2 install FUCHS/ --user + fi # remove all temporary files #rm /tmp/FUCHS/ -rf #rm /tmp/DCC/ -rf - diff --git a/setup.py b/setup.py index 08ef4e23..67cbc6c1 100755 --- a/setup.py +++ b/setup.py @@ -96,7 +96,7 @@ def run(self): # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.1.0.2', + version='1.1.0.4', description='circtools - a circular RNA toolbox', long_description=long_description, @@ -159,9 +159,11 @@ def run(self): # requirements files see: # https://packaging.python.org/en/latest/requirements.html install_requires=[ + 'pysam == 0.13.0', 'numpy>=1.14.5', 'pybedtools>=0.7.10', - 'biopython >= 1.71' + 'biopython >= 1.71', + 'scipy>=0.19.0' ], python_requires='>=3.4', @@ -205,7 +207,7 @@ def run(self): 'scripts/circtools', 'scripts/circtools_detect_write_skip_tracks.pl', 'scripts/circtools_enrich_visualization.R', - 'scripts/circtools_exon', + 'scripts/circtools_exon_wrapper.R', 'scripts/circtools_quickcheck_wrapper.R', 'scripts/circtools_reconstruct_visualization.R', 'scripts/circtools_primex_wrapper.R', diff --git a/tests/01_test_dcc.sh b/tests/01_test_dcc.sh new file mode 100755 index 00000000..059a8bd1 --- /dev/null +++ b/tests/01_test_dcc.sh @@ -0,0 +1,52 @@ 
+#!/bin/bash + +# Copyright (C) 2018 Tobias Jakobi +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +function detect_os { + unameOut="$(uname -s)" + case "${unameOut}" in + Linux*) machine=Linux;; + Darwin*) machine=Mac;; + CYGWIN*) machine=Cygwin;; + MINGW*) machine=MinGw;; + *) machine="UNKNOWN:${unameOut}" + esac + echo ${machine} +} + +OS=`detect_os` + +# get basic test data (human genome) +wget https://data.dieterichlab.org/s/eikQFHKFstSgbrp/download -O 00_base.tar.bz2 +tar jxvf 00_base.tar.bz2 + +# get test data for DCC +wget https://data.dieterichlab.org/s/pn7QHoQJmtD44Fo/download -O 01_dcc.tar.bz2 +tar jxvf 01_dcc.tar.bz2 + +# get annotation test data (human genome, chr1 GTF) +wget https://data.dieterichlab.org/s/emNDzztToQoyerz/download -O chr1.gtf.bz2 +bunzip2 chr1.gtf.bz2 + +# change into working dir +cd 01_dcc/ + +# execute DCC +circtools detect @samplesheet -ss -T 2 -D -an ../chr1.gtf -A ../00_base/GRCh38_85.fa -R ../00_base/GRCh38_85_repeatmasker.gtf -B @bam_files.txt -M -Nr 2 2 -fg -G -t /tmp/ -F -L 20 -k -O ./ + +cat *.log + +md5sum -c ../tests/md5_dcc.txt diff --git a/tests/02_test_quickcheck.sh b/tests/02_test_quickcheck.sh new file mode 100755 index 00000000..1a906823 --- /dev/null +++ b/tests/02_test_quickcheck.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Copyright (C) 2018 Tobias Jakobi +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General 
Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# get basic test data (for QC module) +wget https://data.dieterichlab.org/s/tzTtR6oZem5bgQd/download -O 02_quickcheck.tar.bz2 +tar jxvf 02_quickcheck.tar.bz2 + +# change into working dir +cd 02_quickcheck/ + +# execute quickcheck +circtools quickcheck -d dcc_out/ -s logs/ -l AA,BB -g 1,2,1,2 -R 4,5,6,7,8,9,10,11,12,13,14,15,20,21 diff --git a/tests/md5_dcc.txt b/tests/md5_dcc.txt new file mode 100644 index 00000000..1b69ff2a --- /dev/null +++ b/tests/md5_dcc.txt @@ -0,0 +1,4 @@ +8e0836fb0c1d70e45577a5123d12d59c CircCoordinates +eb25fc2b380a22ae5da9499674d28229 CircRNACount +83e83f6615f9df0860be34517d56ae67 CircSkipJunctions +c5f89c7d2c98cd734a18ac85d771705e LinearCount