Skip to content

Commit 4fa2b08

Browse files
committed
Add count kmers methods to SequenceDataset.
1 parent cc368a5 commit 4fa2b08

File tree

3 files changed

+74
-9
lines changed

3 files changed

+74
-9
lines changed

adam-core/src/main/scala/org/bdgenomics/adam/ds/read/AlignmentDataset.scala

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -747,7 +747,7 @@ sealed abstract class AlignmentDataset extends AvroReadGroupGenomicDataset[Align
747747
}
748748

749749
/**
750-
* Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.
750+
* (Scala-specific) Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.
751751
*
752752
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
753753
* @return Returns an RDD containing k-mer/count pairs.
@@ -762,7 +762,21 @@ sealed abstract class AlignmentDataset extends AvroReadGroupGenomicDataset[Align
762762
}
763763

764764
/**
765-
* Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.
765+
* (Java-specific) Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.
766+
*
767+
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
768+
* @return Returns a JavaRDD containing k-mer/count pairs.
769+
*/
770+
def countKmers(kmerLength: java.lang.Integer): JavaRDD[(String, java.lang.Long)] = {
771+
val k: Int = kmerLength
772+
countKmers(k).map(p => {
773+
(p._1, p._2: java.lang.Long)
774+
}).toJavaRDD()
775+
}
776+
777+
/**
778+
* Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer
779+
* as a Dataset.
766780
*
767781
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
768782
* @return Returns a Dataset containing k-mer/count pairs.

adam-core/src/main/scala/org/bdgenomics/adam/ds/sequence/SequenceDataset.scala

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package org.bdgenomics.adam.ds.sequence
1919

2020
import org.apache.parquet.hadoop.metadata.CompressionCodecName
2121
import org.apache.spark.SparkContext
22+
import org.apache.spark.api.java.JavaRDD
2223
import org.apache.spark.api.java.function.{ Function => JFunction }
2324
import org.apache.spark.rdd.RDD
2425
import org.apache.spark.sql.Dataset
@@ -527,6 +528,54 @@ sealed abstract class SequenceDataset extends AvroGenomicDataset[Sequence, Seque
527528
disableFastConcat = disableFastConcat)
528529
}
529530

531+
/**
532+
* (Scala-specific) Cuts sequences into _k_-mers, and then counts the number of occurrences of each _k_-mer.
533+
*
534+
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
535+
* @return Returns an RDD containing k-mer/count pairs.
536+
*/
537+
def countKmers(kmerLength: Int): RDD[(String, Long)] = {
538+
rdd.flatMap(r => {
539+
// cut each sequence into k-mers, and attach a count of 1L
540+
r.getSequence
541+
.sliding(kmerLength)
542+
.map(k => (k, 1L))
543+
}).reduceByKey((k1: Long, k2: Long) => k1 + k2)
544+
}
545+
546+
/**
547+
* (Java-specific) Cuts sequences into _k_-mers, and then counts the number of occurrences of each _k_-mer.
548+
*
549+
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
550+
* @return Returns a JavaRDD containing k-mer/count pairs.
551+
*/
552+
def countKmers(kmerLength: java.lang.Integer): JavaRDD[(String, java.lang.Long)] = {
553+
val k: Int = kmerLength
554+
countKmers(k).map(p => {
555+
(p._1, p._2: java.lang.Long)
556+
}).toJavaRDD()
557+
}
558+
559+
/**
560+
* Cuts sequences into _k_-mers, and then counts the number of occurrences of each _k_-mer
561+
* as a Dataset.
562+
*
563+
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
564+
* @return Returns a Dataset containing k-mer/count pairs.
565+
*/
566+
def countKmersAsDataset(kmerLength: Int): Dataset[(String, Long)] = {
567+
import spark.implicits._
568+
val kmers = dataset.select($"sequence".as[String])
569+
.flatMap(_.sliding(kmerLength))
570+
.as[String]
571+
572+
kmers.toDF()
573+
.groupBy($"value")
574+
.count()
575+
.select($"value".as("kmer"), $"count".as("count"))
576+
.as[(String, Long)]
577+
}
578+
530579
/**
531580
* @param newRdd The RDD to replace the underlying RDD with.
532581
* @param newPartitionMap New partition map, if any.

adam-core/src/main/scala/org/bdgenomics/adam/ds/sequence/SliceDataset.scala

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,7 @@ sealed abstract class SliceDataset extends AvroGenomicDataset[Slice, SliceProduc
548548
}
549549

550550
/**
551-
* (Java-friendly) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent
551+
* (Java-specific) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent
552552
* slices now overlap by _n_ bases, where _n_ is the flank length.
553553
*
554554
* @param flankLength The length to extend adjacent slices by.
@@ -560,7 +560,7 @@ sealed abstract class SliceDataset extends AvroGenomicDataset[Slice, SliceProduc
560560
}
561561

562562
/**
563-
* (Scala-friendly) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent
563+
* (Scala-specific) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent
564564
* slices now overlap by _n_ bases, where _n_ is the flank length.
565565
*
566566
* @param flankLength The length to extend adjacent slices by.
@@ -573,9 +573,10 @@ sealed abstract class SliceDataset extends AvroGenomicDataset[Slice, SliceProduc
573573
}
574574

575575
/**
576-
* (Scala-friendly) Counts the k-mers contained in this genomic dataset of slices.
576+
* (Scala-specific) Cuts slices after flanking into _k_-mers, and then counts the
577+
* number of occurrences of each _k_-mer.
577578
*
578-
* @param kmerLength The length of k-mers to count.
579+
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
579580
* @return Returns an RDD containing k-mer/count pairs.
580581
*/
581582
def countKmers(kmerLength: Int): RDD[(String, Long)] = {
@@ -596,10 +597,11 @@ sealed abstract class SliceDataset extends AvroGenomicDataset[Slice, SliceProduc
596597
}
597598

598599
/**
599-
* (Java-friendly) Counts the k-mers contained in this genomic dataset of slices.
600+
* (Java-specific) Cuts slices after flanking into _k_-mers, and then counts the
601+
* number of occurrences of each _k_-mer.
600602
*
601-
* @param kmerLength The length of k-mers to count.
602-
* @return Returns an RDD containing k-mer/count pairs.
603+
* @param kmerLength The value of _k_ to use for cutting _k_-mers.
604+
* @return Returns a JavaRDD containing k-mer/count pairs.
603605
*/
604606
def countKmers(
605607
kmerLength: java.lang.Integer): JavaRDD[(String, java.lang.Long)] = {

0 commit comments

Comments
 (0)