Skip to content

Commit f91a261

Browse files
author
Rob Patro
committed
Merge branch 'develop'
2 parents c707902 + 2a3765f commit f91a261

File tree

9 files changed

+124
-31
lines changed

9 files changed

+124
-31
lines changed

current_version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
VERSION_MAJOR 1
22
VERSION_MINOR 5
3-
VERSION_PATCH 0
3+
VERSION_PATCH 1

doc/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
# The short X.Y version.
5858
version = '1.5'
5959
# The full version, including alpha/beta/rc tags.
60-
release = '1.5.0'
60+
release = '1.5.1'
6161

6262
# The language for content autogenerated by Sphinx. Refer to documentation
6363
# for a list of supported languages.

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ MAINTAINER [email protected]
66

77
ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \
88
ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget
9-
ENV SALMON_VERSION 1.5.0
9+
ENV SALMON_VERSION 1.5.1
1010

1111
# salmon binary will be installed in /home/salmon/bin/salmon
1212

docker/build_test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
#! /bin/bash
2-
SALMON_VERSION=1.5.0
2+
SALMON_VERSION=1.5.1
33
docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest .

include/SalmonConfig.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
namespace salmon {
2828
constexpr char majorVersion[] = "1";
2929
constexpr char minorVersion[] = "5";
30-
constexpr char patchVersion[] = "0";
31-
constexpr char version[] = "1.5.0";
30+
constexpr char patchVersion[] = "1";
31+
constexpr char version[] = "1.5.1";
3232
constexpr uint32_t indexVersion = 5;
3333
constexpr char requiredQuasiIndexVersion[] = "p7";
3434
} // namespace salmon

include/SingleCellProtocols.hpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,13 @@ namespace alevin{
101101
maxValue(maxValue_){
102102
alevin::types::AlevinUMIKmer::k(umiLength);
103103
}
104-
// NOTE: these do nothing but satisfy
105-
// template requirements right now
106-
void set_umi_geo(TagGeometry& g) { (void)g; };
107-
void set_bc_geo(TagGeometry& g) { (void)g; };
104+
// NOTE: these functions are duplicated
105+
// with those in `CustomGeometry` below, and
106+
// due to semantics have slightly different
107+
// implementations. See if the design can be
108+
// unified later.
109+
void set_umi_geo(TagGeometry& g) { umiLength = g.length(); };
110+
void set_bc_geo(TagGeometry& g) { barcodeLength = g.length(); };
108111
void set_read_geo(TagGeometry& g) { (void)g; };
109112
uint32_t barcode_length() const { return barcodeLength; }
110113
uint32_t umi_length() const { return umiLength; }
@@ -185,14 +188,15 @@ namespace alevin{
185188
TagGeometry bc_geo;
186189
TagGeometry read_geo;
187190

188-
void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length1 + umi_geo.length2; }
189-
void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length1 + bc_geo.length2; }
191+
void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length(); }
192+
void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length(); }
190193
void set_read_geo(TagGeometry& g) { read_geo = g; }
191194
uint32_t barcode_length() const { return barcodeLength; }
192195
uint32_t umi_length() const { return umiLength; }
193196

194-
// These values do nothing in this class except
195-
// maintain template compat ... fix this design later.
197+
// These values are set only when `set_umi_geo` and
198+
// `set_bc_geo` are called. See if this design can
199+
// be better integrated with `Rule` later.
196200
uint32_t barcodeLength, umiLength, maxValue;
197201
BarcodeEnd end;
198202
};

scripts/fetchPufferfish.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then
2222
rm -fr ${INSTALL_DIR}/src/pufferfish
2323
fi
2424

25-
SVER=salmon-v1.5.0
25+
SVER=salmon-v1.5.1
2626
#SVER=develop
2727
#SVER=sketch-mode
2828

29-
EXPECTED_SHA256=e72470c58a7f9b1f66dece73ebf27df07b24b1585d4b155466f165dc9dfcf586
29+
EXPECTED_SHA256=468e0c23a32d81524f7acadc8326efb155628970c15fd6cb843d26a61478bfde
3030

3131
mkdir -p ${EXTERNAL_DIR}
3232
curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip

src/Alevin.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1038,7 +1038,7 @@ salmon-based processing of single-cell RNA-seq data.
10381038
if (celseq) validate_num_protocols += 1;
10391039
if (celseq2) validate_num_protocols += 1;
10401040
if (quartzseq2) validate_num_protocols += 1;
1041-
if (custom and !noTgMap) validate_num_protocols += 1;
1041+
if (custom) validate_num_protocols += 1;
10421042

10431043
if ( validate_num_protocols != 1 ) {
10441044
fmt::print(stderr, "ERROR: Please specify one and only one scRNA protocol;");

src/SalmonAlevin.cpp

Lines changed: 103 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,14 @@ namespace alevin{
149149

150150
constexpr uint32_t miniBatchSize{5000};
151151

152+
// NOTE: Consider the implications of using uint32_t below.
153+
// Currently, this should not limit the barcode length to <= 16 in
154+
// "fry" mode (either sla or sketch) since we never actually put the
155+
// barcode in the corresponding variable of the AlignmentGroup class.
156+
// In traditional alevin, this should be referring to a barcode *index*
157+
// rather than the sequence itself, so it should be OK so long as we have
158+
// < std::numeric_limits<uint32_t>::max() distinct barcodes / reads.
159+
// However, that assumption should be tested more thoroughly.
152160
using CellBarcodeT = uint32_t;
153161
using UMIBarcodeT = uint64_t;
154162

@@ -499,7 +507,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
499507
fw_score += score_inc;
500508
++fw_hits;
501509
added = true;
502-
}
510+
}
503511
return added;
504512
}
505513

@@ -515,9 +523,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
515523
// This ensures that we don't double-count a k-mer that
516524
// might occur twice on this target.
517525
if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
518-
approx_pos_rc = ref_pos - read_pos;
526+
approx_pos_rc = ref_pos;
519527
if (last_read_pos_rc == -1) {
520-
approx_end_pos_rc = ref_pos - read_pos;
528+
approx_end_pos_rc = ref_pos + read_pos;
521529
} else {
522530
if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false;}
523531
}
@@ -555,6 +563,13 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
555563
SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits<uint32_t>::max()};
556564
}
557565

566+
inline std::string to_string() {
567+
std::stringstream ss;
568+
ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw
569+
<< " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc;
570+
return ss.str();
571+
}
572+
558573
int32_t last_read_pos_fw{-1};
559574
int32_t last_read_pos_rc{-1};
560575

@@ -648,6 +663,12 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
648663
extraBAMtags.clear();
649664
bool seqOk;
650665

666+
// keep track of the *least* freqeuntly
667+
// occurring hit in this fragment to consider
668+
// alternative processing of fragments where
669+
// all observed hits have high frequency.
670+
uint64_t alt_max_occ = 0;
671+
651672
if (alevinOpts.protocol.end == bcEnd::FIVE ||
652673
alevinOpts.protocol.end == bcEnd::THREE){
653674
bool extracted_bc = aut::extractBarcode(rp.first.seq, rp.second.seq, alevinOpts.protocol, barcode);
@@ -692,14 +713,26 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
692713
decltype(raw_hits[0].first) prev_read_pos = -1;
693714
// the maximum span the supporting k-mers of a
694715
// mapping position are allowed to have.
695-
int32_t max_stretch = static_cast<int32_t>(readSubSeq->length() * 1.5);
716+
// NOTE this is still > read_length b/c the stretch is measured wrt the
717+
// START of the terminal k-mer.
718+
int32_t max_stretch = static_cast<int32_t>(readSubSeq->length() * 1.0);
696719

697720
// a raw hit is a pair of read_pos and a projected hit
721+
722+
// the least frequent hit for this fragment.
723+
uint64_t min_occ = std::numeric_limits<uint64_t>::max();
724+
725+
// this is false by default and will be set to true
726+
// if *every* collected hit for this fragment occurs
727+
// salmonOpts.maxReadOccs times or more.
728+
bool had_alt_max_occ = false;
729+
698730
for (auto& raw_hit : raw_hits) {
699731
auto& read_pos = raw_hit.first;
700732
auto& proj_hits = raw_hit.second;
701733
auto& refs = proj_hits.refRange;
702734
uint64_t num_occ = static_cast<uint64_t>(refs.size());
735+
min_occ = std::min(min_occ, num_occ);
703736

704737
// SANITY
705738
if (read_pos <= prev_read_pos) {
@@ -729,6 +762,61 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
729762
} // DONE : if (static_cast<uint64_t>(refs.size()) < salmonOpts.maxReadOccs)
730763
} // DONE : for (auto& raw_hit : raw_hits)
731764

765+
// If our default threshold was too stringent, then set a more liberal
766+
// threshold and look up the k-mers that occur the least frequently.
767+
// Specifically, if the min occuring hits have frequency < min_thresh_prime (2500 by default)
768+
// times, then collect the min occuring hits to get the mapping.
769+
// TODO: deal with code duplication below.
770+
size_t max_allowed_occ = 2500;
771+
if ((min_occ >= salmonOpts.maxReadOccs) and (min_occ < max_allowed_occ)) {
772+
prev_read_pos = -1;
773+
max_allowed_occ = min_occ;
774+
for (auto& raw_hit : raw_hits) {
775+
auto& read_pos = raw_hit.first;
776+
auto& proj_hits = raw_hit.second;
777+
auto& refs = proj_hits.refRange;
778+
uint64_t num_occ = static_cast<uint64_t>(refs.size());
779+
min_occ = std::min(min_occ, num_occ);
780+
had_alt_max_occ = true;
781+
782+
// SANITY
783+
if (read_pos <= prev_read_pos) {
784+
salmonOpts.jointLog->warn(
785+
"read_pos : {}, prev_read_pos : {}", read_pos,
786+
prev_read_pos);
787+
}
788+
789+
prev_read_pos = read_pos;
790+
if (num_occ <= max_allowed_occ) {
791+
792+
++num_valid_hits;
793+
total_occs += num_occ;
794+
largest_occ =
795+
(num_occ > largest_occ) ? num_occ : largest_occ;
796+
float score_inc = 1.0 / num_occ;
797+
perfect_score += score_inc;
798+
799+
for (auto& pos_it : refs) {
800+
const auto& ref_pos_ori = proj_hits.decodeHit(pos_it);
801+
uint32_t tid = static_cast<uint32_t>(
802+
qidx->getRefId(pos_it.transcript_id()));
803+
int32_t pos = static_cast<int32_t>(ref_pos_ori.pos);
804+
bool ori = ref_pos_ori.isFW;
805+
if (ori) {
806+
hit_map[tid].add_fw(pos,
807+
static_cast<int32_t>(read_pos),
808+
max_stretch, score_inc);
809+
} else {
810+
hit_map[tid].add_rc(pos,
811+
static_cast<int32_t>(read_pos),
812+
max_stretch, score_inc);
813+
}
814+
} // DONE: for (auto &pos_it : refs)
815+
} // DONE : if (static_cast<uint64_t>(refs.size()) <
816+
// salmonOpts.maxReadOccs)
817+
} // DONE : for (auto& raw_hit : raw_hits)
818+
}
819+
732820
//float perfect_score = static_cast<float>(num_valid_hits) / total_occs;
733821
float acceptable_score = (num_valid_hits == 1) ? perfect_score :
734822
perfect_score - (1.0f / largest_occ);
@@ -760,6 +848,8 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
760848
}
761849
}
762850

851+
alt_max_occ = had_alt_max_occ ? accepted_hits.size() : salmonOpts.maxReadOccs;
852+
763853
/*
764854
if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) {
765855
for (auto& kv : hit_map) {
@@ -796,9 +886,8 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
796886
std::max(readLenLeft, readLenRight));
797887
}
798888

799-
800889
// If the read mapped to > maxReadOccs places, discard it
801-
if (accepted_hits.size() > salmonOpts.maxReadOccs) {
890+
if (accepted_hits.size() > alt_max_occ ) {
802891
accepted_hits.clear();
803892
} else if (!accepted_hits.empty()) {
804893
mapType = salmon::utils::MappingType::SINGLE_MAPPED;
@@ -814,9 +903,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
814903

815904
// bc
816905
// if we can fit the barcode into an integer
817-
if ( alevinOpts.protocol.barcodeLength <= 32 ) {
906+
if ( barcodeLength <= 32 ) {
818907
if (barcode_ok) {
819-
if (alevinOpts.protocol.barcodeLength <= 16) { // can use 32-bit int
908+
if ( barcodeLength <= 16 ) { // can use 32-bit int
820909
uint32_t shortbck = static_cast<uint32_t>(0x00000000FFFFFFFF & bck.word(0));
821910
bw << shortbck;
822911
} else { // must use 64-bit int
@@ -828,11 +917,11 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
828917
}
829918

830919
// umi
831-
if ( alevinOpts.protocol.barcodeLength <= 16 ) { // if we can use 32-bit int
920+
if ( umiLength <= 16 ) { // if we can use 32-bit int
832921
uint64_t umiint = jointHitGroup.umi();
833922
uint32_t shortumi = static_cast<uint32_t>(0x00000000FFFFFFFF & umiint);
834923
bw << shortumi;
835-
} else if ( alevinOpts.protocol.barcodeLength <= 32 ) { // if we can use 64-bit int
924+
} else if ( umiLength <= 32 ) { // if we can use 64-bit int
836925
uint64_t umiint = jointHitGroup.umi();
837926
bw << umiint;
838927
} else { // must use string
@@ -1255,11 +1344,11 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea
12551344

12561345
// bc
12571346
// if we can if the barcode into an integer
1258-
if ( alevinOpts.protocol.barcodeLength <= 32 ) {
1347+
if ( barcodeLength <= 32 ) {
12591348

12601349
if (barcode_ok) {
12611350
//alevinOpts.jointLog->info("BARCODE : {} \t ENC : {} ", barcode, bck.word(0));
1262-
if (alevinOpts.protocol.barcodeLength <= 16) { // can use 32-bit int
1351+
if ( barcodeLength <= 16 ) { // can use 32-bit int
12631352
uint32_t shortbck = static_cast<uint32_t>(0x00000000FFFFFFFF & bck.word(0));
12641353
//alevinOpts.jointLog->info("shortbck : {} ", shortbck);
12651354
bw << shortbck;
@@ -1272,11 +1361,11 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea
12721361
}
12731362

12741363
// umi
1275-
if ( alevinOpts.protocol.barcodeLength <= 16 ) { // if we can use 32-bit int
1364+
if ( umiLength <= 16 ) { // if we can use 32-bit int
12761365
uint64_t umiint = jointHitGroup.umi();
12771366
uint32_t shortumi = static_cast<uint32_t>(0x00000000FFFFFFFF & umiint);
12781367
bw << shortumi;
1279-
} else if ( alevinOpts.protocol.barcodeLength <= 32 ) { // if we can use 64-bit int
1368+
} else if ( umiLength <= 32 ) { // if we can use 64-bit int
12801369
uint64_t umiint = jointHitGroup.umi();
12811370
bw << umiint;
12821371
} else { // must use string

0 commit comments

Comments
 (0)