@@ -149,6 +149,14 @@ namespace alevin{
149149
150150 constexpr uint32_t miniBatchSize{5000 };
151151
152+ // NOTE: Consider the implications of using uint32_t below.
153+ // Currently, this should not limit the barcode length to <= 16 in
154+ // "fry" mode (either sla or sketch) since we never actually put the
155+ // barcode in the corresponding variable of the AlignmentGroup class.
156+ // In traditional alevin, this should be referring to a barcode *index*
157+ // rather than the sequence itself, so it should be OK so long as we have
158+ // < std::numeric_limits<uint32_t>::max() distinct barcodes / reads.
159+ // However, that assumption should be tested more thoroughly.
152160 using CellBarcodeT = uint32_t ;
153161 using UMIBarcodeT = uint64_t ;
154162
@@ -499,7 +507,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
499507 fw_score += score_inc;
500508 ++fw_hits;
501509 added = true ;
502- }
510+ }
503511 return added;
504512 }
505513
@@ -515,9 +523,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
515523 // This ensures that we don't double-count a k-mer that
516524 // might occur twice on this target.
517525 if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) {
518- approx_pos_rc = ref_pos - read_pos ;
526+ approx_pos_rc = ref_pos;
519527 if (last_read_pos_rc == -1 ) {
520- approx_end_pos_rc = ref_pos - read_pos;
528+ approx_end_pos_rc = ref_pos + read_pos;
521529 } else {
522530 if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false ;}
523531 }
@@ -555,6 +563,13 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
555563 SimpleHit{false , approx_pos_rc, rc_score, rc_hits, std::numeric_limits<uint32_t >::max ()};
556564 }
557565
566+ inline std::string to_string () {
567+ std::stringstream ss;
568+ ss << " fw_hits: " << fw_hits << " , fw_score : " << fw_score << " , fw_pos : " << approx_pos_fw
569+ << " || rc_hits: " << rc_hits << " , rc_score: " << rc_score << " , rc_pos: " << approx_pos_rc;
570+ return ss.str ();
571+ }
572+
558573 int32_t last_read_pos_fw{-1 };
559574 int32_t last_read_pos_rc{-1 };
560575
@@ -648,6 +663,12 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
648663 extraBAMtags.clear ();
649664 bool seqOk;
650665
666+ // keep track of the *least* freqeuntly
667+ // occurring hit in this fragment to consider
668+ // alternative processing of fragments where
669+ // all observed hits have high frequency.
670+ uint64_t alt_max_occ = 0 ;
671+
651672 if (alevinOpts.protocol .end == bcEnd::FIVE ||
652673 alevinOpts.protocol .end == bcEnd::THREE){
653674 bool extracted_bc = aut::extractBarcode (rp.first .seq , rp.second .seq , alevinOpts.protocol , barcode);
@@ -692,14 +713,26 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
692713 decltype (raw_hits[0 ].first ) prev_read_pos = -1 ;
693714 // the maximum span the supporting k-mers of a
694715 // mapping position are allowed to have.
695- int32_t max_stretch = static_cast <int32_t >(readSubSeq->length () * 1.5 );
716+ // NOTE this is still > read_length b/c the stretch is measured wrt the
717+ // START of the terminal k-mer.
718+ int32_t max_stretch = static_cast <int32_t >(readSubSeq->length () * 1.0 );
696719
697720 // a raw hit is a pair of read_pos and a projected hit
721+
722+ // the least frequent hit for this fragment.
723+ uint64_t min_occ = std::numeric_limits<uint64_t >::max ();
724+
725+ // this is false by default and will be set to true
726+ // if *every* collected hit for this fragment occurs
727+ // salmonOpts.maxReadOccs times or more.
728+ bool had_alt_max_occ = false ;
729+
698730 for (auto & raw_hit : raw_hits) {
699731 auto & read_pos = raw_hit.first ;
700732 auto & proj_hits = raw_hit.second ;
701733 auto & refs = proj_hits.refRange ;
702734 uint64_t num_occ = static_cast <uint64_t >(refs.size ());
735+ min_occ = std::min (min_occ, num_occ);
703736
704737 // SANITY
705738 if (read_pos <= prev_read_pos) {
@@ -729,6 +762,61 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
729762 } // DONE : if (static_cast<uint64_t>(refs.size()) < salmonOpts.maxReadOccs)
730763 } // DONE : for (auto& raw_hit : raw_hits)
731764
765+ // If our default threshold was too stringent, then set a more liberal
766+ // threshold and look up the k-mers that occur the least frequently.
767+ // Specifically, if the min occuring hits have frequency < min_thresh_prime (2500 by default)
768+ // times, then collect the min occuring hits to get the mapping.
769+ // TODO: deal with code duplication below.
770+ size_t max_allowed_occ = 2500 ;
771+ if ((min_occ >= salmonOpts.maxReadOccs ) and (min_occ < max_allowed_occ)) {
772+ prev_read_pos = -1 ;
773+ max_allowed_occ = min_occ;
774+ for (auto & raw_hit : raw_hits) {
775+ auto & read_pos = raw_hit.first ;
776+ auto & proj_hits = raw_hit.second ;
777+ auto & refs = proj_hits.refRange ;
778+ uint64_t num_occ = static_cast <uint64_t >(refs.size ());
779+ min_occ = std::min (min_occ, num_occ);
780+ had_alt_max_occ = true ;
781+
782+ // SANITY
783+ if (read_pos <= prev_read_pos) {
784+ salmonOpts.jointLog ->warn (
785+ " read_pos : {}, prev_read_pos : {}" , read_pos,
786+ prev_read_pos);
787+ }
788+
789+ prev_read_pos = read_pos;
790+ if (num_occ <= max_allowed_occ) {
791+
792+ ++num_valid_hits;
793+ total_occs += num_occ;
794+ largest_occ =
795+ (num_occ > largest_occ) ? num_occ : largest_occ;
796+ float score_inc = 1.0 / num_occ;
797+ perfect_score += score_inc;
798+
799+ for (auto & pos_it : refs) {
800+ const auto & ref_pos_ori = proj_hits.decodeHit (pos_it);
801+ uint32_t tid = static_cast <uint32_t >(
802+ qidx->getRefId (pos_it.transcript_id ()));
803+ int32_t pos = static_cast <int32_t >(ref_pos_ori.pos );
804+ bool ori = ref_pos_ori.isFW ;
805+ if (ori) {
806+ hit_map[tid].add_fw (pos,
807+ static_cast <int32_t >(read_pos),
808+ max_stretch, score_inc);
809+ } else {
810+ hit_map[tid].add_rc (pos,
811+ static_cast <int32_t >(read_pos),
812+ max_stretch, score_inc);
813+ }
814+ } // DONE: for (auto &pos_it : refs)
815+ } // DONE : if (static_cast<uint64_t>(refs.size()) <
816+ // salmonOpts.maxReadOccs)
817+ } // DONE : for (auto& raw_hit : raw_hits)
818+ }
819+
732820 // float perfect_score = static_cast<float>(num_valid_hits) / total_occs;
733821 float acceptable_score = (num_valid_hits == 1 ) ? perfect_score :
734822 perfect_score - (1 .0f / largest_occ);
@@ -760,6 +848,8 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
760848 }
761849 }
762850
851+ alt_max_occ = had_alt_max_occ ? accepted_hits.size () : salmonOpts.maxReadOccs ;
852+
763853 /*
764854 if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) {
765855 for (auto& kv : hit_map) {
@@ -796,9 +886,8 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
796886 std::max (readLenLeft, readLenRight));
797887 }
798888
799-
800889 // If the read mapped to > maxReadOccs places, discard it
801- if (accepted_hits.size () > salmonOpts. maxReadOccs ) {
890+ if (accepted_hits.size () > alt_max_occ ) {
802891 accepted_hits.clear ();
803892 } else if (!accepted_hits.empty ()) {
804893 mapType = salmon::utils::MappingType::SINGLE_MAPPED;
@@ -814,9 +903,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
814903
815904 // bc
816905 // if we can fit the barcode into an integer
817- if ( alevinOpts. protocol . barcodeLength <= 32 ) {
906+ if ( barcodeLength <= 32 ) {
818907 if (barcode_ok) {
819- if (alevinOpts. protocol . barcodeLength <= 16 ) { // can use 32-bit int
908+ if ( barcodeLength <= 16 ) { // can use 32-bit int
820909 uint32_t shortbck = static_cast <uint32_t >(0x00000000FFFFFFFF & bck.word (0 ));
821910 bw << shortbck;
822911 } else { // must use 64-bit int
@@ -828,11 +917,11 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re
828917 }
829918
830919 // umi
831- if ( alevinOpts. protocol . barcodeLength <= 16 ) { // if we can use 32-bit int
920+ if ( umiLength <= 16 ) { // if we can use 32-bit int
832921 uint64_t umiint = jointHitGroup.umi ();
833922 uint32_t shortumi = static_cast <uint32_t >(0x00000000FFFFFFFF & umiint);
834923 bw << shortumi;
835- } else if ( alevinOpts. protocol . barcodeLength <= 32 ) { // if we can use 64-bit int
924+ } else if ( umiLength <= 32 ) { // if we can use 64-bit int
836925 uint64_t umiint = jointHitGroup.umi ();
837926 bw << umiint;
838927 } else { // must use string
@@ -1255,11 +1344,11 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea
12551344
12561345 // bc
12571346 // if we can if the barcode into an integer
1258- if ( alevinOpts. protocol . barcodeLength <= 32 ) {
1347+ if ( barcodeLength <= 32 ) {
12591348
12601349 if (barcode_ok) {
12611350 // alevinOpts.jointLog->info("BARCODE : {} \t ENC : {} ", barcode, bck.word(0));
1262- if (alevinOpts. protocol . barcodeLength <= 16 ) { // can use 32-bit int
1351+ if ( barcodeLength <= 16 ) { // can use 32-bit int
12631352 uint32_t shortbck = static_cast <uint32_t >(0x00000000FFFFFFFF & bck.word (0 ));
12641353 // alevinOpts.jointLog->info("shortbck : {} ", shortbck);
12651354 bw << shortbck;
@@ -1272,11 +1361,11 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea
12721361 }
12731362
12741363 // umi
1275- if ( alevinOpts. protocol . barcodeLength <= 16 ) { // if we can use 32-bit int
1364+ if ( umiLength <= 16 ) { // if we can use 32-bit int
12761365 uint64_t umiint = jointHitGroup.umi ();
12771366 uint32_t shortumi = static_cast <uint32_t >(0x00000000FFFFFFFF & umiint);
12781367 bw << shortumi;
1279- } else if ( alevinOpts. protocol . barcodeLength <= 32 ) { // if we can use 64-bit int
1368+ } else if ( umiLength <= 32 ) { // if we can use 64-bit int
12801369 uint64_t umiint = jointHitGroup.umi ();
12811370 bw << umiint;
12821371 } else { // must use string
0 commit comments