diff --git a/README.md b/README.md index 6dec1cc..1642638 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,6 @@ Seqtk Examples seqtk trimfq -b 5 -e 10 in.fa > out.fa +* Keep the 50bp from right end of each read by trimming the rest and if the trimmed read length ends up having less the 20bp then the first 20 bp should be kept only: + + seqtk trimfq -E 50 -s 20 in.fq > out.fq diff --git a/seqtk.c b/seqtk.c index 8fdd32e..587abc1 100644 --- a/seqtk.c +++ b/seqtk.c @@ -277,29 +277,33 @@ krint64_t kr_rand(krand_t *kr) /* quality based trimming with Mott's algorithm */ int stk_trimfq(int argc, char *argv[]) -{ // FIXME: when a record with zero length will always be treated as a fasta record +{ gzFile fp; kseq_t *seq; double param = 0.05, q_int2real[128]; - int i, c, min_len = 30, left = 0, right = 0, fixed_len = -1; - while ((c = getopt(argc, argv, "l:q:b:e:L:")) >= 0) { + int i, c, min_len = 30, shortest_len = 1, left = 0, right = 0, left_keep = 0, right_keep = 0; + while ((c = getopt(argc, argv, "l:s:q:b:e:B:E:")) >= 0) { switch (c) { case 'q': param = atof(optarg); break; case 'l': min_len = atoi(optarg); break; + case 's': shortest_len = atoi(optarg); break; case 'b': left = atoi(optarg); break; case 'e': right = atoi(optarg); break; - case 'L': fixed_len = atoi(optarg); break; + case 'B': left_keep = atoi(optarg); break; + case 'E': right_keep = atoi(optarg); break; } } if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk trimfq [options] \n\n"); - fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e) [%.2f]\n", param); - fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e) [%d]\n", min_len); - fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -L INT retain at most INT bp from the 5'-end (non-zero to disable -q/-l) [0]\n"); - fprintf(stderr, " -Q force FASTQ output\n"); + fprintf(stderr, "Options: -q FLOAT error rate threshold (disabled by -b/-e/-B/-E) [%.2f]\n", param); + fprintf(stderr, " -l INT maximally trim down to INT bp (disabled by -b/-e/-B/-E) [%d]\n", min_len); + fprintf(stderr, " -s INT trimming by -b/-e/-B/-E shall not produce reads shorter then INT bp [%d]\n", shortest_len); + fprintf(stderr, " -b INT trim INT bp from left (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -e INT trim INT bp from right (non-zero to disable -q) [0]\n"); + fprintf(stderr, " -B INT keep first INT bp from left (non-zero to disable -q/-e/-E) [%d]\n", left_keep); + fprintf(stderr, " -E INT keep last INT bp from right (non-zero to disable -q/-b/-B) [%d]\n", right_keep); +// fprintf(stderr, " -Q force FASTQ output\n"); fprintf(stderr, "\n"); return 1; } @@ -314,11 +318,34 @@ int stk_trimfq(int argc, char *argv[]) while (kseq_read(seq) >= 0) { int beg, tmp, end; double s, max; - if (left || right || fixed_len > 0) { + if (left_keep) { + beg = left; end = left + left_keep; + if (seq->seq.l < end) end = seq->seq.l; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (right_keep) { + beg = seq->seq.l - right_keep - right; end = seq->seq.l - right; + if (beg < 0) beg = 0; + if (end < 0) end = 0; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (left || right) { beg = left; end = seq->seq.l - right; - if (beg >= end) beg = end = 0; - if (fixed_len > 0 && end - beg > fixed_len) end = beg + fixed_len; - } else if (seq->qual.l > min_len) { + if (end < 0) end = 0; + if (seq->seq.l < beg) beg = seq->seq.l; + if (end - beg < shortest_len) { + beg = 0; + end = shortest_len; + if (end > seq->seq.l) end = seq->seq.l; + } + } else if (seq->qual.l > min_len && param != 0.) { for (i = 0, beg = tmp = 0, end = seq->qual.l, s = max = 0.; i < seq->qual.l; ++i) { int q = seq->qual.s[i]; if (q < 36) q = 36; @@ -548,11 +575,13 @@ int stk_subseq(int argc, char *argv[]) khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; - int l, i, j, c, is_tab = 0, line = 0; + int l, i, j, c, is_tab = 0, line = 0, is_exclude = 0; + reglist_t dummy; khint_t k; - while ((c = getopt(argc, argv, "tl:")) >= 0) { + while ((c = getopt(argc, argv, "tel:")) >= 0) { switch (c) { case 't': is_tab = 1; break; + case 'e': is_exclude = 1; break; case 'l': line = atoi(optarg); break; } } @@ -560,6 +589,7 @@ int stk_subseq(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk subseq [options] |\n\n"); fprintf(stderr, "Options: -t TAB delimited output\n"); + fprintf(stderr, " -e exclusion instead of inclusion for sequences from \n"); fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); return 1; @@ -575,12 +605,19 @@ int stk_subseq(int argc, char *argv[]) fprintf(stderr, "[E::%s] failed to open the input file/stream\n", __func__); return 1; } + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) continue; - p = &kh_val(h, k); + if (is_exclude == 0) { + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + } else { + if (k != kh_end(h)) continue; + p = &dummy; + dummy.a[0] = INT_MAX; + } for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (beg >= seq->seq.l) { @@ -1664,7 +1701,7 @@ static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk \n"); - fprintf(stderr, "Version: 1.2-r101-dirty\n\n"); + fprintf(stderr, "Version: 1.2-r101c-dirty\n\n"); fprintf(stderr, "Command: seq common transformation of FASTA/Q\n"); fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n"); fprintf(stderr, " sample subsample sequences\n");