Skip to content

Commit 6e6e3c6

Browse files
Merge pull request #78 from smithlabcode/add-option-for-empty-input
Adding option to exit without error on empty input
2 parents e9777a9 + 9945aeb commit 6e6e3c6

File tree

2 files changed

+87
-45
lines changed

2 files changed

+87
-45
lines changed

src/Module.cpp

Lines changed: 54 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <cmath>
2020
#include <sstream>
2121
#include <cstdlib>
22+
#include <cassert>
2223

2324
using std::string;
2425
using std::vector;
@@ -37,6 +38,8 @@ using std::ostringstream;
3738
using std::istringstream;
3839
using std::getline;
3940

41+
template<typename T> using num_lim = std::numeric_limits<T>;
42+
4043
/*****************************************************************************/
4144
/******************* AUX FUNCTIONS *******************************************/
4245
/*****************************************************************************/
@@ -97,7 +100,7 @@ make_exponential_base_groups(vector<BaseGroup> &base_groups,
97100
/************* LINEAR BASE GROUP *************/
98101
// aux function to get linear interval
99102
size_t
100-
get_linear_interval(const size_t &num_bases) {
103+
get_linear_interval(const size_t num_bases) {
101104
// Take the first 9bp as individual residues since odd stuff
102105
// can happen there, then we find a grouping value which gives
103106
// us a total set of groups below 75. We limit the intervals
@@ -174,7 +177,8 @@ double get_corrected_count(size_t count_at_limit,
174177
size_t num_reads,
175178
size_t dup_level,
176179
size_t num_obs) {
177-
// See if we can bail out early
180+
// See if we can bail out early (ADS: can we know if num_reads <=
181+
// count_at_limit always holds?)
178182
if (count_at_limit == num_reads)
179183
return num_obs;
180184

@@ -210,7 +214,7 @@ double get_corrected_count(size_t count_at_limit,
210214

211215
// Now we can assume that the number we observed can be
212216
// scaled up by this proportion
213-
return num_obs/(1 - p_not_seeing);
217+
return num_obs/std::max(num_lim<double>::min(), 1.0 - p_not_seeing);
214218
}
215219

216220
// Function to calculate the deviation of a histogram with 100 bins from a
@@ -277,7 +281,8 @@ sum_deviation_from_normal(const array <double, 101> &gc_count,
277281
// centre of the model
278282
mode = first_mode;
279283
} else {
280-
mode /= mode_duplicates;
284+
// ADS: check if we need to avoid divide-by-zero here
285+
mode /= std::max(static_cast<size_t>(1), mode_duplicates);
281286
}
282287

283288
// We can now work out a theoretical distribution
@@ -286,7 +291,8 @@ sum_deviation_from_normal(const array <double, 101> &gc_count,
286291
stdev += (i - mode) * (i - mode) * gc_count[i];
287292
}
288293

289-
stdev = stdev / (total_count-1);
294+
// ADS: check if we need to avoid divide-by-zero here
295+
stdev = stdev / std::max(num_lim<double>::min(), total_count - 1.0);
290296
stdev = sqrt(stdev);
291297

292298
/******************* END COPIED FROM FASTQC **********************/
@@ -297,20 +303,24 @@ sum_deviation_from_normal(const array <double, 101> &gc_count,
297303
// ADS: lonely magic below; what is the 100?
298304
for (size_t i = 0; i <= 100; ++i) {
299305
z = i - mode;
306+
// ADS: check if we need to avoid divide-by-zero here
300307
theoretical[i] = exp(- (z*z)/ (2.0 * stdev *stdev));
301308
theoretical_sum += theoretical[i];
302309
}
303310

304311
// Normalize theoretical so it sums to the total of reads
305312
for (size_t i = 0; i <= 100; ++i) {
306-
theoretical[i] = theoretical[i] * total_count / theoretical_sum;
313+
// ADS: check if we need to avoid divide-by-zero here
314+
theoretical[i] = theoretical[i] * total_count /
315+
std::max(num_lim<double>::min(), theoretical_sum);
307316
}
308317

309318
for (size_t i = 0; i <= 100; ++i) {
310319
ans += fabs(gc_count[i] - theoretical[i]);
311320
}
312-
// Fractional deviation
313-
return 100.0 * ans / total_count;
321+
// Fractional deviation (ADS: check if we need to avoid
322+
// divide-by-zero here)
323+
return 100.0 * ans / std::max(num_lim<double>::min(), total_count);
314324
}
315325

316326
/***************************************************************/
@@ -446,15 +456,16 @@ ModuleBasicStatistics::summarize_module(FastqStats &stats) {
446456
total_bases += i * stats.long_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD];
447457
}
448458

449-
avg_read_length = total_bases / total_sequences;
459+
avg_read_length =
460+
total_bases / std::max(static_cast<size_t>(1), total_sequences);
450461

451462
// counts bases G and C in each base position
452463
avg_gc = 0;
453464

454465
// GC %
455466
// GS: TODO delete gc calculation during stream and do it using the total G
456467
// counts in all bases
457-
avg_gc = 100 * stats.total_gc / static_cast<double>(total_bases);
468+
avg_gc = 100 * stats.total_gc / std::max(1.0, static_cast<double>(total_bases));
458469

459470
}
460471

@@ -692,6 +703,7 @@ ModulePerBaseSequenceQuality::summarize_module(FastqStats &stats) {
692703
}
693704

694705
const size_t base_positions = base_groups[group].end - base_groups[group].start + 1;
706+
assert(base_positions != static_cast<size_t>(0));
695707
group_mean[group] = mean_group_sum / base_positions;
696708
group_ldecile[group] = static_cast<double>(ldecile_group_sum) / base_positions;
697709
group_lquartile[group] = static_cast<double>(lquartile_group_sum) / base_positions;
@@ -819,17 +831,19 @@ ModulePerTileSequenceQuality::summarize_module(FastqStats &stats) {
819831

820832
// Now transform sum into mean
821833
for (size_t i = 0; i < max_read_length; ++i)
822-
if (position_counts[i] > 0)
834+
if (position_counts[i] > 0.0)
823835
mean_in_base[i] = mean_in_base[i] / position_counts[i];
824836
else
825-
mean_in_base[i] = 0;
837+
mean_in_base[i] = 0.0;
826838

827839
for (auto &v : tile_position_quality) {
828840
const size_t lim = v.second.size();
829841
for (size_t i = 0; i < lim; ++i) {
830842
// transform sum of all qualities in mean
831-
const size_t count_at_pos =
832-
stats.tile_position_count.find(v.first)->second[i];
843+
const auto itr = stats.tile_position_count.find(v.first);
844+
if (itr == cend(stats.tile_position_count))
845+
throw runtime_error("failure ModulePerTileSequenceQuality::summarize_module");
846+
const size_t count_at_pos = itr->second[i];
833847

834848
if (count_at_pos > 0)
835849
v.second[i] = v.second[i] / count_at_pos;
@@ -882,6 +896,7 @@ ModulePerTileSequenceQuality::write_module(ostream &os) {
882896

883897
inline double
884898
round_quantile(const double val, const double num_quantiles) {
899+
// ADS: check if we need to worry about divide by zero here
885900
return static_cast<int>(val * num_quantiles) / num_quantiles;
886901
}
887902

@@ -937,6 +952,7 @@ ModulePerTileSequenceQuality::make_html_data() {
937952
// We will now discretize the quantiles so plotly understands
938953
// the color scheme
939954
static const double num_quantiles = 20.0;
955+
// ADS: not sure if we need to worry about divide by zero here?
940956
double mid_point = round_quantile(min_val/(min_val - max_val), num_quantiles);
941957

942958
// - 10: red
@@ -1054,7 +1070,7 @@ Module(ModulePerBaseSequenceContent::module_name) {
10541070
void
10551071
ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) {
10561072
double a_group, t_group, g_group, c_group, n_group;
1057-
double a_pos, t_pos, g_pos, c_pos, n_pos;
1073+
double a_pos{}, t_pos{}, g_pos{}, c_pos{}, n_pos{};
10581074
double total; //a+c+t+g+n
10591075
max_diff = 0.0;
10601076

@@ -1105,10 +1121,10 @@ ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) {
11051121

11061122
const double total_pos =
11071123
static_cast<double>(a_pos + c_pos + g_pos + t_pos + n_pos);
1108-
a_pos = 100.0 * a_pos / total_pos;
1109-
c_pos = 100.0 * c_pos / total_pos;
1110-
g_pos = 100.0 * g_pos / total_pos;
1111-
t_pos = 100.0 * t_pos / total_pos;
1124+
a_pos = 100.0 * a_pos / std::max(num_lim<double>::min(), total_pos);
1125+
c_pos = 100.0 * c_pos / std::max(num_lim<double>::min(), total_pos);
1126+
g_pos = 100.0 * g_pos / std::max(num_lim<double>::min(), total_pos);
1127+
t_pos = 100.0 * t_pos / std::max(num_lim<double>::min(), total_pos);
11121128

11131129
// for WGBS, we only test non-bisulfite treated bases
11141130
if (!is_reverse_complement)
@@ -1135,11 +1151,10 @@ ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) {
11351151

11361152
// turns above values to percent
11371153
total = static_cast<double>(a_group + c_group + t_group + g_group + n_group);
1138-
a_pct[group] = 100.0*a_group / total;
1139-
c_pct[group] = 100.0*c_group / total;
1140-
g_pct[group] = 100.0*g_group / total;
1141-
t_pct[group] = 100.0*t_group / total;
1142-
1154+
a_pct[group] = 100.0*a_group / std::max(num_lim<double>::min(), total);
1155+
c_pct[group] = 100.0*c_group / std::max(num_lim<double>::min(), total);
1156+
g_pct[group] = 100.0*g_group / std::max(num_lim<double>::min(), total);
1157+
t_pct[group] = 100.0*t_group / std::max(num_lim<double>::min(), total);
11431158
}
11441159
}
11451160

@@ -1395,12 +1410,14 @@ ModulePerBaseNContent::summarize_module(FastqStats &stats) {
13951410

13961411
this_n_total = (i < FastqStats::SHORT_READ_THRESHOLD) ? (stats.cumulative_read_length_freq[i]) :
13971412
(stats.long_cumulative_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD]);
1398-
this_n_pct = this_n_cnt / static_cast<double>(this_n_total);
1413+
this_n_pct = this_n_cnt / std::max(num_lim<double>::min(),
1414+
static_cast<double>(this_n_total));
13991415
max_n_pct = max(max_n_pct, this_n_pct);
14001416
group_n_cnt += this_n_cnt;
14011417
group_n_total += this_n_total;
14021418
}
1403-
n_pct[group] = 100.0*group_n_cnt / static_cast<double>(group_n_total);
1419+
n_pct[group] = 100.0*group_n_cnt / std::max(num_lim<double>::min(),
1420+
static_cast<double>(group_n_total));
14041421
}
14051422
}
14061423

@@ -1627,15 +1644,15 @@ ModuleSequenceDuplicationLevels::summarize_module(FastqStats &stats) {
16271644
}
16281645

16291646
// "Sequence duplication estimate" in the summary
1630-
total_deduplicated_pct = 100.0 * seq_dedup / seq_total;
1647+
total_deduplicated_pct = 100.0 * seq_dedup / std::max(1.0, seq_total);
16311648

16321649
// Convert to percentage
16331650
for (auto &v : percentage_deduplicated)
1634-
v = 100.0 * v / seq_dedup; // Percentage of unique sequences in bin
1651+
v = 100.0 * v / std::max(1.0, seq_dedup); // Percentage of unique sequences in bin
16351652

16361653
// Convert to percentage
16371654
for (auto &v : percentage_total)
1638-
v = 100.0 * v / seq_total; // Percentage of sequences in bin
1655+
v = 100.0 * v / std::max(1.0, seq_total); // Percentage of sequences in bin
16391656
}
16401657

16411658
void
@@ -1796,7 +1813,7 @@ ModuleOverrepresentedSequences::make_grade() {
17961813
// implement pass warn fail for overrep sequences
17971814
if (grade != "fail") {
17981815
// get percentage that overrep reads represent
1799-
double pct = 100.0 * seq.second / num_reads;
1816+
double pct = 100.0 * seq.second / std::max(static_cast<size_t>(1), num_reads);
18001817
if (pct > grade_error) {
18011818
grade = "fail";
18021819
}
@@ -1813,7 +1830,7 @@ ModuleOverrepresentedSequences::write_module(ostream &os) {
18131830
os << "#Sequence\tCount\tPercentage\tPossible Source\n";
18141831
for (auto seq : overrep_sequences) {
18151832
os << seq.first << "\t" << seq.second << "\t" <<
1816-
100.0 * seq.second / num_reads << "\t"
1833+
100.0 * seq.second / std::max(static_cast<size_t>(1), num_reads) << "\t"
18171834
<< get_matching_contaminant(seq.first) << "\n";
18181835
}
18191836
}
@@ -1836,7 +1853,7 @@ ModuleOverrepresentedSequences::make_html_data() {
18361853
for (auto v : overrep_sequences) {
18371854
data << "<tr><td>" << v.first << "</td>";
18381855
data << "<td>" << v.second << "</td>";
1839-
data << "<td>" << 100.0 * v.second / num_reads << "</td>";
1856+
data << "<td>" << 100.0 * v.second / std::max(static_cast<size_t>(1), num_reads) << "</td>";
18401857
data << "<td>" << get_matching_contaminant(v.first)
18411858
<< "</td>";
18421859
data << "</tr>";
@@ -1907,7 +1924,8 @@ ModuleAdapterContent::summarize_module(FastqStats &stats) {
19071924
for (size_t i = 0; i < adapter_pos_pct.size(); ++i) {
19081925
for (size_t j = 0; j < adapter_pos_pct[0].size(); ++j) {
19091926
adapter_pos_pct[i][j] *= 100.0;
1910-
adapter_pos_pct[i][j] /= static_cast<double>(stats.num_reads);
1927+
adapter_pos_pct[i][j] /= std::max(num_lim<double>::min(),
1928+
static_cast<double>(stats.num_reads));
19111929
}
19121930
}
19131931
}
@@ -2077,7 +2095,8 @@ ModuleKmerContent::summarize_module(FastqStats &stats) {
20772095
observed_count =
20782096
stats.kmer_count[(i << Constants::bit_shift_kmer) | kmer];
20792097

2080-
expected_count = pos_kmer_count[i] / dividend;
2098+
expected_count = pos_kmer_count[i] / std::max(num_lim<double>::min(), dividend);
2099+
// ADS: below, denom can't be zero if not above?
20812100
obs_exp_ratio = (expected_count > 0) ? (observed_count / expected_count) : 0;
20822101

20832102
if (i == 0 || obs_exp_ratio > obs_exp_max[kmer]) {
@@ -2146,7 +2165,7 @@ ModuleKmerContent::make_html_data() {
21462165

21472166
for (size_t i = 0; i < lim; ++i) {
21482167
const size_t kmer = kmers_to_report[i].first;
2149-
const double log_obs_exp = log(kmers_to_report[i].second)/log(2);
2168+
const double log_obs_exp = log(kmers_to_report[i].second)/log(2.0);
21502169
if (!seen_first)
21512170
seen_first = true;
21522171
else

src/falco.cpp

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
*/
1717

1818
#include <chrono>
19+
#include <filesystem>
1920
#include <fstream>
2021

2122
#include "FalcoConfig.hpp"
@@ -37,6 +38,8 @@ using std::string;
3738
using std::to_string;
3839
using std::vector;
3940

41+
namespace fs = std::filesystem;
42+
4043
using std::chrono::duration_cast;
4144
using std::chrono::system_clock;
4245
using time_point = std::chrono::time_point<std::chrono::system_clock>;
@@ -61,12 +64,7 @@ log_process(const string &s) {
6164
// Function to check existence of directory
6265
static bool
6366
dir_exists(const string &path) {
64-
struct stat info;
65-
if (stat(path.c_str(), &info) != 0)
66-
return false;
67-
else if (info.st_mode & S_IFDIR)
68-
return true;
69-
return false;
67+
return fs::exists(path) && fs::is_directory(path);
7068
}
7169

7270
// Read any file type until the end and logs progress
@@ -75,7 +73,7 @@ template <typename T>
7573
void
7674
read_stream_into_stats(T &in, FastqStats &stats, FalcoConfig &falco_config) {
7775
// open file
78-
size_t file_size = in.load();
76+
size_t file_size = std::max(in.load(), static_cast<size_t>(1));
7977
size_t tot_bytes_read = 0;
8078

8179
// Read record by record
@@ -90,11 +88,10 @@ read_stream_into_stats(T &in, FastqStats &stats, FalcoConfig &falco_config) {
9088

9189
// if I could not get tile information from read names, I need to tell this to
9290
// config so it does not output tile data on the summary or html
93-
if (in.tile_ignore) {
91+
if (in.tile_ignore)
9492
falco_config.do_tile = false;
95-
}
9693

97-
if (tot_bytes_read < file_size && !quiet)
94+
if (!quiet && tot_bytes_read < file_size)
9895
progress.report(cerr, file_size);
9996
}
10097

@@ -293,6 +290,7 @@ main(int argc, const char **argv) {
293290
bool skip_html = false;
294291
bool skip_short_summary = false;
295292
bool do_call = false;
293+
bool allow_empty_input = false;
296294

297295
// a tmp boolean to keep compatibility with FastQC
298296
bool tmp_compatibility_only = false;
@@ -542,6 +540,14 @@ main(int argc, const char **argv) {
542540
" in programs that are very strict about the "
543541
" FastQC output format).",
544542
false, do_call);
543+
544+
opt_parse.add_opt(
545+
"allow-empty-input", '\0',
546+
"[Falco only] allow empty input files and generate empty output files "
547+
"without en error state. WARNING: using this option can mask problems in "
548+
"other parts of a workflow.",
549+
false, allow_empty_input);
550+
545551
vector<string> leftover_args;
546552
opt_parse.parse(argc, argv, leftover_args);
547553
if (argc == 1 || opt_parse.help_requested()) {
@@ -578,6 +584,23 @@ main(int argc, const char **argv) {
578584
return EXIT_FAILURE;
579585
}
580586

587+
// ADS: make sure all input files are non-empty unless user oks it
588+
if (!allow_empty_input) {
589+
for (const auto &fn : leftover_args) {
590+
std::error_code ec;
591+
const bool empty_file = std::filesystem::is_empty(fn, ec);
592+
if (ec) {
593+
cerr << "Error reading file: " << fn << " (" << ec.message() << ")"
594+
<< endl;
595+
return EXIT_FAILURE;
596+
}
597+
else if (empty_file) {
598+
cerr << "Input file is empty: " << fn << endl;
599+
return EXIT_FAILURE;
600+
}
601+
}
602+
}
603+
581604
if (!outdir.empty()) {
582605
if (!summary_filename.empty())
583606
cerr << "[WARNING] specifying custom output directory but also "

0 commit comments

Comments
 (0)