From 0240e9049f2f93023a1fe28e92ee0cf0f0c74617 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 20 Mar 2026 18:43:16 -0700 Subject: [PATCH 1/5] src/StreamReader.cpp: stats.total_bases incremented by read_pos within StreamReader::postprocess_sequence_line to get total bases from input --- src/StreamReader.cpp | 207 +++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 105 deletions(-) diff --git a/src/StreamReader.cpp b/src/StreamReader.cpp index fb1f769..40b5ffc 100644 --- a/src/StreamReader.cpp +++ b/src/StreamReader.cpp @@ -30,20 +30,21 @@ min8(const T a, const T b) { /****************************************************/ /***************** STREAMREADER *********************/ /****************************************************/ -size_t +std::size_t get_tile_split_position(FalcoConfig &config) { const std::string &filename = config.filename; // Count colons to know the formatting pattern - size_t num_colon = 0; + std::size_t num_colon{}; if (config.is_sam) { std::ifstream sam_file(filename); if (!sam_file) throw std::runtime_error("cannot load sam file : " + filename); std::string line; - while (std::getline(sam_file, line) && line.size() > 0 && line[0] == '@') + while (std::getline(sam_file, line) && std::size(line) > 0 && + line[0] == '@') continue; - size_t tabPos = line.find('\t'); + std::size_t tabPos = line.find('\t'); line = line.substr(0, tabPos); for (char c : line) num_colon += (c == ':'); @@ -113,26 +114,24 @@ get_tile_split_position(FalcoConfig &config) { return 0; // no tile information on read name } -// function to turn a vector into array for adapter hashes and fast lookup -std::array -make_adapters(const std::vector &adapter_hashes) { - if (adapter_hashes.size() > Constants::max_adapters) - throw std::runtime_error( - "Number of adapters is larger than 128, which hinders " - "visualziation and speed of falco. Please keep it to " - "under 128"); - - std::array ans; - for (size_t i = 0; i < adapter_hashes.size(); ++i) - ans[i] = adapter_hashes[i]; - +// function to turn a vector into array for adapter hashes and faster lookup +std::array +make_adapters(const std::vector &adapter_hashes) { + static constexpr auto error_message = + "Number of adapters is larger than 128, which hinders visualziation and " + "speed of falco. Please keep it to under 128"; + if (std::size(adapter_hashes) > Constants::max_adapters) + throw std::runtime_error(error_message); + std::array ans; + std::copy(std::cbegin(adapter_hashes), std::cend(adapter_hashes), + std::begin(ans)); return ans; } -StreamReader::StreamReader(FalcoConfig &config, const size_t _buffer_size, +StreamReader::StreamReader(FalcoConfig &config, const std::size_t _buffer_size, const char _field_separator, const char _line_separator) : - // I have to pass the config skips as const to read them fast + // must pass the config skips as const to read them fast do_sequence_hash(config.do_duplication || config.do_overrepresented), do_kmer(config.do_kmer), do_adapter(config.do_adapter), do_adapter_optimized(config.do_adapter_optimized), @@ -152,7 +151,8 @@ StreamReader::StreamReader(FalcoConfig &config, const size_t _buffer_size, do_adapters_slow(config.do_adapter && !config.do_adapter_optimized), adapter_seqs(config.adapter_seqs), - num_adapters(config.adapter_hashes.size()), adapter_size(config.adapter_size), + num_adapters(std::size(config.adapter_hashes)), + adapter_size(config.adapter_size), // for case size == 32 expr (1ull << 64) -1 gives 0. // We need to set mask as all 64 bits 1 => use SIZE_MAX in this case adapter_mask(adapter_size == 32 ? SIZE_MAX @@ -193,7 +193,7 @@ StreamReader::put_base_in_buffer() { buffer[read_pos] = base_from_buffer; } else { - if (leftover_ind == leftover_buffer.size()) + if (leftover_ind == std::size(leftover_buffer)) leftover_buffer.push_back(base_from_buffer); else leftover_buffer[leftover_ind] = base_from_buffer; @@ -241,7 +241,7 @@ StreamReader::read_fast_forward_line_eof() { void StreamReader::get_tile_value() { tile_cur = 0; - size_t num_colon = 0; + std::size_t num_colon = 0; for (; *cur_char != field_separator; ++cur_char) { num_colon += (*cur_char == ':'); if (num_colon == tile_split_point) { @@ -289,7 +289,7 @@ StreamReader::read_tile_line(FastqStats &stats) { std::vector(stats.max_read_length, 0.0); // stats.tile_position_quality.find(tile_cur)->second[0] = 0; stats.tile_position_count[tile_cur] = - std::vector(stats.max_read_length, 0); + std::vector(stats.max_read_length, 0); } } @@ -301,48 +301,44 @@ StreamReader::read_tile_line(FastqStats &stats) { // optimized at all times void StreamReader::process_sequence_base_from_buffer(FastqStats &stats) { - // I will count the Ns even if asked to ignore, as checking ifs take time + // count Ns even if asked not to report them if (base_from_buffer == 'N') { ++stats.n_base_count[read_pos]; num_bases_after_n = 1; // start over the current kmer + return; } + const auto two_bit = actg_to_2bit(base_from_buffer); + // ATGC bases - else { - // increments basic statistic counts - cur_gc_count += (actg_to_2bit(base_from_buffer) & 1); - ++stats.base_count[(read_pos << Constants::bit_shift_base) | - actg_to_2bit(base_from_buffer)]; - - if (do_sliding_window) { - // Update k-mer sequence - cur_kmer = ((cur_kmer << Constants::bit_shift_base) | - actg_to_2bit(base_from_buffer)); - - // registers k-mer if seen at least k nucleotides since the last n - if (do_kmer && do_kmer_read && - (num_bases_after_n >= Constants::kmer_size)) { - - ++stats.kmer_count[(read_pos << Constants::bit_shift_kmer) | - (cur_kmer & Constants::kmer_mask)]; - ++stats.pos_kmer_count[read_pos]; - } + // increments basic statistic counts + cur_gc_count += (two_bit & 1); + ++stats.base_count[(read_pos << Constants::bit_shift_base) | two_bit]; + + if (do_sliding_window) { + // Update k-mer sequence + cur_kmer = (cur_kmer << Constants::bit_shift_base) | two_bit; + // registers k-mer if seen at least k nucleotides since the last n + if (do_kmer && do_kmer_read && + (num_bases_after_n >= Constants::kmer_size)) { + + ++stats.kmer_count[(read_pos << Constants::bit_shift_kmer) | + (cur_kmer & Constants::kmer_mask)]; + ++stats.pos_kmer_count[read_pos]; + } - // GS: slow, need to use fsm - if (do_adapter_optimized && (num_bases_after_n == adapter_size)) { - cur_kmer &= adapter_mask; - for (size_t i = 0; i != num_adapters; ++i) { - if (cur_kmer == adapters[i] && !adapters_found[i]) { - ++stats - .pos_adapter_count[(read_pos << Constants::bit_shift_adapter) | - i]; - adapters_found[i] = true; - } + // GS: slow, need to use fsm + if (do_adapter_optimized && (num_bases_after_n == adapter_size)) { + cur_kmer &= adapter_mask; + for (std::size_t i = 0; i != num_adapters; ++i) { + if (cur_kmer == adapters[i] && !adapters_found[i]) { + ++stats.pos_adapter_count[(read_pos << Constants::bit_shift_adapter) | + i]; + adapters_found[i] = true; } } - - num_bases_after_n += (num_bases_after_n != adapter_size); } + num_bases_after_n += (num_bases_after_n != adapter_size); } } @@ -353,17 +349,15 @@ StreamReader::process_sequence_base_from_leftover(FastqStats &stats) { if (base_from_buffer == 'N') { ++stats.long_n_base_count[leftover_ind]; num_bases_after_n = 1; // start over the current kmer + return; } - // ATGC bases - else { - // increments basic statistic counts - cur_gc_count += (actg_to_2bit(base_from_buffer) & 1); - ++stats.long_base_count[(leftover_ind << Constants::bit_shift_base) | - actg_to_2bit(base_from_buffer)]; - - // WE WILL NOT DO KMER STATS OUTSIDE OF BUFFER - } + // increments basic statistic counts + const auto two_bit = actg_to_2bit(base_from_buffer); + cur_gc_count += (two_bit & 1); + const auto idx = (leftover_ind << Constants::bit_shift_base) | two_bit; + ++stats.long_base_count[idx]; + // WE WILL NOT DO KMER STATS OUTSIDE OF BUFFER } // Gets statistics after reading the entire sequence line @@ -385,7 +379,8 @@ StreamReader::postprocess_sequence_line(FastqStats &stats) { // Updates maximum read length if applicable stats.max_read_length = - ((read_pos > stats.max_read_length) ? (read_pos) : (stats.max_read_length)); + read_pos > stats.max_read_length ? read_pos : stats.max_read_length; + stats.total_bases += read_pos; // FastQC's gc model summarized, if requested if (do_gc_sequence && read_pos != 0) { @@ -429,8 +424,8 @@ StreamReader::read_sequence_line(FastqStats &stats) { if (do_adapters_slow) { const std::string seq_line_str = cur_char; - for (size_t i = 0; i != num_adapters; ++i) { - const size_t adapt_index = seq_line_str.find(adapter_seqs[i], 0); + for (std::size_t i = 0; i != num_adapters; ++i) { + const std::size_t adapt_index = seq_line_str.find(adapter_seqs[i], 0); if (adapt_index < stats.SHORT_READ_THRESHOLD) { ++stats.pos_adapter_count[((adapt_index + adapter_seqs[i].length() - 1) << Constants::bit_shift_adapter) | @@ -498,7 +493,7 @@ StreamReader::process_quality_base_from_buffer(FastqStats &stats) { // Tile processing if (!tile_ignore && do_tile_read && tile_cur != 0) { // allocate more base space if necessary - if (stats.tile_position_quality[tile_cur].size() == read_pos) { + if (std::size(stats.tile_position_quality[tile_cur]) == read_pos) { stats.tile_position_quality[tile_cur].push_back(0.0); stats.tile_position_count[tile_cur].push_back(0); } @@ -591,8 +586,8 @@ StreamReader::read_quality_line(FastqStats &stats) { /*******************************************************/ /*************** THIS IS VERY SLOW ********************/ // if reads are >75pb, truncate to 50 akin to FastQC -inline size_t -get_truncate_point(const size_t read_pos) { +inline std::size_t +get_truncate_point(const std::size_t read_pos) { return (read_pos <= Constants::unique_reads_max_length) ? read_pos : Constants::unique_reads_truncate; @@ -630,7 +625,7 @@ StreamReader::postprocess_fastq_record(FastqStats &stats) { } inline bool -StreamReader::check_bytes_read(const size_t read_num) { +StreamReader::check_bytes_read(const std::size_t read_num) { return ((read_num & check_bytes_read_mask) == 0); } @@ -639,7 +634,7 @@ StreamReader::check_bytes_read(const size_t read_num) { /*******************************************************/ char get_line_separator(const std::string &filename) { - FILE *fp = fopen(filename.c_str(), "r"); + FILE *fp = fopen(std::data(filename), "r"); if (fp == NULL) throw std::runtime_error("bad input file: " + filename); @@ -655,29 +650,29 @@ get_line_separator(const std::string &filename) { } // Set fastq field_separator as line_separator -FastqReader::FastqReader(FalcoConfig &_config, const size_t _buffer_size) : +FastqReader::FastqReader(FalcoConfig &_config, const std::size_t _buffer_size) : StreamReader(_config, _buffer_size, get_line_separator(_config.filename), get_line_separator(_config.filename)) { filebuf = new char[RESERVE_SIZE]; } -size_t +std::size_t get_file_size(const std::string &filename) { - FILE *fp = fopen(filename.c_str(), "r"); + FILE *fp = fopen(std::data(filename), "r"); if (fp == NULL) throw std::runtime_error("bad input file: " + filename); fseek(fp, 0L, SEEK_END); - const size_t ret = static_cast(ftell(fp)); + const std::size_t ret = static_cast(ftell(fp)); fclose(fp); return ret; } // Load fastq with zlib -size_t +std::size_t FastqReader::load() { - fileobj = fopen(filename.c_str(), "r"); + fileobj = fopen(std::data(filename), "r"); if (fileobj == NULL) throw std::runtime_error("Cannot open FASTQ file : " + filename); return get_file_size(filename); @@ -696,7 +691,7 @@ FastqReader::~FastqReader() { // Parses fastq gz by reading line by line into the gzbuf bool -FastqReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { +FastqReader::read_entry(FastqStats &stats, std::size_t &num_bytes_read) { cur_char = fgets(filebuf, RESERVE_SIZE, fileobj); // need to check here if we did not hit eof @@ -710,6 +705,7 @@ FastqReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { cur_char = fgets(filebuf, RESERVE_SIZE, fileobj); read_sequence_line(stats); + skip_separator(); cur_char = fgets(filebuf, RESERVE_SIZE, fileobj); @@ -731,6 +727,7 @@ FastqReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { // Returns if file should keep being checked if (check_bytes_read(stats.num_reads)) num_bytes_read = ftell(fileobj); + return (!is_eof() && cur_char != 0); } @@ -738,15 +735,16 @@ FastqReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { /*************** READ FASTQ GZ RCORD *******************/ /*******************************************************/ // the gz fastq constructor is the same as the fastq -GzFastqReader::GzFastqReader(FalcoConfig &_config, const size_t _buffer_size) : +GzFastqReader::GzFastqReader(FalcoConfig &_config, + const std::size_t _buffer_size) : StreamReader(_config, _buffer_size, '\n', '\n') { gzbuf = new char[RESERVE_SIZE]; } // Load fastq with zlib -size_t +std::size_t GzFastqReader::load() { - fileobj = gzopen(filename.c_str(), "r"); + fileobj = gzopen(std::data(filename), "r"); if (fileobj == Z_NULL) throw std::runtime_error("Cannot open gzip FASTQ file : " + filename); @@ -766,7 +764,7 @@ GzFastqReader::~GzFastqReader() { // Parses fastq gz by reading line by line into the gzbuf bool -GzFastqReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { +GzFastqReader::read_entry(FastqStats &stats, std::size_t &num_bytes_read) { cur_char = gzgets(fileobj, gzbuf, RESERVE_SIZE); // need to check here if we did not hit eof @@ -808,15 +806,15 @@ GzFastqReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { /*************** READ SAM RECORD ***********************/ /*******************************************************/ // set sam separator as tab -SamReader::SamReader(FalcoConfig &_config, const size_t _buffer_size) : +SamReader::SamReader(FalcoConfig &_config, const std::size_t _buffer_size) : StreamReader(_config, _buffer_size, '\t', get_line_separator(_config.filename)) { filebuf = new char[RESERVE_SIZE]; } -size_t +std::size_t SamReader::load() { - fileobj = fopen(filename.c_str(), "r"); + fileobj = fopen(std::data(filename), "r"); if (fileobj == NULL) throw std::runtime_error("Cannot open SAM file : " + filename); @@ -835,7 +833,7 @@ SamReader::is_eof() { } bool -SamReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { +SamReader::read_entry(FastqStats &stats, std::size_t &num_bytes_read) { cur_char = fgets(filebuf, RESERVE_SIZE, fileobj); if (is_eof()) @@ -845,7 +843,7 @@ SamReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { read_tile_line(stats); skip_separator(); - for (size_t i = 0; i < 8; ++i) { + for (std::size_t i = 0; i < 8; ++i) { read_fast_forward_line(); skip_separator(); } @@ -881,13 +879,13 @@ SamReader::~SamReader() { // puts base either on buffer or leftover void -BamReader::put_base_in_buffer(const size_t pos) { +BamReader::put_base_in_buffer(const std::size_t pos) { base_from_buffer = seq_nt16_str[bam_seqi(cur_char, pos)]; if (still_in_buffer) { buffer[read_pos] = base_from_buffer; } else { - if (leftover_ind == leftover_buffer.size()) + if (leftover_ind == std::size(leftover_buffer)) leftover_buffer.push_back(base_from_buffer); else leftover_buffer[leftover_ind] = base_from_buffer; @@ -910,15 +908,15 @@ BamReader::read_sequence_line(FastqStats &stats) { do_kmer_read = (stats.num_reads == next_kmer_read); adapters_found.reset(); - const size_t seq_len = b->core.l_qseq; + const std::size_t seq_len = b->core.l_qseq; // MN: TODO: make sure everything works in this scope if (do_adapters_slow) { std::string seq_line_str(seq_len, '\0'); - for (size_t i = 0; i < seq_len; i++) { + for (std::size_t i = 0; i < seq_len; i++) { seq_line_str[i] = seq_nt16_str[bam_seqi(cur_char, i)]; } - for (size_t i = 0; i != num_adapters; ++i) { - const size_t adapt_index = seq_line_str.find(adapter_seqs[i], 0); + for (std::size_t i = 0; i != num_adapters; ++i) { + const std::size_t adapt_index = seq_line_str.find(adapter_seqs[i], 0); if (adapt_index < stats.SHORT_READ_THRESHOLD) { ++stats.pos_adapter_count[((adapt_index + adapter_seqs[i].length() - 1) << Constants::bit_shift_adapter) | @@ -933,7 +931,7 @@ BamReader::read_sequence_line(FastqStats &stats) { // In the following loop, cur_char does not change, but rather i changes // and we access bases using bam_seqi(cur_char, i) in // put_base_in_buffer. - for (size_t i = 0; i < seq_len; i++, ++read_pos) { + for (std::size_t i = 0; i < seq_len; i++, ++read_pos) { // if we reached the buffer size, stop using it and start using leftover if (read_pos == buffer_size) { still_in_buffer = false; @@ -988,8 +986,8 @@ BamReader::read_quality_line(FastqStats &stats) { cur_quality = 0; still_in_buffer = true; - const size_t seq_len = b->core.l_qseq; - for (size_t i = 0; i < seq_len; ++cur_char, i++) { + const std::size_t seq_len = b->core.l_qseq; + for (std::size_t i = 0; i < seq_len; ++cur_char, i++) { if (read_pos == buffer_size) { still_in_buffer = false; @@ -1117,14 +1115,14 @@ reverse_quality_scores(bam1_t *aln) { } // set sam separator as tab -BamReader::BamReader(FalcoConfig &_config, const size_t _buffer_size) : +BamReader::BamReader(FalcoConfig &_config, const std::size_t _buffer_size) : StreamReader(_config, _buffer_size, '\t', '\n') { rd_ret = 0; } -size_t +std::size_t BamReader::load() { - if (!(hts = hts_open(filename.c_str(), "r"))) + if (!(hts = hts_open(std::data(filename), "r"))) throw std::runtime_error("cannot load bam file : " + filename); if (!(hdr = sam_hdr_read(hts))) @@ -1144,30 +1142,29 @@ BamReader::is_eof() { } bool -BamReader::read_entry(FastqStats &stats, size_t &num_bytes_read) { +BamReader::read_entry(FastqStats &stats, std::size_t &num_bytes_read) { static const uint16_t not_reverse = ~BAM_FREVERSE; if ((rd_ret = sam_read1(hts, hdr, b)) >= 0) { - if (bam_is_rev(b)) { revcomp_seq_by_byte(b); reverse_quality_scores(b); b->core.flag &= not_reverse; } - num_bytes_read = 0; do_read = (stats.num_reads == next_read); // Read tile line cur_char = bam_get_qname(b); last = cur_char + b->m_data; - const size_t first_padding_null = b->core.l_qname - b->core.l_extranul - 1; + const std::size_t first_padding_null = + b->core.l_qname - b->core.l_extranul - 1; // Turn "QUERYNAME\0\0\0" into "QUERYNAME\t\0\0" (assuming // field_separtor = '\t') to be compatible with read_fast_forward_line(). cur_char[first_padding_null] = field_separator; read_tile_line(stats); // Read sequence line - size_t seq_len = b->core.l_qseq; + std::size_t seq_len = b->core.l_qseq; cur_char = reinterpret_cast(bam_get_seq(b)); BamReader::read_sequence_line(stats); From 77c315d986b9f347e8a564a333444187cac2673e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 20 Mar 2026 18:44:48 -0700 Subject: [PATCH 2/5] src/Module.hpp: added total_bases to the ModuleBasicStatistics --- src/Module.hpp | 293 +++++++++++++++++++++++-------------------------- 1 file changed, 135 insertions(+), 158 deletions(-) diff --git a/src/Module.hpp b/src/Module.hpp index 331d50e..f80ed16 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -28,17 +28,15 @@ /* base groups for longer reads, copied from FastQC*/ struct BaseGroup { - size_t start, end; - BaseGroup(size_t _start, size_t _end) : start(_start), end(_end) {} + std::size_t start{}; + std::size_t end{}; }; -class Module { -private: - const std::string module_name; +struct Module { + const std::string module_name{}; -public: // avoid writing things prior to summarizing - bool summarized; + bool summarized{}; // The module name displayed in outputs and html // GS TODO: automate placing it in html too @@ -67,9 +65,7 @@ class Module { Module(const std::string &module_name); virtual ~Module() = 0; - /*********************************************/ - /*****Abstract functions to be implemented****/ - /*********************************************/ + // functions to be in child classes // Summarize the module virtual void @@ -85,9 +81,6 @@ class Module { virtual std::string make_html_data() = 0; - /*********************************************/ - /**************Visible functions**************/ - /*********************************************/ // Summarizes and registers that it summarized void summarize(FastqStats &stats); @@ -106,19 +99,19 @@ class Module { put_data_on_html(HtmlMaker &html_maker); }; -class ModuleBasicStatistics : public Module { -public: - bool is_nanopore; +struct ModuleBasicStatistics : public Module { + static const std::string module_name; + bool is_nanopore{}; std::string file_type; std::string file_encoding; std::string filename_stripped; - size_t avg_read_length; - size_t avg_gc; - size_t num_poor; - size_t min_read_length; - size_t max_read_length; - size_t total_sequences; - static const std::string module_name; + std::size_t avg_read_length{}; + double avg_gc{}; + std::size_t num_poor{}; + std::size_t min_read_length{}; + std::size_t max_read_length{}; + std::size_t total_bases{}; + std::size_t total_sequences{}; ModuleBasicStatistics(const FalcoConfig &config); ~ModuleBasicStatistics() {} void @@ -129,27 +122,30 @@ class ModuleBasicStatistics : public Module { write_module(std::ostream &os); std::string make_html_data(); - void read_data_line(const std::string &line); }; -class ModulePerBaseSequenceQuality : public Module { -private: +struct ModulePerBaseSequenceQuality : public Module { + static const std::string module_name; // from FastQC: whether to group bases - bool do_group; - size_t num_bases; - size_t num_groups; - // grade criteria - size_t base_lower_warn, base_lower_error, base_median_warn, base_median_error; - size_t num_warn, num_error; + bool do_group{}; + std::size_t num_bases{}; + std::size_t num_groups{}; + std::size_t base_lower_warn{}; // grade criteria + std::size_t base_lower_error{}; // grade criteria + std::size_t base_median_warn{}; + std::size_t base_median_error{}; + std::size_t num_warn{}; + std::size_t num_error{}; std::vector group_mean; - std::vector group_ldecile, group_lquartile, group_median, - group_uquartile, group_udecile; + std::vector group_ldecile; + std::vector group_lquartile; + std::vector group_median; + std::vector group_uquartile; + std::vector group_udecile; std::vector base_groups; -public: - static const std::string module_name; ModulePerBaseSequenceQuality(const FalcoConfig &config); ~ModulePerBaseSequenceQuality() {} void @@ -164,14 +160,13 @@ class ModulePerBaseSequenceQuality : public Module { make_html_data(); }; -class ModulePerTileSequenceQuality : public Module { -private: - double grade_warn, grade_error; - size_t max_read_length; - std::unordered_map> tile_position_quality; - std::vector tiles_sorted; +struct ModulePerTileSequenceQuality : public Module { + double grade_warn{}; + double grade_error{}; + std::size_t max_read_length{}; + std::unordered_map> tile_position_quality; + std::vector tiles_sorted; -public: static const std::string module_name; ModulePerTileSequenceQuality(const FalcoConfig &config); ~ModulePerTileSequenceQuality() {} @@ -185,17 +180,15 @@ class ModulePerTileSequenceQuality : public Module { make_html_data(); }; -class ModulePerSequenceQualityScores : public Module { -private: - size_t mode_val; - size_t mode_ind; - size_t offset; - std::array quality_count; +struct ModulePerSequenceQualityScores : public Module { + std::size_t mode_val{}; + std::size_t mode_ind{}; + std::size_t offset{}; + std::array quality_count{}; // grade criteria - size_t mode_warn; - size_t mode_error; + std::size_t mode_warn{}; + std::size_t mode_error{}; -public: static const std::string module_name; ModulePerSequenceQualityScores(const FalcoConfig &config); ~ModulePerSequenceQualityScores() {} @@ -209,27 +202,25 @@ class ModulePerSequenceQualityScores : public Module { make_html_data(); }; -class ModulePerBaseSequenceContent : public Module { -private: - bool do_group; - std::vector a_pct, c_pct, t_pct, g_pct; - double max_diff; - size_t num_bases; - - // flag as to whether or not dataset is WGBS - bool is_bisulfite; +struct ModulePerBaseSequenceContent : public Module { + static const std::string module_name; + bool do_group{}; + std::vector a_pct; + std::vector c_pct; + std::vector t_pct; + std::vector g_pct; + double max_diff{}; + std::size_t num_bases{}; - // if so we have to test T vs C instead of A vs G - bool is_reverse_complement; + bool is_bisulfite{}; // flag for dataset is WGBS + bool is_reverse_complement{}; // indicates test T vs C instead of A vs G - // for grade - double sequence_error, sequence_warn; + double sequence_error{}; // for grade + double sequence_warn{}; // for grade - size_t num_groups; + std::size_t num_groups{}; std::vector base_groups; -public: - static const std::string module_name; ModulePerBaseSequenceContent(const FalcoConfig &config); ~ModulePerBaseSequenceContent() {} void @@ -242,15 +233,14 @@ class ModulePerBaseSequenceContent : public Module { make_html_data(); }; -class ModulePerSequenceGCContent : public Module { -private: - double gc_warn, gc_error; - double gc_deviation; +struct ModulePerSequenceGCContent : public Module { + static const std::string module_name; + double gc_warn{}; + double gc_error{}; + double gc_deviation{}; std::array gc_count; std::array theoretical_gc_count; -public: - static const std::string module_name; ModulePerSequenceGCContent(const FalcoConfig &config); ~ModulePerSequenceGCContent() {} void @@ -263,26 +253,21 @@ class ModulePerSequenceGCContent : public Module { make_html_data(); }; -class ModulePerBaseNContent : public Module { -private: - size_t num_bases; - // for grade - size_t grade_n_warn; - size_t grade_n_error; - - double max_n_pct; - std::array gc_count; - std::array theoretical_gc_count; +struct ModulePerBaseNContent : public Module { + static const std::string module_name; + std::size_t num_bases{}; + std::size_t grade_n_warn{}; // for grade + std::size_t grade_n_error{}; // for grade + double max_n_pct{}; + std::array gc_count; + std::array theoretical_gc_count; std::vector n_pct; - // grade vars - size_t gc_warn, gc_error; - - bool do_group; - size_t num_groups; + std::size_t gc_warn{}; // grade vars + std::size_t gc_error{}; // grade vars + bool do_group{}; + std::size_t num_groups{}; std::vector base_groups; -public: - static const std::string module_name; ModulePerBaseNContent(const FalcoConfig &config); ~ModulePerBaseNContent() {} void @@ -295,23 +280,21 @@ class ModulePerBaseNContent : public Module { make_html_data(); }; -class ModuleSequenceLengthDistribution : public Module { -private: - bool do_grade_error; - bool do_grade_warn; - size_t max_read_length; - std::vector sequence_lengths; +struct ModuleSequenceLengthDistribution : public Module { + static const std::string module_name; + bool do_grade_error{}; + bool do_grade_warn{}; + std::size_t max_read_length{}; + std::vector sequence_lengths; // warn and fail criteria - bool is_all_same_length; - size_t empty_reads; + bool is_all_same_length{}; + std::size_t empty_reads{}; - bool do_group; - size_t num_groups; + bool do_group{}; + std::size_t num_groups{}; std::vector base_groups; -public: - static const std::string module_name; ModuleSequenceLengthDistribution(const FalcoConfig &config); ~ModuleSequenceLengthDistribution() {} void @@ -324,19 +307,18 @@ class ModuleSequenceLengthDistribution : public Module { make_html_data(); }; -class ModuleSequenceDuplicationLevels : public Module { -private: - double seq_total, seq_dedup; +struct ModuleSequenceDuplicationLevels : public Module { + static const std::string module_name; + double seq_total{}; + double seq_dedup{}; - double grade_dup_warn; - double grade_dup_error; - double total_deduplicated_pct; - std::array percentage_deduplicated; - std::array percentage_total; - std::unordered_map counts_by_freq; + double grade_dup_warn{}; + double grade_dup_error{}; + double total_deduplicated_pct{}; + std::array percentage_deduplicated{}; + std::array percentage_total{}; + std::unordered_map counts_by_freq; -public: - static const std::string module_name; ModuleSequenceDuplicationLevels(const FalcoConfig &config); ~ModuleSequenceDuplicationLevels() {} void @@ -349,20 +331,20 @@ class ModuleSequenceDuplicationLevels : public Module { make_html_data(); }; -class ModuleOverrepresentedSequences : public Module { -private: - size_t num_reads; - std::vector> overrep_sequences; - double grade_warn, grade_error; - const double min_fraction_to_overrepresented = 0.001; +struct ModuleOverrepresentedSequences : public Module { + static constexpr auto min_fraction_to_overrepresented = 0.001; + static const std::string module_name; + + std::size_t num_reads{}; + std::vector> overrep_sequences; + double grade_warn{}; + double grade_error{}; std::vector> contaminants; // Function to find the matching contaminant within the list std::string get_matching_contaminant(const std::string &seq); -public: - static const std::string module_name; ModuleOverrepresentedSequences(const FalcoConfig &config); ~ModuleOverrepresentedSequences() {} void @@ -375,36 +357,31 @@ class ModuleOverrepresentedSequences : public Module { make_html_data(); }; -class ModuleAdapterContent : public Module { -private: - // Number of adapters to test - size_t num_adapters; - - // number of bases to report - size_t num_bases; - +struct ModuleAdapterContent : public Module { + static const std::string module_name; + std::size_t num_adapters{}; // Number of adapters to test + std::size_t num_bases{}; // number of bases to report // adapter size to know how many bases to report - size_t adapter_size; + std::size_t adapter_size{}; - // Information from config + // info from config std::vector adapter_names; std::vector adapter_seqs; - std::vector adapter_hashes; - size_t shortest_adapter_size; + std::vector adapter_hashes; + std::size_t shortest_adapter_size{}; - // vector to be reported + // to be reported std::vector> adapter_pos_pct; - // minimum percentages for warn/fail - double grade_warn, grade_error; + // min minimum cutoffs for warn/fail (percentages) + double grade_warn{}; + double grade_error{}; // Aux function to count adapter in a position double - count_adapter(const std::vector &kmer_count, const size_t pos, - const size_t adapter_hash, const size_t adapter_size, - const size_t kmer_size); + count_adapter(const std::vector &kmer_count, + const std::size_t pos, const std::size_t adapter_hash, + const std::size_t adapter_size, const std::size_t kmer_size); -public: - static const std::string module_name; ModuleAdapterContent(const FalcoConfig &config); ~ModuleAdapterContent() {} void @@ -417,25 +394,25 @@ class ModuleAdapterContent : public Module { make_html_data(); }; -class ModuleKmerContent : public Module { -private: - size_t num_kmer_bases; - size_t kmer_size; - size_t num_kmers; - size_t num_seen_kmers; +struct ModuleKmerContent : public Module { + static constexpr std::size_t MIN_OBS_EXP_TO_REPORT = 5; + static constexpr std::size_t MAX_KMERS_TO_REPORT = 20; + static constexpr std::size_t MAX_KMERS_TO_PLOT = 10; + static const std::string module_name; + + std::size_t num_kmer_bases{}; + std::size_t kmer_size{}; + std::size_t num_kmers{}; + std::size_t num_seen_kmers{}; - double grade_warn, grade_error; - std::array pos_kmer_count; - std::vector total_kmer_counts; + double grade_warn{}; + double grade_error{}; + std::array pos_kmer_count{}; + std::vector total_kmer_counts; std::vector obs_exp_max; - std::vector where_obs_exp_is_max; - std::vector> kmers_to_report; + std::vector where_obs_exp_is_max; + std::vector> kmers_to_report; -public: - static const std::string module_name; - static const size_t MIN_OBS_EXP_TO_REPORT = 5; - static const size_t MAX_KMERS_TO_REPORT = 20; - static const size_t MAX_KMERS_TO_PLOT = 10; ModuleKmerContent(const FalcoConfig &config); ~ModuleKmerContent() {} void From 005fdf2f896bb11f50bae698da7eba455b913281 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 20 Mar 2026 18:47:19 -0700 Subject: [PATCH 3/5] src/Module.cpp: assigning to total_bases from stats.total_bases which is tabulated in StreamReader and formatting output of total bases. Calculating avg_gc as double using those total_bases and formatting the output with one digit after the decimal, rather than truncating which FastQC might be doing --- src/Module.cpp | 151 ++++++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 72 deletions(-) diff --git a/src/Module.cpp b/src/Module.cpp index c7223c9..b3365a2 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -44,22 +44,24 @@ make_default_base_groups(std::vector &base_groups, const std::size_t num_bases) { base_groups.clear(); for (std::size_t i = 0; i < num_bases; ++i) - base_groups.push_back(BaseGroup(i, i)); + base_groups.push_back({i, i}); } /************* EXP BASE GROUP **************/ void make_exponential_base_groups(std::vector &base_groups, const std::size_t &num_bases) { - std::size_t starting_base = 0, end_base, interval = 1; + std::size_t starting_base{}; + std::size_t end_base{}; + std::size_t interval{1}; base_groups.clear(); - for (; starting_base < num_bases;) { + while (starting_base < num_bases) { end_base = starting_base + interval - 1; if (end_base >= num_bases) end_base = num_bases; - base_groups.push_back(BaseGroup(starting_base, end_base)); + base_groups.push_back({starting_base, end_base}); starting_base += interval; if (starting_base == 9 && num_bases > 75) interval = 5; @@ -107,37 +109,40 @@ void make_linear_base_groups(std::vector &base_groups, const std::size_t num_bases) { - // For lengths below 75bp we just return everything. + // lengths not larger than 75bp just return everything if (num_bases <= 75) { make_default_base_groups(base_groups, num_bases); return; } - // We need to work out what interval we're going to use. + // determine the interval to use const std::size_t interval = get_linear_interval(num_bases); - std::size_t starting_base = 1; + std::size_t starting_base{1}; while (starting_base <= num_bases) { - std::size_t end_base = starting_base + interval - 1; - - if (starting_base < 10) - end_base = starting_base; - - if (starting_base == 10 && interval > 10) - end_base = interval - 1; - - if (end_base > num_bases) - end_base = num_bases; - - BaseGroup bg = BaseGroup(starting_base - 1, end_base - 1); - base_groups.push_back(bg); - - if (starting_base < 10) - starting_base++; - else if (starting_base == 10 && interval > 10) - starting_base = interval; - else - starting_base += interval; + const auto end_base = [&] { + std::size_t end_base = starting_base + interval - 1; + if (starting_base < 10) + end_base = starting_base; + if (starting_base == 10 && interval > 10) + end_base = interval - 1; + if (end_base > num_bases) + end_base = num_bases; + return end_base; + }(); + + assert(starting_base > 0u && end_base > 0u); + base_groups.push_back({starting_base - 1ul, end_base - 1ul}); + + starting_base = [&] { + if (starting_base < 10) + starting_base++; + else if (starting_base == 10 && interval > 10) + starting_base = interval; + else + starting_base += interval; + return starting_base; + }(); } } @@ -165,33 +170,31 @@ get_corrected_count(std::size_t count_at_limit, std::size_t num_reads, if (num_reads - num_obs < count_at_limit) return num_obs; - // If not then we need to see what the likelihood is that we had - // another sequence with this number of observations which we would - // have missed. We'll start by working out the probability of NOT seeing a - // sequence with this duplication level within the first count_at_limit - // sequences of num_obs. This is easier than calculating - // the probability of seeing it. + // If not then we need to see what the likelihood is that we had another + // sequence with this number of observations which we would have missed. We'll + // start by working out the probability of NOT seeing a sequence with this + // duplication level within the first count_at_limit sequences of num_obs. + // This is easier than calculating the probability of seeing it. double p_not_seeing = 1.0; // To save doing long calculations which are never going to produce anything // meaningful we'll set a limit to our p-value calculation. This is the // probability below which we won't increase our count by 0.01 of an // observation. Once we're below this we stop caring about the corrected - // value since it's going to be so close to the observed value thatwe can - // just return that instead. - double limit_of_caring = 1.0 - (num_obs / (num_obs + 0.01)); + // value since it's going to be so close to the observed value thatwe can just + // return that instead. + const double limit_of_caring = 1.0 - (num_obs / (num_obs + 0.01)); for (std::size_t i = 0; i < count_at_limit; ++i) { p_not_seeing *= static_cast((num_reads - i) - dup_level) / static_cast(num_reads - i); - if (p_not_seeing < limit_of_caring) { p_not_seeing = 0; break; } } - // Now we can assume that the number we observed can be - // scaled up by this proportion + // Now we can assume that the number we observed can be scaled up by this + // proportion return num_obs / std::max(std::numeric_limits::min(), 1.0 - p_not_seeing); } @@ -377,8 +380,9 @@ ModuleBasicStatistics::ModuleBasicStatistics(const FalcoConfig &config) : void ModuleBasicStatistics::summarize_module(FastqStats &stats) { - // Total sequences + // total sequences and bases total_sequences = stats.num_reads; + total_bases = stats.total_bases; // min and max read length min_read_length = stats.min_read_length; @@ -430,49 +434,56 @@ ModuleBasicStatistics::summarize_module(FastqStats &stats) { // Average read length avg_read_length = 0; - std::size_t total_bases = 0; + std::size_t total_bases_for_mean = 0; for (std::size_t i = 0; i < max_read_length; ++i) { if (i < FastqStats::SHORT_READ_THRESHOLD) - total_bases += i * stats.read_length_freq[i]; + total_bases_for_mean += i * stats.read_length_freq[i]; else - total_bases += + total_bases_for_mean += i * stats.long_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD]; } - - avg_read_length = - total_bases / std::max(static_cast(1), total_sequences); + avg_read_length = total_bases_for_mean / std::max(1ul, total_sequences); // counts bases G and C in each base position - avg_gc = 0; // GC % // GS: TODO delete gc calculation during stream and do it using the total G // counts in all bases avg_gc = - 100 * stats.total_gc / std::max(1.0, static_cast(total_bases)); + 100.0 * stats.total_gc / std::max(1.0, static_cast(total_bases)); } -// It's always a pass void -ModuleBasicStatistics::make_grade() {} +ModuleBasicStatistics::make_grade() {} // always a pass void ModuleBasicStatistics::write_module(std::ostream &os) { + static constexpr auto mega = 1'000'000; os << "#Measure\tValue\n"; os << "Filename\t" << filename_stripped << "\n"; os << "File type\t" << file_type << "\n"; os << "Encoding\t" << file_encoding << "\n"; os << "Total Sequences\t" << total_sequences << "\n"; + // clang-format off + os << "Total Bases\t" + << (total_bases > mega ? total_bases / mega : total_bases) + << (total_bases > mega ? " Mbp\n" : " bp\n"); + // clang-format on os << "Sequences flagged as poor quality\t" << num_poor << "\n"; os << "Sequence length\t"; - if (min_read_length == max_read_length) { - os << min_read_length; - } - else { - os << min_read_length << "-" << max_read_length; - } + os << min_read_length; + if (min_read_length != max_read_length) + os << "-" << max_read_length; os << "\n"; - os << "%GC\t" << static_cast(avg_gc) << "\n"; + const auto default_precision{os.precision()}; + // clang-format off + os << "%GC\t" + << std::setprecision(1) + << std::fixed + << avg_gc << '\n' + << std::defaultfloat + << std::setprecision(default_precision); + // clang-format on } std::string @@ -515,9 +526,9 @@ ModuleBasicStatistics::read_data_line(const std::string &line) { else if (lhs == "Encoding") file_encoding = rhs; else if (lhs == "Total Sequences") - total_sequences = atoi(rhs.c_str()); + total_sequences = std::atoi(std::data(rhs)); else if (lhs == "Sequences flagged as poor quality") - num_poor = atoi(rhs.c_str()); + num_poor = std::atoi(std::data(rhs)); else if (lhs == "Sequence length") { // non-constant sequence length @@ -526,12 +537,12 @@ ModuleBasicStatistics::read_data_line(const std::string &line) { std::string min_l, max_l; std::getline(seq_iss, min_l, '-'); std::getline(seq_iss, max_l, '-'); - min_read_length = atoi(min_l.c_str()); - max_read_length = atoi(max_l.c_str()); + min_read_length = std::atoi(std::data(min_l)); + max_read_length = std::atoi(std::data(max_l)); } } else if (lhs == "%GC") - avg_gc = atoi(rhs.c_str()); + avg_gc = std::atoi(std::data(rhs)); else { throw std::runtime_error("malformed basic statistic" + lhs); } @@ -819,7 +830,7 @@ ModulePerTileSequenceQuality::summarize_module(FastqStats &stats) { for (std::size_t i = 0; i < lim; ++i) { // transform sum of all qualities in mean const auto itr = stats.tile_position_count.find(v.first); - if (itr == cend(stats.tile_position_count)) + if (itr == std::cend(stats.tile_position_count)) throw std::runtime_error( "failure ModulePerTileSequenceQuality::summarize_module"); const std::size_t count_at_pos = itr->second[i]; @@ -1787,19 +1798,15 @@ void ModuleOverrepresentedSequences::summarize_module(FastqStats &stats) { // Keep only sequences that pass the input cutoff num_reads = stats.num_reads; - for (auto it = stats.sequence_count.begin(); it != stats.sequence_count.end(); - ++it) { + for (auto it = std::cbegin(stats.sequence_count); + it != std::cend(stats.sequence_count); ++it) { if (it->second > num_reads * min_fraction_to_overrepresented) { overrep_sequences.push_back(*it); } } - - // Sort strings by frequency - std::sort(begin(overrep_sequences), end(overrep_sequences), - [](const std::pair &a, - const std::pair &b) { - return a.second > b.second; - }); + // sort strings by frequency + std::sort(std::begin(overrep_sequences), std::end(overrep_sequences), + [](const auto &a, const auto &b) { return a.second > b.second; }); } void From 7804083f8e9550f9fe80244ff1324338301886a9 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 20 Mar 2026 19:05:17 -0700 Subject: [PATCH 4/5] src/HtmlMaker.cpp: fixing a bug in formatting the html --- src/HtmlMaker.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/HtmlMaker.cpp b/src/HtmlMaker.cpp index 1dc8ca6..ffa789d 100644 --- a/src/HtmlMaker.cpp +++ b/src/HtmlMaker.cpp @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include void HtmlMaker::put_data(const std::string &placeholder, const std::string &data) { @@ -41,11 +41,11 @@ HtmlMaker::put_data(const std::string &placeholder, const std::string &data) { void HtmlMaker::put_comment(std::string &comment_begin, std::string &comment_end, const bool done) { - if (!done) { // put html comments if analysis was skipped + if (!done) { // put html comments if analysis was skipped put_data(comment_begin, ""); } - else { // otherwise delete placeholder + else { // otherwise delete placeholder put_data(comment_begin, ""); put_data(comment_end, ""); } @@ -54,20 +54,22 @@ HtmlMaker::put_comment(std::string &comment_begin, std::string &comment_end, void HtmlMaker::put_file_details(const FalcoConfig &falco_config) { using namespace std::string_literals; - static const auto left_tag = "\\{\\{"s; - static const auto right_tag = "\\}\\}"s; + static constexpr auto left_tag = "\\{\\{"; + static constexpr auto right_tag = "\\}\\}"; + const auto filename_formatted = falco_config.filename_stripped; std::regex filename_re(left_tag + "filename"s + right_tag); - std::regex_replace(html_boilerplate, filename_re, - falco_config.filename_stripped); + html_boilerplate = + std::regex_replace(html_boilerplate, filename_re, filename_formatted); using system_clock = std::chrono::system_clock; auto time_unformatted = system_clock::to_time_t(system_clock::now()); std::string time_formatted = std::string(ctime(&time_unformatted)); std::regex date_re(left_tag + "date"s + right_tag); - std::regex_replace(html_boilerplate, date_re, time_formatted); + html_boilerplate = + std::regex_replace(html_boilerplate, date_re, time_formatted); std::regex version_re(left_tag + "version"s + right_tag); - std::regex_replace(html_boilerplate, version_re, VERSION); + html_boilerplate = std::regex_replace(html_boilerplate, version_re, VERSION); } From 29ad868734dc9fbd330d0ae644d3f99da774c924 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Fri, 20 Mar 2026 19:07:45 -0700 Subject: [PATCH 5/5] test/md5sum.txt: updating hashes for test output files based on changes to output for total bases and gc percent --- test/md5sum.txt | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/md5sum.txt b/test/md5sum.txt index faac2a1..c1564b8 100644 --- a/test/md5sum.txt +++ b/test/md5sum.txt @@ -1,30 +1,30 @@ -fe1d421b95e1289f62d5f60157b8bed0 test_output/SRR1853178_1/fastqc_data.txt +37014d8b6e5338c54f6e39a9a65448da test_output/SRR1853178_1/fastqc_data.txt 36df1dcab539ba4ef885239fc8524636 test_output/SRR1853178_1/summary.txt -19c984bdddd5d656e8bb9f50bb08fc5a test_output/SRR3897196_2/fastqc_data.txt +ce1121532724dbfc9f75408f3b6bb1b8 test_output/SRR3897196_2/fastqc_data.txt 80cd130958bcb2219f1e5a07d06a3b6e test_output/SRR3897196_2/summary.txt -9bb60254ebbca76328a0fc5c7d35d38f test_output/SRR10143153_2/fastqc_data.txt +189608afba8bc88f237259a8801417c4 test_output/SRR9624732_1/fastqc_data.txt +c94f94bdfbed9b83f156c15ffea84127 test_output/SRR9624732_1/summary.txt +a96c16a6ac4cce541f745b25d5b04fef test_output/SRR10143153_2/fastqc_data.txt 19f1811f324e4c44154f659bb6e22806 test_output/SRR10143153_2/summary.txt -b5b45d94670c42ddff565d53ff9b34e8 test_output/SRR1772703.lite.1/fastqc_data.txt +1991f6ba435c23642d3141856864ad99 test_output/SRR1772703.lite.1/fastqc_data.txt ad5727295e7c8de6eb6874837bf1518f test_output/SRR1772703.lite.1/summary.txt -3844262fde0e6c1bfeb6f3d12da3e483 test_output/SRR9624732_2/fastqc_data.txt +657b3a5b77d8a7b9d5e8ff3c08dae765 test_output/SRR9624732_2/fastqc_data.txt fefc5d746f853c14b5e00421ad1ec260 test_output/SRR9624732_2/summary.txt -e7b534295a334f21f143daf0209708a1 test_output/SRR10124060/fastqc_data.txt +e4df74ec74cf5ee69dd76d08825d1b2c test_output/SRR10124060/fastqc_data.txt 776f7d1b53bbed8683de9ca1d2529f1e test_output/SRR10124060/summary.txt -38556ea8d058797f2e30b48c30fe77c6 test_output/SRR891268_2/fastqc_data.txt +e5e62fbaefdf730452e0133eca248f69 test_output/SRR891268_2/fastqc_data.txt 20a8e50baace4c672622793874a3d7de test_output/SRR891268_2/summary.txt -3cc125e2b29921e9369194616941b9e3 test_output/SRR9878537.lite.1/fastqc_data.txt +401d183c10a50bdf0eac497c63630918 test_output/SRR9878537.lite.1/fastqc_data.txt e5c40997d4993c07e164ee5598c39cf9 test_output/SRR9878537.lite.1/summary.txt -f9ebe6a18e4438a79d535baa59d6a629 test_output/SRR891268_1/fastqc_data.txt +8334959b07f8baa92548ad939aff5df0 test_output/SRR891268_1/fastqc_data.txt 69e7d0c53cd2e67117637c408b65333a test_output/SRR891268_1/summary.txt -f11a39a545f2469161c7502c56082ee3 test_output/SRR6059706.lite.1/fastqc_data.txt +985da7c31b11a4cb40686bf20d0df9cd test_output/SRR6059706.lite.1/fastqc_data.txt e348e4bcc7fc6f05e989ac7858d2b287 test_output/SRR6059706.lite.1/summary.txt -753bb1af5e2cf52a38e5d7b3d2c2f39b test_output/SRR6387347/fastqc_data.txt +2eb1acd772bf29031455cb38072a396f test_output/SRR6387347/fastqc_data.txt a61f65047e76f93300967cf399d044de test_output/SRR6387347/summary.txt -513957b15a848d5eb29ca8e5c2ed0c45 test_output/SRR3897196_1/fastqc_data.txt +d79d49f81677f19247bbd9cd6021f903 test_output/SRR3897196_1/fastqc_data.txt b736ee95d5c450ef5c0dda31957b6818 test_output/SRR3897196_1/summary.txt -2297ae14b668630cb547aacfd2da992f test_output/SRR10143153_1/fastqc_data.txt +c93b8e4f2f14664419f57fb33edf5b64 test_output/SRR10143153_1/fastqc_data.txt 9ad191925d47a57d4f8b12f21ba0a7c3 test_output/SRR10143153_1/summary.txt -255171890adc7117a5c4fae6d355091e test_output/SRR1853178_2/fastqc_data.txt +940b43bfbddd9d22cac65895c6d37ae8 test_output/SRR1853178_2/fastqc_data.txt c331d0f7a6aa9d72be41ac531f9ba269 test_output/SRR1853178_2/summary.txt -c94f94bdfbed9b83f156c15ffea84127 test_output/SRR9624732_1/summary.txt -b433a0d30f3952f2f5f94cd90ecc6939 test_output/SRR9624732_1/fastqc_data.txt