-
Couldn't load subscription status.
- Fork 80
Description
XCMS must make a lot of EICs during peak picking and it is also used for chromatograms() etc.
For QC4Metabolomics I had to screen for a lot of contaminants, so I tried to see if I could make it faster. I tried the C version of getEIC and everything else I could think of but the only significant improvement was using join_by and between from dplyr. This is several times faster and is not affected much by the number of EICs extract from a file.
So as I discussed with Johannes at the metabolomics conference I leave here a proof of concept. Obviously this implementation is too tidyverse centric but could be used as a reference for a faster implementation.
Time comparison when doing 800 EICs is ~1 min vs 6 min.
Using the example from Metabonaut:
# Load test data ----------------------------------------------------------
library(Metabonaut)
library(MsBackendMetaboLights)
library(MsIO)
library(MsExperiment)
library(xcms)
library(purrr)
library(MSnbase)
param <- MetaboLightsParam(mtblsId = "MTBLS8735",
assayName = paste0("a_MTBLS8735_LC-MS_positive_",
"hilic_metabolite_profiling.txt"),
filePattern = ".mzML")
lcms1 <- readMsObject(MsExperiment(),
param,
keepOntology = FALSE,
keepProtocol = FALSE,
simplify = TRUE)
# Load our list of standard -----------------------------------------------
intern_standard <- read.delim("https://raw.githubusercontent.com/rformassspectrometry/Metabonaut/eb5b5674d00a65ac1ca6cec74ff6d3ed8df38e15/vignettes/intern_standard_list.txt")
# nastier version with 801 intervals
#intern_standard <- data.frame(mzmin = 200:1000, mzmax = (200:1000)+0.1 )
# Current way to get EICs -------------------------------------------------
system.time({
eic_is <- chromatogram(
lcms1,
mz = as.matrix(intern_standard[, c("mzmin", "mzmax")]),
aggregationFun = "max"
)
})
# 102 s
# nasty version: 363
Faster EIC extraction with dplyr::join_by
# Using dplyr::join_by ----------------------------------------------------
## function to get raw data into a table
get_raw_long <- function(raw){
require(dplyr)
bind_cols(scan_rt = rtime(spectra(raw)),
scan = scanIndex(spectra(raw)),
tibble(mz_int = lapply(peaksData(spectra(raw)), as_tibble))
) %>%
unnest(mz_int)
}
## function to extract the EICs from the complete raw data
extract_intervals <- function(raw, lower, upper, min_intensity = 0){
require(dplyr)
require(tidyr)
require(purrr)
# get raw data in a table
raw_long <- get_raw_long(raw)
# make a table that has all scan x interval combinations
raw_long_distinct_all <- raw_long %>%
distinct(scan, scan_rt)
raw_long_distinct_all <- map_dfr(1:length(lower), ~ mutate(raw_long_distinct_all, idx = ..1))
# filter all raw data that is below min or above max to make the join faster
# also here an intensity filter is included
raw_long <- raw_long %>%
filter(intensity>=min_intensity) %>%
filter(mz>min(lower), mz<max(upper))
EICs_final <- raw_long %>%
# join raw data with a table of the intervals using join_by
left_join(
tibble(idx = 1:length(lower), lower , upper),
by = join_by(between(mz, lower, upper)),
multiple = "all"
) %>%
filter(!is.na(idx)) %>% # remove all scans not included in an interval
group_by(idx, scan, scan_rt) %>% # group data by interval
summarise(intensity = max(intensity), .groups = "drop") %>% # take max when more values in same scan
full_join(raw_long_distinct_all, by = c("idx", "scan", "scan_rt")) %>% # join with all possible scan x interval combinations to be able to fill zeros
mutate(intensity = if_else(is.na(intensity), 0, intensity)) %>% # fill in zeros
arrange(scan) %>% # sort
group_by(idx) %>%
group_nest() %>% # make a list
pull(data) # extract list
return(EICs_final)
}Run:
# Extract the EICs using the fast function going one file at a time -------
# all the joining stuff could be done for all files in one go but at the cost of memory
system.time({
eic_is_fast <- map(1:length(lcms1), ~extract_intervals(lcms1[..1,], intern_standard[, c("mzmin")], intern_standard[, c("mzmax")]))
})
# 32 s
# nasty version: 54 sTest that they are the same:
# Comparison of results ---------------------------------------------------
eic_is_fast_1_1 <- Chromatogram(
rtime = eic_is_fast[[1]][[1]]$scan_rt,
intensity = eic_is_fast[[1]][[1]]$intensity,
msLevel = 1L,
mz = c(mzmin = intern_standard[1, c("mzmin")], mzmax = intern_standard[1, c("mzmax")]),
aggregationFun = "max"
)
compareChromatograms(eic_is_fast_1_1, eic_is[1,1])