Skip to content
Open
289 changes: 289 additions & 0 deletions automatedSelEstimscript2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# Copyright INRA
# author: Renaud VITALIS (2013)
#
# [email protected]
#
# Mariana DAMAS-MORA (2023)
#
#[email protected]
# This file is part of SelEstim.
#
# SelEstim is a computer program whose purpose is to is to detect
# and measure selection from gene frequency data.
#
# SelEstim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# SelEstim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#automatedselestimscript: Plots the specific selection coefficient of the makers in each position and the KLD values fpr each locus from the SelEstim results.
#Usage: Rscript [path to the automatedselestimscript.R] [path to summary_delta.out] [path to calibration summary_delta.out path] [path to summary_sigma.out] [path to calibration map] [path to the output specific selection coefficient plot] [path to the kld values output plot]
#source script
library(gplots)
#adaptatition to get arguments
args <- commandArgs(trailingOnly = TRUE)

# Get arguments
delta <- args[1]
calibration_delta <- args[2]
sigma <- args[3]
output_coefficient <- args[4]
output_kld <- args[5]
#calibration_map <- args[6]

plot.delta <- function(file = "summary_delta.out",map = "") {

# 'map' is the name of a file that contains two columns only: the SNP ID and its position (in bp). Be careful to keep the same order of the SNPs as in the data file)

palette(rich.colors(64))
output <- read.table(file,header = TRUE)
mean.delta <- output$mean
relative.delta <- (mean.delta / max(mean.delta))
n.snps <- length(mean.delta)
if (map != "") {
if(!file.exists(map)) {
stop(paste("The file ",map," does not exist...",sep = ""))
}
physical.map <- read.table(map,header = FALSE)
position <- physical.map[,2] / 1e6
sorted <- order(position)
plot(position[sorted],mean.delta[sorted],type = "n",xlab = "Position (Mb)",ylab = expression(Locus-specific~selection~coefficient~delta[j]),ylim = c(0,max(mean.delta) + 10))
segments(position[sorted],0,position[sorted],mean.delta[sorted],col = 1 + 63 * relative.delta[sorted],lwd = relative.delta[sorted])
}
else {
plot(seq(1,n.snps),mean.delta,type = "n",xlab = "Markers",ylab = expression(Locus-specific~selection~coefficient~delta[j]),ylim = c(0,max(mean.delta) + 10))
segments(seq(1,n.snps),0,seq(1,n.snps),mean.delta,col = 1 + 63 * relative.delta,lwd = relative.delta)
}
}

plot.kld <- function(file = "summary_delta.out",map = "",calibration_file = "calibration/summary_delta.out",limit,window.size,n.markers) {

# 'map' is the name of a file that contains two columns only: the SNP ID and its position (in bp). Be careful to keep the same order of the SNPs as in the data file)
# 'limit' (optional) is used to compute the threshold value of the empirical distribution of the KLD used to calibrate the KLD;
# e.g., if you chose limit = 0.01, then the 99\%-quantile of the KLD distribution from the pod analysis will be used as a decision criterion to discriminate between selection and neutrality.
# 'window.size (optional) is the size of the sliding window, which may be used to determine the regions that contain at least n.markers with KLD above the threshold determined by the 'limit' argument
# 'n.markers' (optional) is uded to determine the regions that contain at least 'n.markers' with KLD above the threshold determined by the 'limit' argument

output <- read.table(file,header = TRUE)
kld <- output$KLD
n.snps <- length(kld)
if (map != "") {
if (!file.exists(map)) {
stop(paste("The file ",map," does not exist...",sep = ""))
}
physical.map <- read.table(map,header = F)
position <- physical.map[,2]
sorted <- order(position)
sorted.position <- position[sorted]
sorted.position.mb <- sorted.position / 1e6
plot(sorted.position.mb,kld[sorted],cex = 0.25,xlab = "Position (Mb)",ylab = "Kullback-Leibler divergence (KLD)",pch = 16,col = "grey",type = "n")
points(sorted.position.mb,kld[sorted],cex = 0.25,pch = 16,col = "grey")
if (!missing(window.size) & !missing(n.markers) & !missing(limit)) {
if (!file.exists(calibration_file)) {
stop(paste("The file ",calibration_file," does not exist...",sep = ""))
}
calibration <- read.table(calibration_file,header = TRUE)
kld.calibration <- calibration$KLD
threshold <- quantile(kld.calibration,(1 - limit))
outstanding.region <- vector("numeric",n.snps)
for (i in 1:n.snps) {
window <- abs(sorted.position[i] - sorted.position) <= (window.size / 2)
outstanding.region[i] <- length(kld[window][kld[window] >= threshold])
}
within <- (outstanding.region >= n.markers & kld >= threshold)
if (length(within[within]) > 0) {
segments(sorted.position.mb[within],0, sorted.position.mb[within],max(kld),col = rich.colors(1,alpha = 0.2),lwd = 2)
}
points(sorted.position.mb[within][kld[within] >= threshold],kld[within][kld[within] >= threshold],cex = 0.25,col = "black",pch = 8)
}
else {
if (!missing(limit)) {
if (!file.exists(calibration_file)) {
stop(paste("The file ",calibration_file," does not exist...",sep = ""))
}
calibration <- read.table(calibration_file,header = TRUE)
kld.calibration <- calibration$KLD
threshold <- quantile(kld.calibration,(1 - limit))
# points(position.mb[sorted][kld[sorted] >= threshold],kld[sorted][kld[sorted] >= threshold],cex = 0.25,col = "black",pch = 8)
points(sorted.position.mb[kld[sorted] >= threshold],kld[sorted][kld[sorted] >= threshold],cex = 0.25,col = "black",pch = 8)
}
}
}
else {
plot(seq(1,n.snps),kld,cex = 0.25,xlab = "Markers",ylab = "Kullback-Leibler divergence (KLD)",pch = 16,col = "grey",type = "n")
points(seq(1,n.snps),kld,cex = 0.25,pch = 16,col = "grey")
if (!missing(limit)) {
if (!file.exists(calibration_file)) {
stop(paste("The file ",calibration_file," does not exist...",sep = ""))
}
calibration <- read.table(calibration_file,header = TRUE)
kld.calibration <- calibration$KLD
threshold <- quantile(kld.calibration,(1 - limit))
points(seq(1,n.snps)[kld >= threshold],kld[kld >= threshold],cex = 0.25,col = "black",pch = 8)
print(seq(1,n.snps)[kld>=threshold])
print(length(seq(1,n.snps)[kld>=threshold]))

}
}
}

randomize.reference.allele <- function (infile = "",outfile = "",pool = FALSE) {

# 'infile' is the original dataset
# 'outfile' is the dataset where reference alleles are chosen randomly
# 'pool' is an indicator variable that says whether the data consist in allele counts or reads (pooled data)
# the 'reference.allele' output file contains, for each locus, the reference allele chosen from the original data

if (infile != "") {
if(!file.exists(infile)) {
stop(paste("The file ",infile," does not exist...",sep = ""))
}
}
else {
stop(paste("\n\tThe argument \"infile\" is missing, with no default value",sep = ""))
}
if (outfile == "") {
stop(paste("\n\tThe argument \"outfile\" is missing, with no default value",sep = ""))
}
skip.lines <- 2
if (pool) {
skip.lines <- skip.lines + 1
}
orig.data <- read.table(infile,skip = skip.lines)
dummy <- scan(infile,nmax = 2)
number.of.populations <- dummy[1]
number.of.loci <- dummy[2]
if (number.of.populations != (ncol(orig.data) / 2)) {
stop(paste("\tProblem reading file \"infile\": perhaps the \"pool\" argument is misspecified",sep = ""))
}
if (number.of.loci != nrow(orig.data)) {
stop(paste("\tProblem reading file \"infile\": perhaps the \"pool\" argument is misspecified",sep = ""))
}
new.data <- matrix(nrow = nrow(orig.data),ncol = ncol(orig.data))
flip <- vector(mode = "numeric",length = ncol(orig.data))
cpt <- 0
for (i in 1: number.of.populations) {
flip[c(1,2) + cpt] <- c(2,1) + cpt
cpt <- cpt + 2
}
mask <- sample(c(TRUE,FALSE),size = number.of.loci,replace = TRUE)
same <- seq(1,number.of.loci)[mask]
anti <- seq(1,number.of.loci)[!mask]
new.data[same,] <- as.matrix(orig.data[same,])
new.data[anti,] <- as.matrix(orig.data[anti,flip])
write.table(number.of.populations,file = outfile,row.names = FALSE,col.names = FALSE,sep = '\t')
write.table(number.of.loci,file = outfile,row.names = FALSE,col.names = FALSE,sep = '\t',append = TRUE)
if (pool) {
sample.size <- read.table(infile,nrows = 1,skip = 2)
write.table(sample.size,file = outfile,row.names = FALSE,col.names = FALSE,sep = '\t',append = TRUE)
}
write.table(new.data,file = outfile,row.names = FALSE,col.names = FALSE,sep = '\t',append = TRUE)
list.alleles <- 2 - mask
write.table(cbind(seq(1,number.of.loci),list.alleles),file = "reference.allele",quote = FALSE,row.names = FALSE,col.names = c("locus","allele"),sep = '\t')
}

compute.F_ST <- function(infile = "",pool = FALSE) {

# 'infile' is the original dataset (in SelEstim format)
# 'pool' is an indicator variable that says whether the data consist in allele counts or reads (pooled data)
# The F_ST for pool-seq data is compuyed following Hivert et al. (in prep.)
# The compute.F_ST function results in a list of two elements: F_ST (a vector of locus-specific F_ST estimates) and F_ST_multilocus (the multilocus estimate of F_ST)

if (infile != "") {
if(!file.exists(infile)) {
stop(paste("The file ",infile," does not exist...",sep = ""))
}
}
else {
stop(paste("\n\tThe argument \"infile\" is missing, with no default value",sep = ""))
}
if (!pool) {
skip.lines <- 2
counts <- read.table(infile,skip = skip.lines)
dummy <- scan(infile,nmax = 2)
number.of.populations <- dummy[1]
number.of.loci <- dummy[2]
if (number.of.populations != (ncol(counts) / 2)) {
stop(paste("\tProblem reading file \"infile\": perhaps the \"pool\" argument is misspecified",sep = ""))
}
if (number.of.loci != nrow(counts)) {
stop(paste("\tProblem reading file \"infile\": perhaps the \"pool\" argument is misspecified",sep = ""))
}
r <- ncol(counts) / 2
l <- seq(1,(2 * r),2)
ss <- counts[,l] + counts[,(l + 1)]
ss2 <- rowSums((counts[,l] + counts[,(l + 1)])^2)
n <- rowSums(ss)
n_c <- (n - ss2 / n) / (r - 1.0)
p <- counts[,l] / ss
q <- counts[,(l + 1)] / ss
pbar <- rowSums(counts[,l]) / rowSums(ss)
qbar <- rowSums(counts[,(l + 1)]) / rowSums(ss)
SSI <- rowSums(ss * (p - p^2) + ss * (q - q^2))
SSP <- rowSums(ss * (p - pbar)^2 + ss * (q - qbar)^2)
MSI <- SSI / (n - r)
MSP <- SSP / (r - 1.0)
}
else {
skip.lines <- 3
reads <- read.table(infile,skip = skip.lines)
dummy <- scan(infile,nmax = 2)
number.of.populations <- dummy[1]
number.of.loci <- dummy[2]
if (number.of.populations != (ncol(reads) / 2)) {
stop(paste("\tProblem reading file \"infile\": perhaps the \"pool\" argument is misspecified",sep = ""))
}
if (number.of.loci != nrow(reads)) {
stop(paste("\tProblem reading file \"infile\": perhaps the \"pool\" argument is misspecified",sep = ""))
}
n_i <- as.vector(read.table(infile,skip = (skip.lines - 1),nrows = 1),mode = "numeric")
if (length(n_i) != (ncol(reads) / 2)) {
stop(paste("\tProblem reading file \"infile\": the pool sizes are misspecified",sep = ""))
}
nbr_loci <- nrow(reads)
n_d <- ncol(reads) / 2
l <- seq(1,(2 * n_d),2)
mtrx.n_i <- matrix(n_i,nrow = nbr_loci,ncol = n_d,byrow = TRUE)
R_1_i <- reads[,l] + reads[,(l + 1)]
R_1 <- rowSums(R_1_i)
R_2 <- rowSums(R_1_i^2)
C_1 <- rowSums(R_1_i / mtrx.n_i + (mtrx.n_i - 1) / mtrx.n_i)
C_1.star <- rowSums(R_1_i * (R_1_i / mtrx.n_i + (mtrx.n_i - 1) / mtrx.n_i)) / R_1
n_c <- (R_1 - R_2 / R_1) / (C_1 - C_1.star)
SSI <- rowSums(reads[,l] - reads[,l]^2 / R_1_i + reads[,(l + 1)] - reads[,(l + 1)]^2 / R_1_i)
SSP <- rowSums(R_1_i * ((reads[,l] / R_1_i) - (rowSums(reads[,l]) / R_1))^2 + R_1_i * ((reads[,(l + 1)] / R_1_i) - (rowSums(reads[,(l + 1)]) / R_1))^2)
MSI <- SSI / (R_1 - C_1)
MSP <- SSP / (C_1 - C_1.star)
}
F_ST <- (MSP - MSI) / (MSP + (n_c - 1) * MSI)
F_ST_multilocus <- sum(MSP - MSI) / sum(MSP + (n_c - 1) * MSI)
rslt <- list(F_ST = F_ST,F_ST_multilocus = F_ST_multilocus)
return(rslt)
}
#script from the manual
png(paste(output_coefficient, ".png", sep=""))
plot.delta(file = delta)
dev.off()
png(paste(output_kld, ".png", sep=""))
plot.kld(file = delta)
dev.off()
png(paste(output_kld, ".png", sep=""))
plot.kld(file = delta, calibration_file = calibration_delta, limit = 0.01)
rslt <- read.table(delta, header = TRUE)
top.snp <- which(rslt$KLD == max(rslt$KLD))
top.snp
#dev.off()
#png(paste(output_kld, ".png", sep=""))
#plot.kld(file = delta, calibration_file = calibration_delta, limit = 0.01)
#abline(v = 4.867859,lty = 2)
rslt$mean[top.snp]
sigma <- read.table(sigma, header = TRUE)
sigma$mean[which(sigma$locus == top.snp)]
dev.off()
66 changes: 66 additions & 0 deletions automatedbayescanplots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#automatedbayescanplots: plots the FST of each loci and the density of loci with a given selection coefficient.
#based on the file.g_fst and the file.g.sel outpputed by the selection detection program BayeScan.
#Usage:
# BayeScan_plots path to [plot_R.r] [file.g_fst] [g.sel] output_namefst output_namedensity
#Variable to allow using only the files as inputs
plot_bayescan <- function(res, FDR = 0.05, size = 1, pos = 0.35, highlight = NULL, name_highlighted = F, add_text = T)
{
if (is.character(res))
res=read.table(res)

colfstat=4
colq=colfstat-1

highlight_rows=which(is.element(as.numeric(row.names(res)),highlight))
non_highlight_rows=setdiff(1:nrow(res),highlight_rows)

outliers=as.integer(row.names(res[res[,colq]<=FDR,]))

ok_outliers=TRUE
if (sum(res[,colq]<=FDR)==0)
ok_outliers=FALSE;

res[res[,colq]<=0.0001,colq]=0.0001

# plot
plot(log10(res[,colq]),res[,colfstat],xlim=rev(range(log10(res[,colq]))),xlab="log10(q value)",ylab=names(res[colfstat]),type="n")
points(log10(res[non_highlight_rows,colq]),res[non_highlight_rows,colfstat],pch=19,cex=size)

if (name_highlighted) {
if (length(highlight_rows)>0) {
text(log10(res[highlight_rows,colq]),res[highlight_rows,colfstat],row.names(res[highlight_rows,]),col="red",cex=size*1.2,font=2)
}
}
else {
points(log10(res[highlight_rows,colq]),res[highlight_rows,colfstat],col="red",pch=19,cex=size)
# add names of loci over p and vertical line
if (ok_outliers & add_text) {
text(log10(res[res[,colq]<=FDR,][,colq])+pos*(round(runif(nrow(res[res[,colq]<=FDR,]),1,2))*2-3),res[res[,colq]<=FDR,][,colfstat],row.names(res[res[,colq]<=FDR,]),cex=size)
}
}
lines(c(log10(FDR),log10(FDR)),c(-1,1),lwd=2)

return(list("outliers"=outliers,"nb_outliers"=length(outliers)))
}
args <- commandArgs(trailingOnly = TRUE)
# Get arguments
#sourcefile <- args[1]
g_fst.txt <- args[1]
g.sel <- args[2]
output_namefst <- args[3]
output_namedensity <- args[4]
#Source plot_R.r
#source(sourcefile::plot_bayescan(res, FDR = 0.05, size = 1, pos = 0.35, highlight = NULL, name_highlighted = F, add_text = T))
#Plot FST
png(paste(output_namefst, ".png", sep=""))
plot_bayescan(g_fst.txt,0,FDR=0.05)
dev.off()
mydata=read.table(g.sel,colClasses="numeric")
parameter="Fst1"
#Plot density of loci with a given selection coefficient.
png(paste(output_namedensity, ".png", sep=""))
plot(density(mydata[[parameter]]), xlab=parameter,
main=paste(parameter,"posterior distribution"))
dev.off()
#install.packages("boa")
#boa.hpd(mydata[[parameter]],0.05)
Loading