# Convert a VCF into a `mipanalyzer_data` object.
#
# Exactly one of `file` or `vcfR` must be supplied.
#
# Arguments:
#   file    Path to a VCF file, read with vcfR::read.vcfR().
#   vcfR    An already-loaded object of class "vcfR".
#   verbose Passed to vcfR::read.vcfR() when reading from `file`.
#
# Returns a list of class "mipanalyzer_data" with elements:
#   coverage  the numeric "DP" matrix returned by vcfR::extract.gt()
#   counts    a list with one element per row (locus) of the "AD" matrix; each
#             element is a numeric matrix with one row per comma-separated AD
#             value and one column per sample
#   samples   column names of vcf@gt with the first column removed
#   loci      the complete vcf@fix matrix
#
# Stops with an error if neither or both inputs are given, if `vcfR` is not of
# class "vcfR", or if `file` does not exist.
vcf2mipanalyzer_data <- function(file = NULL, vcfR = NULL, verbose = TRUE) {
  #.....................
  # Read and check input
  #.....................
  has_file <- !is.null(file)
  has_vcfr <- !is.null(vcfR)

  if (!has_file && !has_vcfr) {
    stop("You must specify an input: either a raw vcf file path or a vcfR object")
  }
  if (has_file && has_vcfr) {
    stop("You must specify one input: either a raw vcf file path or a vcfR object")
  }

  if (has_vcfr) { # user specified a vcfR object
    if (!inherits(vcfR, "vcfR")) {
      stop("vcfR object must be of class vcfR")
    }
    vcf <- vcfR
  } else { # user specified a file
    if (!file.exists(file)) {
      stop("The vcf does not appear to exist at that file path")
    }
    vcf <- vcfR::read.vcfR(file = file, verbose = verbose) # read vcf
  }

  #..........................
  # find features and convert
  #..........................
  # AD is extracted as character ("ref,alt1,...") because the number of values
  # varies by locus, so it cannot be pulled out as a single numeric matrix.
  countmat <- vcfR::extract.gt(vcf, element = "AD")

  # lapply() (not sapply()) so the result is always a list with one element
  # per locus; sapply() would collapse the result into a single matrix whenever
  # every locus happens to have the same number of AD values.
  # seq_len() keeps this safe when the VCF has zero loci.
  counts <- lapply(seq_len(nrow(countmat)), function(i) {
    split_ad <- t(stringr::str_split(countmat[i, ], ",", simplify = TRUE))
    # Rebuild explicitly as a matrix so the alleles x samples shape survives
    # even when a locus has a single AD value or there is a single sample.
    matrix(as.numeric(split_ad), nrow = nrow(split_ad), ncol = ncol(split_ad))
  })
  # ALT count per locus can be inspected with:
  # d <- sapply(vcf@fix[,5], function(x){ return(length(unlist(stringr::str_split(x, ","))))})

  coverage <- vcfR::extract.gt(vcf, element = "DP", as.numeric = TRUE)

  # create data
  ret <- list(
    coverage = coverage,
    counts = counts,
    # drop the first gt column; negative indexing also behaves when there are
    # no sample columns (2:ncol() would count backwards and yield NA names)
    samples = colnames(vcf@gt)[-1],
    loci = vcf@fix # keep all the info here not just loci pos?
  )

  # return in mipanalyzer_data class
  class(ret) <- "mipanalyzer_data"
  ret
}