#' Identity disequilibrium (g2) for different subsets of markers
#' 
#' Repeatedly draws random subsets of loci (without replacement) from the
#' genotype matrix and computes g2 for every draw, so that the precision of
#' the g2 estimate can be compared across different numbers of markers.
#' 
#' @param genotypes data.frame with individuals in rows and loci in columns,
#'        containing genotypes coded as 0 (homozygote), 1 (heterozygote) and NA (missing)
#' @param subsets a vector specifying the sizes of subsets to draw. For a dataset of 20 markers, subsets = c(2, 5, 10, 15, 20) could
#'        be a reasonable choice. The minimum subset size is 2 and the maximum is the number of markers in the data.
#'        The sizes are processed in increasing order. If NULL (default) or empty, only the g2 of the full data is computed.
#' @param nboot number of re-draws per subset size. If 0 or negative, only the g2 of the full data is computed.
#' @param type specifies g2 formula to take. Type "snps" for large datasets and "msats" for smaller datasets.
#'        "msats" (the default) uses \code{g2_microsats}, "snps" uses \code{g2_snps}.
#' @param parallel Default is FALSE. If TRUE, resamplings are parallelized.
#' @param ncores Specify number of cores to use for parallelization. By default,
#'        one core less than reported by \code{parallel::detectCores()} is used (but at least one)
#'        and a warning is issued.
#'        
#' @return 
#' A list of class "inbreed" with the elements:
#' \item{call}{function call.}
#' \item{g2_full}{g2 estimate for the full marker set}
#' \item{all_g2_res}{data.frame with one row per re-draw: the g2 value (\code{g2}) and the
#'       number of loci drawn (\code{nloc}, a factor). NA if no resampling was done.}
#' \item{summary_all_g2}{data.frame with g2 mean (\code{Mean}) and sd (\code{SD}) for each number
#'       of subsetted loci (\code{nloc}). NA if no resampling was done.}
#' \item{nobs}{number of observations}
#' \item{nloc}{number of markers}
#' 
#' @references
#' Hoffman, J.I., Simpson, F., David, P., Rijks, J.M., Kuiken, T., Thorne, M.A.S., Lacey, R.C. & Dasmahapatra, K.K. (2014) High-throughput sequencing reveals inbreeding depression in a natural population.
#' Proceedings of the National Academy of Sciences of the United States of America, 111: 3775-3780. Doi: 10.1073/pnas.1318945111
#'
#' 
#' @author Martin A. Stoffel (martin.adam.stoffel@@gmail.com) 
#'        
#' @examples
#' data(mouse_msats)
#' genotypes <- convert_raw(mouse_msats)
#' (out <- resample_g2(genotypes, subsets = c(2,4,6,8,10,12), nboot = 1000, type = "msats"))
#' plot(out)
#' @export

resample_g2 <- function(genotypes, subsets = NULL, nboot = 100, type = c("msats", "snps"), 
                        parallel = FALSE, ncores = NULL) {
    
    fun_call <- match.call()
    genotypes <- as.matrix(genotypes)
    n_obs <- nrow(genotypes)
    n_loc <- ncol(genotypes)
    
    # check for subset sequence (NULL / empty means "full data set only")
    if (length(subsets) > 0) {
        if (!is.numeric(subsets) || anyNA(subsets)) {
            stop("subsets has to be a numeric vector without missing values")
        }
        if (any(subsets < 2)) stop("You cannot subset less than 2 markers")
        if (any(subsets > n_loc)) stop("The number of subsetted markers cannot exceed the overall number of markers")
        if (any(subsets %% 1 != 0)) stop("All subsets have to be specified by integers")
    }
    
    if (!is.numeric(nboot) || length(nboot) != 1 || is.na(nboot)) {
        stop("nboot has to be a single number")
    }
    
    # check g2 function argument; the untouched default resolves to "msats"
    type <- tryCatch(match.arg(type, c("msats", "snps")),
                     error = function(e) stop("type argument needs to be msats or snps", call. = FALSE))
    
    # assign g2 function
    g2_fun <- switch(type, msats = g2_microsats, snps = g2_snps)
    
    # single place where the returned object is assembled
    build_result <- function(all_g2_res, summary_all_g2) {
        res <- list(call = fun_call,
                    g2_full = g2_full,
                    all_g2_res = all_g2_res,
                    summary_all_g2 = summary_all_g2,
                    nobs = n_obs, 
                    nloc = n_loc)
        class(res) <- "inbreed"
        res
    }
    
    # full data set
    g2_full <- g2_fun(genotypes)[["g2"]]
    
    # case nboot = 0 or nothing to subset
    if (nboot <= 0 || length(subsets) == 0) {
        return(build_result(NA, NA))
    }
    
    # sorting
    subsets <- sort(subsets)
    
    # initialise: one column per subset size, one row per re-draw
    all_g2 <- matrix(data = NA, nrow = nboot, ncol = length(subsets))
    
    # subsampling and calculating g2; boot_num is only the iteration counter
    # handed over by vapply / parSapply. drop = FALSE keeps the matrix shape
    # even when the data contain a single individual.
    draw_g2 <- function(boot_num, genotypes, num_subsamp) {
        ind <- sample.int(ncol(genotypes), num_subsamp)
        g2_fun(genotypes[, ind, drop = FALSE])[["g2"]]
    }
    
    run_parallel <- isTRUE(as.logical(parallel))
    
    if (run_parallel) {
        if (is.null(ncores)) {
            # detectCores() may return NA or 1, always keep at least one worker
            ncores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
            warning("No core number specified: detectCores() is used to detect the number of \n cores on the local machine")
        }
        # one cluster for all subset sizes, released even if a worker fails
        cl <- parallel::makeCluster(ncores)
        on.exit(parallel::stopCluster(cl), add = TRUE)
    }
    
    for (step_num in seq_along(subsets)) {
        
        cat("\n", "Iterating subset number ", step_num, " from ", length(subsets), sep = "")
        if (step_num == length(subsets)) {
            cat("\n", "Last subset!", sep = "")
        }
        
        num_markers <- subsets[step_num]
        
        all_g2[, step_num] <- if (run_parallel) {
            parallel::parSapply(cl, seq_len(nboot), draw_g2, 
                                genotypes = genotypes, num_subsamp = num_markers)
        } else {
            vapply(seq_len(nboot), draw_g2, numeric(1), 
                   genotypes = genotypes, num_subsamp = num_markers)
        }
    }
    
    # long format: g2 of every re-draw together with the number of markers used
    all_g2_res <- data.frame(g2 = c(all_g2), nloc = factor(rep(subsets, each = nboot)))
    
    # mean and sd per number of loci; as.list() flattens the matrix column
    # returned by aggregate() into two separate columns
    summary_all_g2 <- as.data.frame(as.list(stats::aggregate(g2 ~ nloc, data = all_g2_res, 
                                FUN = function(x) c(mean = mean(x, na.rm = TRUE),
                                                    sd = stats::sd(x, na.rm = TRUE)))))
    names(summary_all_g2) <- c("nloc", "Mean", "SD")
    
    build_result(all_g2_res, summary_all_g2)
}


