% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ancestry.R
\name{run_ancestry_prediction}
\alias{run_ancestry_prediction}
\title{Projecting the study data set onto the PC space of the reference dataset}
\usage{
run_ancestry_prediction(
  indir,
  name,
  qcdir = indir,
  verbose = FALSE,
  path2plink2 = NULL,
  path2load_mat = NULL,
  keep_individuals = NULL,
  remove_individuals = NULL,
  extract_markers = NULL,
  exclude_markers = NULL,
  showPlinkOutput = TRUE
)
}
\arguments{
\item{indir}{[character] /path/to/directory containing the basic PLINK 2.0 data
files name.pgen, name.pvar, name.psam}

\item{name}{[character] Prefix of PLINK 2.0 files, i.e. name.pgen, name.pvar, 
name.psam}

\item{qcdir}{[character] /path/to/directory where name.sscore, as returned by
plink2 --score, will be saved. User needs write permission for qcdir. By
default, qcdir=indir.}

\item{verbose}{[logical] If TRUE, progress info is printed to standard out.}

\item{path2plink2}{[character] Absolute path to PLINK 2.0 executable
(\url{https://www.cog-genomics.org/plink/2.0/}) i.e.
plink 2 should be accessible as path2plink2 -h. The full name of the executable
should be specified: for windows OS, this means path/plink.exe, for unix
platforms this is path/plink. If not provided, assumed that PATH set-up works
and PLINK will be found by \code{\link[sys]{exec}}('plink').}

\item{path2load_mat}{[character] /path/to/directory where loading matrices are
kept. These can be downloaded from
\url{https://github.com/meyer-lab-cshl/plinkQCAncestryData}.
Note that the file name prefix preceding .acount or .eigenvec.allele must be
included in the file path.}

\item{keep_individuals}{[character] Path to file with individuals to be
retained in the analysis. The file has to be a space/tab-delimited text file
with family IDs in the first column and within-family IDs in the second
column. All samples not listed in this file will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
Default: NULL, i.e. no filtering on individuals.}

\item{remove_individuals}{[character] Path to file with individuals to be
removed from the analysis. The file has to be a space/tab-delimited text file
with family IDs in the first column and within-family IDs in the second
column. All samples listed in this file will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
Default: NULL, i.e. no filtering on individuals.}

\item{extract_markers}{[character] Path to file with markers to be
included in the analysis. The file has to be a text file with a list of
variant IDs (usually one per line, but it's okay for them to just be
separated by spaces). All unlisted variants will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
Default: NULL, i.e. no filtering on markers.}

\item{exclude_markers}{[character] Path to file with markers to be
removed from the analysis. The file has to be a text file with a list of
variant IDs (usually one per line, but it's okay for them to just be
separated by spaces). All listed variants will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
Default: NULL, i.e. no filtering on markers.}

\item{showPlinkOutput}{[logical] If TRUE, plink log and error messages are
printed to standard out.}
}
\value{
A .sscore file with the input data projected onto the reference data PCs
}
\description{
Projects the study dataset onto the PC space of the reference dataset.
The output of this function is used as input to a random forest classifier to
predict the genomic ancestry of the samples. Genomic data version hg38 with
variant identifiers in the format of 1:12345[hg38] is needed for ancestry
identification to work.
}
\examples{
# Input data shipped with the package; results are written to a temp directory.
indir <- system.file("extdata", package="plinkQC")
qcdir <- tempdir()
name <- "data.hg38"
# Placeholder paths: replace with the locations on your system.
path2plink2 <- '/path/to/plink'
path2load_mat <- '/path/to/load_mat/merged_chrs.postQC.train.pca'
\dontrun{
# the following code is not run on package build, as the path2plink2 on the
# user system is not known.
run_ancestry_prediction(indir=indir, qcdir=qcdir, name=name,
path2plink2 = path2plink2, path2load_mat = path2load_mat)
}
}
