% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/synth.R
\name{gsynth.sample}
\alias{gsynth.sample}
\title{Sample a synthetic genome from a trained Markov model}
\usage{
gsynth.sample(
  model,
  output_path = NULL,
  output_format = c("misha", "fasta", "vector"),
  mask_copy = NULL,
  seed = NULL,
  intervals = NULL,
  n_samples = 1,
  bin_merge = NULL
)
}
\arguments{
\item{model}{A gsynth.model object from \code{\link{gsynth.train}}}

\item{output_path}{Path to the output file (ignored when output_format = "vector")}

\item{output_format}{Output format:
\itemize{
  \item "misha": .seq binary format (default)
  \item "fasta": FASTA text format
  \item "vector": Return sequences as a character vector (does not write to file)
}}

\item{mask_copy}{Optional intervals to copy from the original genome instead of
sampling. Use this to preserve specific regions (e.g., repeats, regulatory
elements) exactly as they appear in the reference. Regions not in mask_copy
will be sampled using the Markov model.
Note: mask_copy intervals should be non-overlapping and sorted by start
position within each chromosome. Overlapping intervals may result in
only the first overlapping region being copied, with subsequent overlaps
skipped due to cursor advancement during sequential processing.}

\item{seed}{Random seed for reproducibility. If NULL, uses current random state.}

\item{intervals}{Genomic intervals to sample. If NULL, uses all chromosomes.}

\item{n_samples}{Number of samples to generate per interval. Default is 1.
When n_samples > 1 and output_format = "fasta", headers include "_sampleN".
When output_format = "vector", returns n_samples * n_intervals sequences.}

\item{bin_merge}{Optional list of bin merge specifications to apply during sampling,
       one per dimension (length must equal model$n_dims). Each element should be:
       \itemize{
         \item A list of merge specifications (same format as in
               \code{\link{gsynth.train}}: each spec is \code{list(from = ..., to = ...)})
         \item Or NULL to use the bin mapping from training for that dimension
       }
       This allows merging sparse bins at sampling time without re-training.
       Example for a 2D model:
       \preformatted{
bin_merge = list(
  # Dimension 1: merge bins with values >= 0.8 to bin [0.7, 0.8)
  list(list(from = c(0.8, Inf), to = c(0.7, 0.8))),
  # Dimension 2: use training-time bin_map (no override)
  NULL
)
       }}
}
\value{
When output_format is "misha" or "fasta", returns invisible NULL and
        writes the synthetic genome to output_path. When output_format is "vector",
        returns a character vector of sequences (length = n_intervals * n_samples).
}
\description{
Generates a synthetic genome by sampling from a trained stratified Markov-5
model. The generated genome preserves the k-mer statistics of the original
genome within each stratification bin.
}
\details{
\strong{N bases during sampling:} When the sampler needs to initialize the
first 5-mer context and encounters regions with only N bases, it falls back
to uniform random base selection until a valid context is established.
Similarly, if a bin has no learned statistics (sparse bin with NA CDF),
uniform sampling is used for that position.

\strong{Sparse bins:} If the model has sparse bins (from \code{min_obs} during
training), a warning is issued when sampling regions that fall into these bins.
Consider using \code{bin_merge} to redirect sparse bins to well-populated ones.
}
\examples{
gdb.init_examples()

# Create virtual tracks
gvtrack.create("g_frac", NULL, "kmer.frac", kmer = "G")
gvtrack.create("c_frac", NULL, "kmer.frac", kmer = "C")
gvtrack.create("cg_frac", NULL, "kmer.frac", kmer = "CG")
gvtrack.create("masked_frac", NULL, "masked.frac")

# Define repeat mask (regions to preserve from original)
repeats <- gscreen("masked_frac > 0.5",
    intervals = gintervals.all(),
    iterator = 100
)

# Train model (excluding repeats from training)
model <- gsynth.train(
    list(expr = "g_frac + c_frac", breaks = seq(0, 1, 0.025)),
    list(expr = "cg_frac", breaks = c(0, 0.01, 0.02, 0.03, 0.04, 0.2)),
    mask = repeats,
    iterator = 200,
    min_obs = 1000
)

# Sample with mask_copy to preserve repeats from original genome
temp_dir <- tempdir()
synthetic_genome_file <- file.path(temp_dir, "synthetic_genome.fa")
gsynth.sample(model, synthetic_genome_file,
    output_format = "fasta",
    mask_copy = repeats,
    seed = 60427,
    bin_merge = list(
        list(list(from = 0.7, to = c(0.675, 0.7))),
        list(list(from = 0.04, to = c(0.03, 0.04)))
    )
)

}
\seealso{
\code{\link{gsynth.train}}, \code{\link{gsynth.save}}
}
