#include <errno.h>
#include <set>
#include <string>
#include <vector>
#include <unordered_map>

#include "HashFunc.h"
#include "BufferedFile.h"
#include "rdbinterval.h"
#include "rdbutils.h"
#include "strutil.h"

using namespace std;
using namespace rdb;

extern "C" {

/**
 * Reads a chain file and converts it into chain intervals that are returned to R.
 *
 * File layout enforced by the parser below (fields are whitespace separated):
 *   - header line with exactly NUM_FIELDS (13) fields:
 *       chain <score> <chrom1> <size1> <strand1> <start1> <end1>
 *             <chrom2> <size2> <strand2> <start2> <end2> <id>
 *   - block lines with three fields: <size> <dt> <dq>
 *   - a closing block line with a single field: <size>
 *   - lines whose first field starts with '#' are ignored.
 *
 * The first chromosome group (index SRC, called "reference" in the error messages) is
 * tracked with ids local to this function (chrom2id / id2chrom / chrom_sizes). The second
 * group (index TGT, called "query" in the error messages) is resolved against the chrom key
 * of the IntervUtils built from _envir; chains whose second chromosome is unknown there are
 * skipped rather than rejected.
 *
 * @param _chainfile           character(1): path of the chain file.
 * @param _src_overlap_policy  character(1): forwarded to ChainIntervals::handle_src_overlaps.
 * @param _tgt_overlap_policy  character(1): "best_source_cluster", "best_cluster_union",
 *                             "best_cluster_sum" and "best_cluster_max" are loaded as "keep";
 *                             "auto" is loaded as "auto_score"; anything else is forwarded
 *                             unchanged to ChainIntervals::handle_tgt_overlaps.
 * @param _min_score           NULL or numeric(1): when non-negative, chains whose score is
 *                             below this value are skipped.
 * @param _envir               environment handed to IntervUtils.
 * @return R_NilValue when no chain interval remains, otherwise the value produced by
 *         IntervUtils::convert_chain_intervs.
 *
 * TGLException and std::bad_alloc are caught and reported through rerror().
 *
 * NOTE(review): the messages format 64-bit values with "%ld", which assumes `long` is
 * 64 bits wide -- confirm if LLP64 targets are ever supported.
 */
SEXP gchain2interv(SEXP _chainfile, SEXP _src_overlap_policy, SEXP _tgt_overlap_policy, SEXP _min_score, SEXP _envir)
{
	try {
		RdbInitializer rdb_init;

		// ---- argument validation ----
		if (!Rf_isString(_chainfile) || Rf_length(_chainfile) != 1)
			verror("Chain file argument is not a string");

		if (!Rf_isString(_src_overlap_policy) || Rf_length(_src_overlap_policy) != 1)
			verror("Source overlap policy argument is not a string");

		if (!Rf_isString(_tgt_overlap_policy) || Rf_length(_tgt_overlap_policy) != 1)
			verror("Target overlap policy argument is not a string");

		// min_score is optional; a negative value means "no filtering"
		double min_score = -1.0;
		if (!Rf_isNull(_min_score)) {
			if (!Rf_isReal(_min_score) || Rf_length(_min_score) != 1)
				verror("min_score must be a single numeric value");
			min_score = REAL(_min_score)[0];
		}

		IntervUtils iu(_envir);
		const char *src_overlap_policy = CHAR(STRING_ELT(_src_overlap_policy, 0));
		const char *tgt_overlap_policy = CHAR(STRING_ELT(_tgt_overlap_policy, 0));

		// Translate the requested target policy into the policy applied at load time.
		std::string effective_tgt_policy = tgt_overlap_policy;
		if (!strcmp(tgt_overlap_policy, "best_source_cluster") ||
		    !strcmp(tgt_overlap_policy, "best_cluster_union") ||
		    !strcmp(tgt_overlap_policy, "best_cluster_sum") ||
		    !strcmp(tgt_overlap_policy, "best_cluster_max")) {
			effective_tgt_policy = "keep"; // Load ALL chains to resolve later in liftover
		} else if (!strcmp(tgt_overlap_policy, "auto")) {
			effective_tgt_policy = "auto_score";
		}

		const char *chainfname = CHAR(STRING_ELT(_chainfile, 0));
		BufferedFile chainfile;

		if (chainfile.open(chainfname, "r"))
			TGLError("Error opening chain file %s: %s\n", chainfname, strerror(errno));

		// Field positions within a header line, within a block line, and the two genome slots.
		enum { CHAIN, SCORE, CHROM1, CHROM_SIZE1, STRAND1, START1, END1, CHROM2, CHROM_SIZE2, STRAND2, START2, END2, ID, NUM_FIELDS };
		enum { SIZE, DT, DQ };
		enum { SRC, TGT };

		ChainIntervals chain_intervs;
		std::vector<std::string> fields(NUM_FIELDS);
		std::unordered_map<std::string, int> chrom2id; // used only for source chroms
		std::vector<std::string> id2chrom;             // used only for source chroms
		std::vector<int64_t> chrom_sizes;              // used only for source chroms

		// State of the chain currently being read; start[] advances as blocks are consumed.
		int chrom[2] = { -1, -1 };
		int64_t start[2] = { -1, -1 };
		int64_t end[2] = { -1, -1 };
		int strand[2] = { -1, -1 };
		double chain_score = 0.0;
		int64_t current_chain_id = 0;
		int64_t lineno = 0;
		char *endptr;
		int64_t num;
		bool skip_current_chain = false;  // set when the chain score is below min_score

		while (1) {
			lineno += split_line_by_space_chars(chainfile, fields, NUM_FIELDS);

			// Skip comment lines (starting with #)
			if (!fields.empty() && !fields[0].empty() && fields[0][0] == '#')
				continue;

			if (fields.size() == NUM_FIELDS) {
				// ---- chain header ----

				// The previous chain (if it was not skipped for an unknown target chromosome)
				// must have consumed exactly its declared span in both genomes.
				if (chrom[TGT] >= 0) {
					if (start[SRC] != end[SRC])
						TGLError("Chain file %s, line %ld: new chain is defined before the previous one finished to map reference sequence", chainfname, lineno);

					if (start[TGT] != end[TGT])
						TGLError("Chain file %s, line %ld: new chain is defined before the previous one finished to map query sequence", chainfname, lineno);
				}

				// CHAIN
				if (fields[CHAIN] != "chain")
					TGLError("Chain file %s, line %ld: invalid file format", chainfname, lineno);

				// SCORE
				chain_score = strtod(fields[SCORE].c_str(), &endptr);
				if (*endptr)
					TGLError("Chain file %s, line %ld: invalid chain score", chainfname, lineno);

				// Check min_score filter
				skip_current_chain = (min_score >= 0 && chain_score < min_score);

				// CHROM1: assign a new local id the first time a source chromosome is seen
				std::unordered_map<std::string, int>::const_iterator ichrom2id = chrom2id.find(fields[CHROM1]);
				if (ichrom2id == chrom2id.end()) {
					chrom[SRC] = id2chrom.size();
					chrom2id[fields[CHROM1]] = chrom[SRC];
					id2chrom.push_back(fields[CHROM1]);
					chrom_sizes.push_back(0); // 0 marks "size not recorded yet"
				} else
					chrom[SRC] = ichrom2id->second;

				// CHROM_SIZE1: must be positive and identical across all chains of this chromosome
				num = strtoll(fields[CHROM_SIZE1].c_str(), &endptr, 10);
				if (*endptr || num <= 0)
					TGLError("Chain file %s, line %ld: invalid format of reference chrom size", chainfname, lineno);
				if (!chrom_sizes[chrom[SRC]])
					chrom_sizes[chrom[SRC]] = num;
				else if (chrom_sizes[chrom[SRC]] != num)
					TGLError("Chain file %s, line %ld: reference chrom size (%ld) differs from previous value (%ld)",
							chainfname, lineno, num, chrom_sizes[chrom[SRC]]);

				// STRAND1
				if (fields[STRAND1] == "+")
					strand[SRC] = 0;
				else if (fields[STRAND1] == "-")
					strand[SRC] = 1;
				else
					TGLError("Chain file %s, line %ld: invalid format of reference strand", chainfname, lineno);

				// START1
				num = strtoll(fields[START1].c_str(), &endptr, 10);
				if (*endptr || num < 0)
					TGLError("Chain file %s, line %ld: invalid value of reference start coordinate", chainfname, lineno);
				if (num >= chrom_sizes[chrom[SRC]])
					TGLError("Chain file %s, line %ld: reference start coordinate exceeds chromosome size", chainfname, lineno);
				start[SRC] = num;

				// END1
				num = strtoll(fields[END1].c_str(), &endptr, 10);
				if (*endptr)
					TGLError("Chain file %s, line %ld: invalid value of reference end coordinate", chainfname, lineno);
				if (num <= start[SRC])
					TGLError("Chain file %s, line %ld: reference end coordinate is less or equal than the start coordinate", chainfname, lineno);
				if (num > chrom_sizes[chrom[SRC]])
					TGLError("Chain file %s, line %ld: reference end coordinate exceeds chromosome size", chainfname, lineno);
				end[SRC] = num;

				// CHROM2
				try {
					chrom[TGT] = iu.get_chromkey().chrom2id(fields[CHROM2]);
				} catch (TGLException &) { // target chromosome might not exist (random chroms, etc.) => skip the mapping
					chrom[TGT] = -1;
					continue;
				}

				// CHROM_SIZE2: must agree with the size known to the chrom key
				num = strtoll(fields[CHROM_SIZE2].c_str(), &endptr, 10);
				if (*endptr)
					TGLError("Chain file %s, line %ld: invalid format of query chrom size", chainfname, lineno);
				if ((int64_t)iu.get_chromkey().get_chrom_size(chrom[TGT]) != num)
					TGLError("Chain file %s, line %ld: query chrom size (%ld) differs from what have been defined in the database (%ld)",
							chainfname, lineno, num, iu.get_chromkey().get_chrom_size(chrom[TGT]));

				// STRAND2
				if (fields[STRAND2] == "+")
					strand[TGT] = 0;
				else if (fields[STRAND2] == "-")
					strand[TGT] = 1;
				else
					TGLError("Chain file %s, line %ld: invalid format of query strand", chainfname, lineno);

				// START2
				num = strtoll(fields[START2].c_str(), &endptr, 10);
				if (*endptr || num < 0)
					TGLError("Chain file %s, line %ld: invalid value of query start coordinate", chainfname, lineno);
				if (num >= (int64_t)iu.get_chromkey().get_chrom_size(chrom[TGT]))
					TGLError("Chain file %s, line %ld: query start coordinate exceeds chromosome size", chainfname, lineno);
				start[TGT] = num;

				// END2 (messages refer to the query genome, matching the field being checked)
				num = strtoll(fields[END2].c_str(), &endptr, 10);
				if (*endptr)
					TGLError("Chain file %s, line %ld: invalid value of query end coordinate", chainfname, lineno);
				if (num <= start[TGT])
					TGLError("Chain file %s, line %ld: query end coordinate is less or equal than the start coordinate", chainfname, lineno);
				if (num > (int64_t)iu.get_chromkey().get_chrom_size(chrom[TGT]))
					TGLError("Chain file %s, line %ld: query end coordinate exceeds chromosome size", chainfname, lineno);
				end[TGT] = num;

				// ID (chain identifier)
				current_chain_id = strtoll(fields[ID].c_str(), &endptr, 10);
				if (*endptr)
					TGLError("Chain file %s, line %ld: invalid chain ID", chainfname, lineno);
			} else if (fields.size() == 3 || fields.size() == 1) {
				// ---- alignment block ----

				// A block line is only legal after some header has been read.
				if (chrom[SRC] < 0)
					TGLError("Chain file %s, line %ld: invalid file format", chainfname, lineno);

				const bool has_gaps = fields.size() == 3;

				// Skip chains that don't meet min_score or have invalid target chromosome
				if (chrom[TGT] < 0 || skip_current_chain) {
					// Positions are still advanced so that the completeness check performed at
					// the next header keeps working; values are deliberately not validated here.
					const int64_t skipped_size = strtoll(fields[SIZE].c_str(), &endptr, 10);
					int64_t skipped_dt = 0;
					int64_t skipped_dq = 0;
					if (has_gaps) {
						skipped_dt = strtoll(fields[DT].c_str(), &endptr, 10);
						skipped_dq = strtoll(fields[DQ].c_str(), &endptr, 10);
					}
					start[SRC] += skipped_size + skipped_dt;
					start[TGT] += skipped_size + skipped_dq;
					continue;
				}

				const int64_t size = strtoll(fields[SIZE].c_str(), &endptr, 10);
				if (*endptr || size <= 0)
					TGLError("Chain file %s, line %ld: invalid size", chainfname, lineno);
				if (start[SRC] + size > end[SRC])
					TGLError("Chain file %s, line %ld: block exceeds chain size of the reference genome", chainfname, lineno);
				if (start[TGT] + size > end[TGT])
					TGLError("Chain file %s, line %ld: block exceeds chain size of the query genome", chainfname, lineno);

				// For a '-' strand the running position is subtracted from the chromosome size
				// before the block is stored; for '+' it is stored as is.
				chain_intervs.push_back(ChainInterval(
					chrom[TGT],
					strand[TGT] ? iu.get_chromkey().get_chrom_size(chrom[TGT]) - start[TGT] - size : start[TGT],
					strand[TGT] ? iu.get_chromkey().get_chrom_size(chrom[TGT]) - start[TGT] : start[TGT] + size,
					strand[TGT],
					chrom[SRC],
					strand[SRC] ? chrom_sizes[chrom[SRC]] - start[SRC] - size : start[SRC],
					strand[SRC]));
				chain_intervs.back().score = chain_score;
				chain_intervs.back().chain_id = current_chain_id;

				if (has_gaps) {
					const int64_t dt = strtoll(fields[DT].c_str(), &endptr, 10);
					const bool dt_malformed = *endptr != '\0';
					const int64_t dq = strtoll(fields[DQ].c_str(), &endptr, 10);
					const bool dq_malformed = *endptr != '\0';

					// Non-numeric gaps would otherwise be read as 0 and silently shift every
					// following block. dt=dq=0 is allowed: it represents contiguous blocks with
					// no gap in either genome.
					if (dt_malformed || dq_malformed || dt < 0 || dq < 0)
						TGLError("Chain file %s, line %ld: invalid block gaps", chainfname, lineno);

					start[SRC] += size + dt;
					start[TGT] += size + dq;
				} else {
					// Single-field line closes the chain: both spans must be fully consumed.
					start[SRC] += size;
					start[TGT] += size;

					if (start[SRC] != end[SRC])
						TGLError("Chain file %s, line %ld: reference chain was not fully mapped", chainfname, lineno);

					if (start[TGT] != end[TGT])
						TGLError("Chain file %s, line %ld: query chain was not fully mapped", chainfname, lineno);
				}
			} else if (fields.empty()) {
				// No fields: stop reading (after checking for a read error when possible).
#ifdef error  // R redefines "error" which interferes with BufferedFile::error function
    #define tmp_error error
    #undef error
				if (chainfile.error())
					TGLError("Reading chain file %s: %s", chainfname, strerror(errno));
    #define error tmp_error
#endif
				break;
			} else
				TGLError("Chain file %s, line %ld: invalid file format", chainfname, lineno);
		}

		if (chain_intervs.empty())
			return R_NilValue;

		// Handle source overlaps
		chain_intervs.sort_by_src();
		chain_intervs.handle_src_overlaps(src_overlap_policy, iu.get_chromkey(), id2chrom);

		// Handle target overlaps
		chain_intervs.sort_by_tgt();
		chain_intervs.handle_tgt_overlaps(effective_tgt_policy, iu.get_chromkey(), id2chrom);
		chain_intervs.set_tgt_overlap_policy(effective_tgt_policy);

		// Overlap handling may have left nothing to return.
		if (chain_intervs.empty())
			return R_NilValue;

		return iu.convert_chain_intervs(chain_intervs, id2chrom);
	} catch (TGLException &e) {
		rerror("%s", e.msg());
	} catch (const std::bad_alloc &e) {
		rerror("Out of memory");
	}
	return R_NilValue;
}

}
