import { DataArray } from "../data/DataChunk";
import { ColumnDetect, detectAllColumnTypes } from "../parse/DetectColumnTypes";
import { parseDsvText } from "../parse/ParseDsv";
import { CsvParser, TabularColumn } from "../parse/ParseTabular";
import { promoteTimeLike } from "../parse/PromoteTimeLike";
import { splitTextByLine } from "../parse/SplitTextByLine";
import { histogram, histogramMost } from "../util/Utils";
import { ColumnsConsistency, columnsConsistency } from "./ColumnsConsistency";
import { CsvFormat, defaultCsvFormat } from "./CsvFormat";
import { detectComments } from "./DetectComments";
import { syntheticLabel } from "./LetterIndex";
import { parseCsvLines } from "./PapaParsing";

/** A csv format bundled with a list of tabular columns.
 * NOTE(review): not referenced elsewhere in this file; presumably consumed by other modules.
 */
export interface CsvFormatAndColumns {
  /** the csv format */
  format: CsvFormat;
  /** the columns of the table */
  columns: TabularColumn[];
}

/** Detect the csv format from a buffer of bytes taken from the front of a file.
 * The bytes are decoded as utf8 text and handed to `detectCsvText`.
 *
 * @param buffer bytes from the front of the file
 * @param ignoreLastLine if true the last line will be ignored
 *  (set to true if the buffer may end in the middle of a line)
 * @returns whatever `detectCsvText` returns for the decoded text
 */
export function detectCsv(
  buffer: Uint8Array,
  ignoreLastLine = false
): CsvParser | undefined {
  const decoded = new TextDecoder("utf8").decode(buffer);
  return detectCsvText(decoded, ignoreLastLine);
}

/** Constant added to the column consistency score of every parser built by `detectCsvText`.
 * NOTE(review): presumably biases selection toward csv when scores are compared
 * against other parser kinds; confirm with the callers that compare `score`.
 */
const csvPreferenceBonus = 0.5;

/** Detect the csv format of a text sample and build a parser for it.
 *
 * The sample is split into lines, the best scoring format is chosen, a type is
 * detected for each column, columns are labeled from the header lines, and
 * the sample is parsed once more with the final format so that columns can be
 * passed through `promoteTimeLike`.
 *
 * @param text sample text, typically from the front of a file
 * @param ignoreLastLine forwarded to `splitTextByLine`
 *  (set to true if the text may end in the middle of a line)
 * @returns a csv parser, or undefined if the sample has no non-empty lines
 *  or if column type detection produces no result
 */
export function detectCsvText(
  text: string,
  ignoreLastLine = false
): CsvParser | undefined {
  const { nonEmpty, lineBreak } = splitTextByLine(text, ignoreLastLine);
  if (nonEmpty.length === 0) {
    return undefined;
  }

  const found = bestFormat(nonEmpty);
  const sampleLines = found.lines;

  // detect column types from a parse that uses the provisional format
  const detected = detectAllColumnTypes(parseCsvLines(sampleLines, found.format));
  if (!detected) {
    return undefined;
  }

  const labeled = labelColumns(detected);
  const format = {
    ...found.format,
    headerLines: labeled.headerLines,
    lineBreak,
    columns: [],
  };
  // CONSIDER do we need to reparse here?
  const reparsedRows = parseCsvLines(sampleLines, format);
  const columns = promoteTimeLike(labeled.columns, reparsedRows);

  // same format, but for parsing text that carries no header lines
  const bodyFormat = { ...format, headerLines: 0 };

  return {
    kind: "csv",
    score: found.consistency.score + csvPreferenceBonus,
    format,
    columns,
    parse: (input: string): DataArray[] => parseDsvText(input, { format, columns }),
    parseBody: (input: string): DataArray[] =>
      parseDsvText(input, { format: bodyFormat, columns }),
  };
}

/** The format selected by `bestFormat`, along with the lines to parse with it. */
interface FoundFormat extends FormatScore {
  /** the selected csv format */
  format: CsvFormat;
  /** the sample lines matching the selected format: comment lines are removed
   * only if the format kept the detected comment string */
  lines: string[];
}

/** Find the most plausible csv format for the sample lines.
 *
 * Candidate formats are scored against the lines with detected comments
 * removed. The winner is then re-checked by `commentCheck` against the full
 * set of lines with the comment string disabled.
 *
 * @returns the chosen format, its consistency, and the lines that go with it
 */
function bestFormat(nonEmptyLines: string[]): FoundFormat {
  const comment = detectComments(nonEmptyLines);
  const uncommented = stripComments(nonEmptyLines, comment);
  const candidates = candidateFormats(nonEmptyLines.join("\n"), comment);
  const withComment = mostTabularFormat(uncommented, candidates);
  const chosen = commentCheck(withComment, nonEmptyLines);

  // if commentCheck dropped the comment string, the 'comment' lines count as data
  const keptComment = chosen === withComment;
  return {
    format: chosen.format,
    consistency: chosen.consistency,
    lines: keptComment ? uncommented : nonEmptyLines,
  };
}

/** Check whether the format scores better with its comment string disabled.
 * (this is useful if the first header line starts with a comment character)
 *
 * @param base the best format found with comment lines stripped
 * @param linesWithComments the sample lines, including any comment lines
 * @returns a copy of the format with an empty comment string if that scores
 *  strictly higher and yields more than one column, otherwise `base` itself
 */
function commentCheck(base: FormatScore, linesWithComments: string[]): FormatScore {
  const baseConsistency = base.consistency;

  // a single column base format is never re-checked
  if (baseConsistency.numColumns !== 1) {
    const format = { ...base.format, comment: "" };
    const consistency = columnsConsistency(linesWithComments, format);
    const improved = consistency.score > baseConsistency.score;
    if (improved && consistency.numColumns > 1) {
      return { format, consistency };
    }
  }
  return base;
}

/** candidate quote characters, not including the empty string */
export const nonZeroQuotes = ['"', "'", "`"];
/** candidate quote characters tried by `candidateFormats`.
 * The empty string is always retained by `inText` (every string includes "");
 * presumably it stands for 'no quote character'. */
const quotes = [...nonZeroQuotes, ""];
/** candidate field separators tried by `candidateFormats` */
const separators = [",", "\t", ";", "|"];
/** candidate escape characters tried by `candidateFormats`.
 * As with `quotes`, the empty string is always retained by `inText`. */
const escapes = ["\\", '"', ""];

/** Return all combinations of dsv formats that might apply to the sample text.
 * For efficiency, skip formats that can't apply because they depend on
 * characters not in the text.
 *
 * Formats are produced in escape, quote, separator order (separator varying
 * fastest). If no known separator appears in the text, "," is used.
 *
 * @param text the sample text to search for candidate characters
 * @param comment comment string to set on every candidate format
 */
function candidateFormats(text: string, comment: string): CsvFormat[] {
  const escapeChars = inText(escapes, text);
  const quoteChars = inText(quotes, text);
  const separatorList = minElem(inText(separators, text), ",");

  const formats: CsvFormat[] = [];
  for (const escapeChar of escapeChars) {
    for (const quoteChar of quoteChars) {
      for (const separator of separatorList) {
        const format = { ...defaultCsvFormat, escapeChar, quoteChar, comment, separator };
        formats.push(format);
      }
    }
  }
  return formats;
}

/** debug version for debugging particular formats */
// function candidateFormats(text: string, comment: string): CsvFormat[] {
//   return [
//     { ...defaultCsvFormat },
//     { ...defaultCsvFormat, separator: ";", quoteChar: "'" },
//   ];
// }

/** Guarantee at least one element.
 * @returns `array` itself if it has any elements, otherwise a new array holding only `e`
 */
function minElem<T>(array: T[], e: T): T[] {
  if (array.length) {
    return array;
  }
  return [e];
}

/** @returns the candidates that occur somewhere in the text, in their original order.
 * Note that an empty string candidate always matches.
 */
function inText(candidates: string[], text: string): string[] {
  const found: string[] = [];
  for (const candidate of candidates) {
    if (text.includes(candidate)) {
      found.push(candidate);
    }
  }
  return found;
}

/** A candidate csv format paired with its column consistency measurement. */
interface FormatScore {
  /** the candidate format */
  format: CsvFormat;
  /** result of `columnsConsistency` for a set of sample lines under this format */
  consistency: ColumnsConsistency;
}

/** Score every candidate format against the lines and pick the winner.
 *
 * @param lines sample lines to measure
 * @param formats candidate formats; must not be empty (reducing an empty list throws)
 * @return the format that divides rows into the largest consistent number of columns.
 */
function mostTabularFormat(lines: string[], formats: CsvFormat[]): FormatScore {
  const scored = formats.map((format) => ({
    format,
    consistency: columnsConsistency(lines, format),
  }));
  return scored.reduce((best, next) => bestScoringFormat(best, next));
}

/** Choose the better scoring of two formats.
 * On a tie `a` wins, so when this is used as a reducer the earlier candidate is kept.
 */
function bestScoringFormat(a: FormatScore, b: FormatScore): FormatScore {
  const aScore = a.consistency.score;
  const bScore = b.consistency.score;
  if (aScore >= bScore) {
    return a;
  }
  return b;
}

/** Drop the lines that begin with the comment string.
 * @returns the same `lines` array if the comment string is empty, otherwise a
 *  new array without the lines that start with `comment`
 */
function stripComments(lines: string[], comment: string): string[] {
  const hasComment = comment.length !== 0;
  return hasComment ? lines.filter((line) => !line.startsWith(comment)) : lines;
}

/** Result of `labelColumns`: the labeled columns and the header line count used to label them. */
interface ColumnReport {
  /** one entry per detected column, carrying its label and data parser */
  columns: TabularColumn[];
  /** number of header lines, as estimated by `measureHeader` */
  headerLines: number;
}

/** Find labels for the columns.
 *
 * The header line count is estimated once for the whole table, and each
 * column's label is then chosen using that count.
 *
 * @returns the labeled columns (keeping each detected data parser) and the header line count
 */
export function labelColumns(columnDetects: ColumnDetect[]): ColumnReport {
  const headerLines = measureHeader(columnDetects);
  const columns = columnDetects.map(({ column, dataParser }, index) => ({
    label: chooseLabel(column, index, headerLines),
    dataParser,
  }));
  return { columns, headerLines };
}

/** Return the number of header lines to use for parsing the table.
 * We use the most popular number of header lines estimated among the non string columns.
 *
 * @returns the most popular header line count, or 0 if `histogramMost` finds none
 */
function measureHeader(columnDetects: ColumnDetect[]): number {
  const headerCounts = columnDetects
    .filter((detect) => detect.dataParser.parseType !== "string")
    .map((detect) => detect.headerLines);
  const mostPopular = histogramMost(histogram(headerCounts));
  return mostPopular ? mostPopular[0] : 0;
}

/** Choose the label for one column.
 *
 * The label is the trimmed text of the column's last header cell. A synthetic
 * label derived from the column index is used instead if there are no header
 * lines, if the header cell is blank, or if the column has fewer cells than
 * `headerLines`.
 *
 * @param column the cells of the column, header cells first
 * @param columnIndex position of the column in the table, used for the synthetic label
 * @param headerLines number of header lines in the table
 */
function chooseLabel(column: string[], columnIndex: number, headerLines: number): string {
  // the column may be shorter than the header line count, so don't assume the cell exists
  const headerCell = headerLines > 0 ? column[headerLines - 1] : undefined;
  const columnLabel = headerCell == null ? "" : headerCell.trim();

  return columnLabel !== "" ? columnLabel : syntheticLabel(columnIndex);
}
