import _ from "lodash";
import { parseDebug } from "../parse/ParseDebug";
import { dlog } from "../util/DebugLog";
import { histogram, histogramMost } from "../util/Utils";
import { CsvFormat } from "./CsvFormat";
import { nonZeroQuotes } from "./DetectCsvFormat";
import { parseCsv } from "./PapaParsing";

/** Result of scoring one candidate format with columnsConsistency. */
export interface ColumnsConsistency {
  /** consistency score with the quote penalties already subtracted; higher means a more plausible format */
  score: number;
  /** the per-row column count that columnsConsistency found to be the most common */
  numColumns: number;
}
/**
 * Score how consistently `format` parses `lines` into the same number of columns,
 * with a preference for formats that produce more columns.
 *
 * The score starts from `bestNumCols * (mostCount - 1) / mostCount`, where
 * `bestNumCols` is the most common per-row column count (as reported by
 * histogramMost) and `mostCount` is the number of rows with that count.
 * . the score is divided by the square root of the number of distinct column counts
 * . the score is adjusted downward if it seems like we're mishandling quotes
 *
 * @param lines - raw lines to evaluate; they are joined with "\n" before parsing
 * @param format - candidate format to evaluate
 * @returns the score and the most common column count, or
 *   `{ score: 0, numColumns: 0 }` when there are no lines or no parsed rows to count
 */
export function columnsConsistency(
  lines: string[],
  format: CsvFormat
): ColumnsConsistency {
  if (lines.length === 0) {
    // no input: there is nothing to parse or score
    return { score: 0, numColumns: 0 };
  }

  const allLines = lines.join("\n");
  const parsed = parseCsv(allLines, format);
  const colCounts = parsed.map((row) => row.length);
  const byCount = histogram(colCounts); // number of rows parsed to include n columns

  const uniqueCounts = byCount.length;
  const mostFrequent = histogramMost(byCount);
  if (mostFrequent == null) {
    // no column counts were collected, so there is no best count to score
    return { score: 0, numColumns: 0 };
  }
  const [bestNumCols, mostCount] = mostFrequent;

  // approaches 1 as more rows agree; a single agreeing row scores 0
  const frequencyScale = (mostCount - 1) / mostCount;
  const frequencyColScore = bestNumCols * frequencyScale;
  // many different column counts suggest an inconsistent parse
  const uniquedScore = frequencyColScore / uniqueCounts ** 0.5;

  const misQuote = missedQuotePenalty(parsed, nonZeroQuotes, bestNumCols);
  const unQuote = unQuotePenalty(format.quoteChar, lines, bestNumCols);

  const score = uniquedScore - misQuote - unQuote;

  if (parseDebug.columnsConsistencyLog) {
    dlog({
      format,
      score,
      frequencyColScore,
      uniquedScore,
      bestNumCols,
      mostCount,
      uniqueCounts,
      rows: parsed.length,
      unQuote,
      misQuote,
    });
  }
  return { score, numColumns: bestNumCols };
}

/**
 * Penalize a format whose quote character appears on only a small share of the lines.
 *
 * When at least a quarter of the lines contain `quote` there is no penalty.
 * Below that, the penalty is `(fraction of lines containing quote) * meanColumns`,
 * so a quote character that never appears costs nothing.
 *
 * @param quote - quote character of the candidate format
 * @param lines - raw (unparsed) lines to search
 * @param meanColumns - column count used to scale the penalty
 * @returns the amount to subtract from the score; 0 when `lines` is empty
 */
function unQuotePenalty(quote: string, lines: string[], meanColumns: number): number {
  // share of lines that must contain the quote for the format to go unpenalized
  const minQuotedFraction = 0.25;

  if (lines.length === 0) {
    // avoid 0 / 0, which would produce a NaN penalty
    return 0;
  }

  const quotedLines = lines.filter((line) => line.includes(quote)).length;
  const quotedFraction = quotedLines / lines.length;

  // enough lines use the quote: ok. otherwise penalize proportionally
  return quotedFraction >= minQuotedFraction ? 0 : quotedFraction * meanColumns;
}

// Tuning for missedQuotePenalty.
// A fraction (0.1 = 10%), not a percentage: roughly this share of rows must contain
// a possibly misquoted value before any penalty is applied.
const penaltyPercentMin = 0.1;
// Score deducted for each penalized column.
const penaltyPerColumn = 0.3;

/**
 * Penalize a parse whose values still begin or end with a typical quote character,
 * which suggests the format is not handling quotes correctly.
 *
 * For each candidate quote, count the trimmed values flagged by possibleMisquote
 * and keep the largest count. No penalty applies unless that count exceeds
 * `penaltyPercentMin * (number of values) / meanColumns`.
 *
 * @param rows - parsed rows of values
 * @param quotes - quote characters to look for
 * @param meanColumns - column count used to scale the threshold and the penalty
 * @returns the amount to subtract from the score; 0 when there are no values
 */
function missedQuotePenalty(
  rows: string[][],
  quotes: string[],
  meanColumns: number
): number {
  const cells = rows.flatMap((row) => row.map((cell) => cell.trim()));
  if (cells.length === 0) {
    return 0;
  }

  // per quote character: how many values start or end with it
  const suspectsPerQuote = quotes.map((quote) => {
    const flags = cells.map((cell) => possibleMisquote(cell, quote));
    return _.sum(flags);
  });
  const worstCount = _.max(suspectsPerQuote) || 0;

  // cells.length / meanColumns approximates the number of rows
  const threshold = (penaltyPercentMin * cells.length) / meanColumns;

  if (worstCount > threshold) {
    // NOTE(review): Math.max makes meanColumns a floor here, not a cap; confirm intended
    const overThreshold = worstCount / threshold;
    return Math.max(overThreshold, meanColumns) * penaltyPerColumn;
  }
  return 0;
}

/**
 * Report whether a value looks like it still carries a quote character at either edge.
 *
 * Whitespace around the value is ignored for both the start and the end check.
 *
 * @param value - a parsed cell value
 * @param separator - the character to look for; despite the name, the caller in
 *   this file passes a quote character
 * @returns 1 if the trimmed value starts or ends with `separator`, otherwise 0
 */
function possibleMisquote(value: string, separator: string): number {
  const trimmed = value.trim();
  return trimmed.startsWith(separator) || trimmed.endsWith(separator) ? 1 : 0;
}
