// Enhanced CSV Processing function for the RAG system
// This function will improve CSV handling for quantitative data files
import * as XLSX from 'xlsx';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf';
import 'pdfjs-dist/legacy/build/pdf.worker.entry';
import axios from 'axios';

/** Key -> description lookup table loaded from one of the metadata JSON files. */
type CsvMetadataMap = Record<string, unknown>;

/** Running numeric aggregate for a single column. */
interface CsvColumnStats {
  min: number;
  max: number;
  sum: number;
  count: number;
}

const CSV_METADATA_BASE_URL = 'https://www.risknz.mef-uk.com/RAG_KnowlBase/list';
const CSV_PREVIEW_ROW_LIMIT = 10;
const CSV_MAX_INSIGHT_COLUMNS = 5;
const CSV_EMPTY_MESSAGE = "CSV file appears to be empty or improperly formatted";

/**
 * Fetches one metadata JSON file. Never rejects: on a request failure or a
 * payload that is not a plain JSON object it logs a warning and resolves to an
 * empty map, so one unavailable file cannot discard the other lookups.
 */
const fetchCsvMetadata = async (fileName: string): Promise<CsvMetadataMap> => {
  try {
    const response = await axios.get(`${CSV_METADATA_BASE_URL}/${fileName}`);
    const payload: unknown = response.data;
    if (payload !== null && typeof payload === 'object' && !Array.isArray(payload)) {
      return payload as CsvMetadataMap;
    }
    console.warn(`Metadata file ${fileName} did not contain a JSON object; ignoring it`);
  } catch (error) {
    console.warn("Could not fetch metadata for CSV interpretation:", error);
  }
  return {};
};

/** Reads an own property of a metadata map, ignoring inherited names such as `constructor`. */
const ownCsvMetadataValue = (map: CsvMetadataMap, key: string): unknown =>
  Object.prototype.hasOwnProperty.call(map, key) ? map[key] : undefined;

/** Splits `VARIABLE_SUFFIX` into its parts, using the longest known suffix that matches. */
const splitCsvHeader = (
  header: string,
  countrySuffixes: readonly string[]
): { variablePart: string; countryPart: string } => {
  let countryPart = '';
  for (const suffix of countrySuffixes) {
    if (suffix.length > countryPart.length && header.endsWith(`_${suffix}`)) {
      countryPart = suffix;
    }
  }
  if (countryPart === '') {
    return { variablePart: header, countryPart };
  }
  // +1 removes the underscore that separates variable and suffix
  return { variablePart: header.slice(0, -(countryPart.length + 1)), countryPart };
};

/**
 * Builds a plain-text description of the first sheet of a CSV/spreadsheet file.
 *
 * The text has five sections, in this order: "CSV DATA SUMMARY:",
 * "COLUMN HEADERS EXPLAINED:", "DATA PREVIEW:" (rows 1-10),
 * "STATISTICAL SUMMARY:" (min/max/avg/count over numeric cells of every data
 * row) and "DATA INSIGHTS:" (up to five columns whose range exceeds 10% of the
 * magnitude of their mean).
 *
 * @param data - File content; a string is parsed as text, anything else is
 *   handed to XLSX as an array-like binary buffer.
 * @param fileType - Accepted for interface compatibility; not used here.
 * @returns The text representation, or a human-readable message when the sheet
 *   has fewer than two rows or parsing fails. The promise never rejects.
 */
export const extractCSVContent = async (data: any, fileType: string): Promise<string> => {
  try {
    const workbook = XLSX.read(data, { type: typeof data === 'string' ? 'string' : 'array' });
    const firstSheetName = workbook.SheetNames[0];
    const worksheet = firstSheetName === undefined ? undefined : workbook.Sheets[firstSheetName];
    if (!worksheet) {
      return CSV_EMPTY_MESSAGE;
    }

    // header: 1 yields an array of rows, each an array of raw cell values
    const jsonData = XLSX.utils.sheet_to_json(worksheet, { header: 1 }) as any[][];
    if (!jsonData || jsonData.length < 2) {
      return CSV_EMPTY_MESSAGE;
    }

    // Header cells are raw values (numbers, holes, ...); normalise them to
    // strings so the string operations below cannot throw.
    const headers: string[] = Array.from(jsonData[0] || [], (cell: unknown) =>
      cell === null || cell === undefined ? '' : String(cell)
    );

    // The three lookups are independent, so fetch them concurrently; each one
    // degrades to {} on its own.
    const [variableKeyData, countrySuffixData, scenarioKeyData] = await Promise.all([
      fetchCsvMetadata('VariableKey.json'),
      fetchCsvMetadata('cntySuffixKey.json'),
      fetchCsvMetadata('ScenarioKey.json'),
    ]);
    const countrySuffixes = Object.keys(countrySuffixData);

    const describeHeader = (header: string) => {
      const { variablePart, countryPart } = splitCsvHeader(header, countrySuffixes);
      return {
        variablePart,
        variableMeaning: ownCsvMetadataValue(variableKeyData, variablePart),
        countryMeaning:
          ownCsvMetadataValue(countrySuffixData, countryPart) || (countryPart ? countryPart : "Global"),
      };
    };

    const lines: string[] = ["CSV DATA SUMMARY:", ""];

    // A scenario is recognised when its key occurs inside any column header
    const scenarioMatch = Object.keys(scenarioKeyData).find(
      scenario => scenario !== '' && headers.some(header => header.includes(scenario))
    );
    if (scenarioMatch !== undefined && scenarioKeyData[scenarioMatch]) {
      lines.push(`Scenario: ${scenarioMatch} - ${scenarioKeyData[scenarioMatch]}`, "");
    }

    lines.push("COLUMN HEADERS EXPLAINED:");
    for (const header of headers) {
      if (!header) continue;
      const { variableMeaning, countryMeaning } = describeHeader(header);
      lines.push(`${header}: ${variableMeaning || "Unknown variable"} for ${countryMeaning}`);
    }

    lines.push("", "DATA PREVIEW:");

    // A Map keeps header names such as "constructor" from touching Object.prototype
    const stats = new Map<string, CsvColumnStats>();
    for (let rowIdx = 1; rowIdx < jsonData.length; rowIdx++) {
      const row = jsonData[rowIdx];
      if (!row || row.length === 0) continue;

      const inPreview = rowIdx <= CSV_PREVIEW_ROW_LIMIT;
      const previewCells: string[] = [];

      headers.forEach((header, colIdx) => {
        if (!header) return;
        const value = row[colIdx];

        if (typeof value === 'number' && Number.isFinite(value)) {
          const stat = stats.get(header);
          if (stat === undefined) {
            stats.set(header, { min: value, max: value, sum: value, count: 1 });
          } else {
            stat.min = Math.min(stat.min, value);
            stat.max = Math.max(stat.max, value);
            stat.sum += value;
            stat.count += 1;
          }
        }

        if (inPreview) {
          previewCells.push(`${header}=${value !== undefined && value !== null ? value : 'N/A'}`);
        }
      });

      if (inPreview) {
        lines.push(`Row ${rowIdx}: ${previewCells.join(', ')}`);
      }
    }

    lines.push("", "STATISTICAL SUMMARY:");
    stats.forEach((stat, header) => {
      const avg = stat.sum / stat.count;
      lines.push(
        `${header}: min=${stat.min.toFixed(2)}, max=${stat.max.toFixed(2)}, avg=${avg.toFixed(2)}, count=${stat.count}`
      );
    });

    lines.push("", "DATA INSIGHTS:");
    const insightLines: string[] = [];
    stats.forEach((stat, header) => {
      if (insightLines.length >= CSV_MAX_INSIGHT_COLUMNS) return;
      const avg = stat.sum / stat.count;
      // Compare against |avg|: with a signed mean every negative-valued
      // column, even a constant one, would pass the threshold.
      if (stat.max - stat.min > 0.1 * Math.abs(avg)) {
        const { variablePart, variableMeaning, countryMeaning } = describeHeader(header);
        insightLines.push(
          `- ${variableMeaning || variablePart} for ${countryMeaning}: varies from ${stat.min.toFixed(2)} to ${stat.max.toFixed(2)}`
        );
      }
    });

    if (insightLines.length > 0) {
      lines.push("Key variables with significant variation:", ...insightLines);
    } else {
      lines.push("No significant variations detected in the data.");
    }

    return lines.join('\n') + '\n';
  } catch (error: unknown) {
    console.error('Error parsing CSV file:', error);
    const message = error instanceof Error ? error.message : String(error);
    return `Error processing CSV file: ${message}`;
  }
};

/**
 * Turns an uploaded file's raw content into plain text, dispatching on the
 * file type: 'txt' is returned/decoded as-is, 'pdf' has its page text
 * concatenated, and 'csv' / 'xls' / 'xlsx' go through the enhanced CSV
 * processor. Any other type yields 'Unsupported file type'.
 */
const extractContent = async (data: any, fileType: string): Promise<string> => {
  if (fileType === 'txt') {
    if (typeof data === 'string') {
      return data;
    }
    return new TextDecoder().decode(data);
  }

  if (fileType === 'pdf') {
    try {
      const doc = await pdfjsLib.getDocument({ data }).promise;
      const pageTexts: string[] = [];
      // Pages are 1-based and read one after another
      for (let pageNo = 1; pageNo <= doc.numPages; pageNo += 1) {
        const page = await doc.getPage(pageNo);
        const content = await page.getTextContent();
        const fragments: string[] = content.items.map((item: any) => item.str);
        pageTexts.push(fragments.join(' '));
      }
      // Every page, including the last one, is followed by a single space
      return pageTexts.map(pageText => pageText + ' ').join('');
    } catch (error: unknown) {
      console.error('Error extracting PDF text:', error);
      return 'Failed to extract PDF content';
    }
  }

  const isSpreadsheet = fileType === 'csv' || fileType === 'xls' || fileType === 'xlsx';
  if (isSpreadsheet) {
    return extractCSVContent(data, fileType);
  }

  return 'Unsupported file type';
};

/**
 * Splits the processed CSV text into chunks along its section headings.
 *
 * A new chunk starts at every line that begins with one of the known section
 * headings. Within a section, a chunk is closed as soon as its length reaches
 * `chunkSize`; the following chunk is then prefixed with a
 * "Continuing <heading line>" marker so it keeps its context.
 *
 * Chunks that would hold nothing but a continuation marker and/or blank lines
 * are not emitted, and no marker is written before the first heading is seen.
 *
 * @param text - Text containing the section headings listed below.
 * @param chunkSize - Length (in characters) at which a chunk is closed. Lines
 *   are never split, so a chunk may exceed this by up to one line.
 * @returns The chunks in document order; each kept line ends with '\n'.
 */
const chunkCSVText = (text: string, chunkSize: number): string[] => {
  const sectionHeadings = [
    "CSV DATA SUMMARY:",
    "COLUMN HEADERS EXPLAINED:",
    "DATA PREVIEW:",
    "STATISTICAL SUMMARY:",
    "DATA INSIGHTS:"
  ];

  const chunks: string[] = [];
  let currentSection = "";
  let currentChunk = "";
  // True once the chunk holds a heading or a non-blank line, i.e. more than
  // a continuation marker and padding.
  let hasContent = false;

  const flush = (): void => {
    if (hasContent) {
      chunks.push(currentChunk);
    }
    currentChunk = "";
    hasContent = false;
  };

  for (const line of text.split('\n')) {
    const trimmed = line.trim();

    if (sectionHeadings.some(heading => trimmed.startsWith(heading))) {
      flush();
      currentSection = line;
      currentChunk = line + '\n';
      hasContent = true;
      continue;
    }

    currentChunk += line + '\n';
    if (trimmed !== "") {
      hasContent = true;
    }

    if (currentChunk.length >= chunkSize) {
      flush();
      // Seed the next chunk with its section context, if there is one
      if (currentSection !== "") {
        currentChunk = `Continuing ${currentSection}\n`;
      }
    }
  }

  flush();
  return chunks;
};

/**
 * Splits text into chunks for indexing.
 *
 * Text in the processed CSV format (it contains both the "CSV DATA SUMMARY:"
 * and "COLUMN HEADERS EXPLAINED:" headings) is delegated to `chunkCSVText`.
 * Any other text is split into paragraphs on blank lines:
 *  - a paragraph shorter than `chunkSize` becomes one chunk, unchanged;
 *  - a longer paragraph is split into sentences (runs ending in '.', '!' or
 *    '?', plus any unterminated remainder), which are trimmed and packed
 *    greedily, joined by a single space, into chunks of at most `chunkSize`
 *    characters.
 *
 * No text is discarded: a paragraph without sentence punctuation, or a
 * sentence longer than `chunkSize`, is emitted as a single oversized chunk.
 * Blank paragraphs are skipped, so empty input yields an empty array.
 *
 * @param text - The text to split.
 * @param chunkSize - Target maximum chunk length in characters.
 * @returns The chunks in document order.
 */
export const chunkText = (text: string, chunkSize: number): string[] => {
  // Check if this is our processed CSV format
  if (text.includes("CSV DATA SUMMARY:") &&
    text.includes("COLUMN HEADERS EXPLAINED:")) {
    return chunkCSVText(text, chunkSize);
  }

  // A sentence is a run ending in terminator(s); the second alternative keeps
  // a trailing fragment that has no terminator.
  const sentencePattern = /[^.!?]*[.!?]+|[^.!?]+$/g;

  const chunks: string[] = [];
  for (const paragraph of text.split(/\n\s*\n/)) {
    if (paragraph.trim() === '') {
      continue;
    }
    if (paragraph.length < chunkSize) {
      chunks.push(paragraph);
      continue;
    }

    const sentences = paragraph.match(sentencePattern) || [];
    let currentChunk = '';
    for (const rawSentence of sentences) {
      const sentence = rawSentence.trim();
      if (sentence === '') {
        continue;
      }
      const candidate = currentChunk ? `${currentChunk} ${sentence}` : sentence;
      // Only close a chunk that has content; an oversized first sentence
      // simply starts the chunk instead of emitting an empty one.
      if (currentChunk && candidate.length > chunkSize) {
        chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk = candidate;
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk);
    }
  }
  return chunks;
};
