<?php
/**
 * OpenAI helper for extracting transaction details from uploaded documents.
 * Exposes:
 *  - load_env(): loads .env values into $_ENV/putenv (non-fatal if missing)
 *  - sanitize_text($text): returns UTF-8 text with control characters removed
 *  - extract_transaction_from_file($file_path, $type): returns parsed fields or error
 */

/**
 * Load KEY=VALUE pairs from the .env file into $_ENV and the process environment.
 *
 * The file is looked up at __DIR__/../.env. A missing or unreadable file is not
 * an error: the function simply returns. The file is processed at most once per
 * request; later calls are no-ops.
 *
 * Parsing rules (deliberately simple, no variable expansion):
 *  - blank lines and lines whose first non-blank character is '#' are skipped;
 *  - lines without '=' are skipped instead of being exported as empty variables;
 *  - an optional leading "export " before the key is ignored;
 *  - the value is trimmed of surrounding whitespace, then ONE pair of matching
 *    surrounding quotes ("..." or '...') is removed. Quote characters that are
 *    part of the value itself are preserved.
 *
 * Values read from the file overwrite existing variables of the same name.
 *
 * @return void
 */
function load_env() {
    static $loaded = false;
    if ($loaded) {
        return;
    }
    // Mark as done up front so every exit path below counts as "loaded".
    $loaded = true;

    $env_file = __DIR__ . '/../.env';
    if (!is_file($env_file) || !is_readable($env_file)) {
        // Checked explicitly so file() does not emit a warning.
        return;
    }

    $lines = file($env_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return;
    }

    foreach ($lines as $line) {
        // trim() also removes a trailing "\r" left by CRLF line endings.
        $line = trim($line);
        if ($line === '' || $line[0] === '#') {
            continue;
        }
        if (strpos($line, '=') === false) {
            // Not an assignment; do not define a bogus empty variable.
            continue;
        }

        [$key, $value] = explode('=', $line, 2);

        $key = trim($key);
        if (strncmp($key, 'export ', 7) === 0) {
            $key = trim(substr($key, 7));
        }
        if ($key === '') {
            continue;
        }

        $value = trim($value);
        $length = strlen($value);
        if ($length >= 2) {
            $first = $value[0];
            $last = $value[$length - 1];
            if (($first === '"' || $first === "'") && $first === $last) {
                // Cast: before PHP 8.0 substr() returns false for an empty result.
                $value = (string) substr($value, 1, -1);
            }
        }

        $_ENV[$key] = $value;
        putenv("$key=$value");
    }
}

/**
 * Make a string safe to embed in a JSON payload.
 *
 * - null and '' both yield ''.
 * - Input that is not valid UTF-8 is passed through mb_convert_encoding()
 *   (UTF-8 to UTF-8) so the result is well-formed UTF-8.
 * - Every character in Unicode general category "C" (control, format, ...)
 *   is removed, except newline and tab, which are kept. Carriage returns
 *   are removed.
 *
 * @param string|null $text Raw text, possibly containing binary garbage
 * @return string|null Cleaned text (preg_replace() yields null on a PCRE error)
 */
function sanitize_text($text) {
    if ($text === null || $text === '') {
        return '';
    }

    $utf8 = mb_check_encoding($text, 'UTF-8')
        ? $text
        : mb_convert_encoding($text, 'UTF-8', 'UTF-8');

    // Negated class: matches anything that is neither a non-"C" character,
    // a newline, a tab, nor a space.
    return preg_replace('/[^\P{C}\n\t ]+/u', '', $utf8);
}

/**
 * Extract structured transaction data from a document using OpenAI.
 *
 * Flow:
 *  1. Validate configuration (OPENAI_API_KEY) and the file (exists, readable,
 *     JPEG/PNG/PDF, non-empty, at most 20MB).
 *  2. Build a chat-completions request. Images are sent as a base64 data URL.
 *     PDFs are NOT parsed: a sanitized slice of the raw bytes plus a short
 *     base64 prefix is sent as plain text.
 *  3. POST to the OpenAI API and pull the first JSON object out of the reply.
 *
 * On any failure the returned array contains only an 'error' key.
 *
 * @param string $file_path Full path to the uploaded file
 * @param string $type      'income' or 'expense'. Any other value is treated as
 *                          'income' so that arbitrary caller input cannot be
 *                          injected into the prompt.
 * @return array [
 *   'amount' => string,
 *   'date' => string (YYYY-MM-DD if detected),
 *   'category' => string,
 *   'description' => string,
 *   'error' => string (if any)
 * ]
 */
function extract_transaction_from_file($file_path, $type = 'income') {
    load_env();

    $api_key = getenv('OPENAI_API_KEY');
    if (empty($api_key)) {
        return ['error' => 'Missing OPENAI_API_KEY. Add it to .env'];
    }

    // $type is interpolated into the prompt below; restrict it to known values.
    $type = is_string($type) ? strtolower(trim($type)) : '';
    if (!in_array($type, ['income', 'expense'], true)) {
        $type = 'income';
    }

    if (!file_exists($file_path)) {
        return ['error' => 'File not found: ' . $file_path];
    }

    if (!is_readable($file_path)) {
        return ['error' => 'File is not readable'];
    }

    $mime = _detect_upload_mime($file_path);
    $allowed = ['image/jpeg', 'image/png', 'application/pdf'];
    if (!in_array($mime, $allowed, true)) {
        return ['error' => 'Unsupported file type (' . $mime . '). Use JPG, PNG, or PDF'];
    }

    // Reject oversized files BEFORE loading them into memory.
    $max_bytes = 20 * 1024 * 1024; // 20MB limit for base64 encoding overhead
    $size_on_disk = filesize($file_path);
    if ($size_on_disk !== false && $size_on_disk > $max_bytes) {
        return ['error' => 'File too large. Maximum size is 20MB'];
    }

    // Prepare content for OpenAI
    $file_content = file_get_contents($file_path);
    if ($file_content === false || $file_content === '') {
        return ['error' => 'Unable to read uploaded file or file is empty'];
    }

    // Second check covers the case where filesize() could not report a size.
    if (strlen($file_content) > $max_bytes) {
        return ['error' => 'File too large. Maximum size is 20MB'];
    }

    $messages = [
        [
            'role' => 'system',
            'content' => "You are a finance extraction assistant. Extract and return ONLY JSON with keys: amount, date, category, description.
- amount: numeric string (no currency symbol), use positive numbers.
- date: YYYY-MM-DD (best guess; if missing, return empty string).
- category: a short label (e.g., Groceries, Salary, Utilities).
- description: brief text/merchant/source.
Type context: {$type}. If something is missing, return an empty string. Do not invent amounts."
        ]
    ];

    // If image: send as image_url. If PDF: send sanitized text preview and a short base64 snippet (vision on PDFs is not directly supported).
    if ($mime === 'image/jpeg' || $mime === 'image/png') {
        $base64 = base64_encode($file_content);
        $data_url = "data:{$mime};base64,{$base64}";
        $messages[] = [
            'role' => 'user',
            'content' => [
                ['type' => 'text', 'text' => "Extract transaction fields for a {$type} document. Return JSON only."],
                ['type' => 'image_url', 'image_url' => ['url' => $data_url]]
            ]
        ];
    } else { // PDF path
        // PDF fallback: provide sanitized text preview AND a short base64 slice so the model has hints.
        $text_preview = substr($file_content, 0, 20000);
        $text_preview = sanitize_text($text_preview);
        $base64_snippet = substr(base64_encode($file_content), 0, 6000); // short to keep payload sane
        $messages[] = [
            'role' => 'user',
            'content' => [
                [
                    'type' => 'text',
                    'text' =>
                        "Extract transaction fields for a {$type} document (PDF). Return JSON only.\n" .
                        "- If data is ambiguous, leave fields empty.\n\n" .
                        "Sanitized text snippet (may be messy):\n{$text_preview}\n\n" .
                        "Base64 (first part of PDF): {$base64_snippet}"
                ]
            ]
        ];
    }

    $payload = [
        'model' => 'gpt-4o-mini',
        'messages' => $messages,
        'temperature' => 0.2,
    ];

    $body = json_encode($payload, JSON_UNESCAPED_SLASHES);
    if ($body === false) {
        $json_error = json_last_error_msg();
        return ['error' => 'Failed to encode JSON payload: ' . $json_error];
    }

    $ch = curl_init('https://api.openai.com/v1/chat/completions');
    if ($ch === false) {
        return ['error' => 'OpenAI request failed: could not initialise cURL'];
    }
    curl_setopt_array($ch, [
        CURLOPT_POST => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            "Authorization: Bearer {$api_key}"
        ],
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_TIMEOUT => 45,
        CURLOPT_SSL_VERIFYPEER => true,
    ]);

    $response = curl_exec($ch);
    $curl_error = curl_error($ch);
    $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($response === false) {
        return ['error' => 'OpenAI request failed: ' . ($curl_error ?: 'Unknown cURL error')];
    }

    if (!is_string($response) || trim($response) === '') {
        return ['error' => 'Empty response from OpenAI API (HTTP ' . $status . ')'];
    }

    $data = json_decode($response, true);
    // Capture the decode status now; later json_* calls would overwrite it.
    $json_ok = json_last_error() === JSON_ERROR_NONE;
    $json_error = json_last_error_msg();

    // Check for API errors first. This runs before the JSON validity check so
    // that a non-JSON error page (e.g. an HTML 502 from a gateway) is reported
    // with its HTTP status rather than as a JSON syntax error.
    if ($status >= 400) {
        $err = 'OpenAI API error (HTTP ' . $status . ')';
        if (is_array($data) && isset($data['error']['message']) && is_string($data['error']['message'])) {
            $err = $data['error']['message'];
        }
        if (is_array($data) && isset($data['error']['code']) && is_scalar($data['error']['code'])) {
            $err .= ' (Code: ' . $data['error']['code'] . ')';
        }
        return ['error' => $err];
    }

    if (!$json_ok) {
        return ['error' => 'Invalid JSON response from OpenAI: ' . $json_error];
    }

    // Check if response structure is valid
    if (!is_array($data) || !isset($data['choices']) || !is_array($data['choices']) || empty($data['choices'][0])) {
        return ['error' => 'Invalid OpenAI response structure'];
    }

    // Check if content exists
    $content = $data['choices'][0]['message']['content'] ?? null;
    if (!is_string($content) || trim($content) === '') {
        $err = $data['error']['message'] ?? 'No content in OpenAI response';
        return ['error' => $err];
    }

    $parsed = _decode_transaction_json($content);
    if ($parsed === null) {
        // Character-aware cut so the preview stays valid UTF-8 for callers
        // that json_encode() this error.
        $debug_content = function_exists('mb_substr')
            ? mb_substr($content, 0, 200, 'UTF-8')
            : substr($content, 0, 200);
        return ['error' => 'Could not parse JSON from OpenAI response. Content preview: ' . $debug_content];
    }

    // Normalize fields
    return [
        'amount' => _normalize_transaction_field($parsed, 'amount'),
        'date' => _normalize_transaction_field($parsed, 'date'),
        'category' => _normalize_transaction_field($parsed, 'category'),
        'description' => _normalize_transaction_field($parsed, 'description'),
    ];
}

/**
 * Detect the MIME type of a file, falling back to its extension.
 *
 * Uses mime_content_type() when the fileinfo extension provides it and it
 * returns a usable value; otherwise maps jpg/jpeg/png/pdf extensions.
 *
 * @param string $file_path Path to an existing file
 * @return string MIME type, or 'unknown' when it cannot be determined
 */
function _detect_upload_mime($file_path) {
    $mime = function_exists('mime_content_type') ? mime_content_type($file_path) : false;
    if (is_string($mime) && $mime !== '') {
        return $mime;
    }

    // Fallback: try to detect from extension
    $ext = strtolower(pathinfo($file_path, PATHINFO_EXTENSION));
    $mime_map = [
        'jpg' => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png' => 'image/png',
        'pdf' => 'application/pdf'
    ];
    return $mime_map[$ext] ?? 'unknown';
}

/**
 * Return the first balanced {...} object found in $text, or null.
 *
 * Braces inside JSON string literals (including after escaped quotes) are
 * ignored, so a value such as "a } b" does not end the object early.
 *
 * @param string $text Model output that may surround JSON with prose
 * @return string|null The object's source text, or null if none is balanced
 */
function _find_first_json_object($text) {
    $start = strpos($text, '{');
    if ($start === false) {
        return null;
    }

    $depth = 0;
    $in_string = false;
    $escaped = false;
    $length = strlen($text);

    for ($i = $start; $i < $length; $i++) {
        $ch = $text[$i];

        if ($in_string) {
            if ($escaped) {
                $escaped = false;      // character after a backslash is literal
            } elseif ($ch === '\\') {
                $escaped = true;
            } elseif ($ch === '"') {
                $in_string = false;
            }
            continue;
        }

        if ($ch === '"') {
            $in_string = true;
        } elseif ($ch === '{') {
            $depth++;
        } elseif ($ch === '}') {
            $depth--;
            if ($depth === 0) {
                return substr($text, $start, $i - $start + 1);
            }
        }
    }

    return null;
}

/**
 * Decode the JSON payload embedded in a model reply.
 *
 * Candidates are tried in order: a fenced ```json block, the first balanced
 * object in the text, then the raw reply itself. The first candidate that
 * decodes to an array wins.
 *
 * @param string $content Raw message content from the API
 * @return array|null Decoded data, or null when nothing decodes to an array
 */
function _decode_transaction_json($content) {
    $candidates = [];

    // Try to extract JSON from markdown code blocks first
    if (preg_match('/```(?:json)?\s*(\{.*?\})\s*```/s', $content, $m)) {
        $candidates[] = $m[1];
    }

    $object = _find_first_json_object($content);
    if ($object !== null) {
        $candidates[] = $object;
    }

    $candidates[] = $content;

    foreach ($candidates as $candidate) {
        $decoded = json_decode($candidate, true);
        if (is_array($decoded)) {
            return $decoded;
        }
    }

    return null;
}

/**
 * Read one field from the decoded reply as a trimmed string.
 *
 * Missing, null, and non-scalar values (arrays/objects) become ''.
 *
 * @param array  $parsed Decoded model reply
 * @param string $key    Field name
 * @return string
 */
function _normalize_transaction_field($parsed, $key) {
    $value = $parsed[$key] ?? '';
    return is_scalar($value) ? trim((string) $value) : '';
}

