<?php
// ============================================================
//  Email HTML sanitiser
// ============================================================
//
//  Pure-PHP HTML sanitiser for incoming email bodies. Two goals:
//
//  1. SAFETY — strip <script>, event handlers, javascript: URLs,
//     <iframe>, <object>, etc. so a malicious email can't run
//     code in the admin's browser.
//
//  2. PRIVACY — by default, block remote images (replace src with
//     a 1x1 transparent gif and add a placeholder). Remote images
//     are tracking pixels in disguise — they tell the sender when
//     and where the message was opened.
//
//  When the sender is on the per-account whitelist, images load
//  normally.
//
//  This is NOT a hardened library like HTML Purifier — it's a
//  pragmatic sanitiser that handles real-world email. For
//  pathological inputs (deeply nested malformed HTML), it errs
//  on the side of stripping content.
// ============================================================

/**
 * Sanitise an email's HTML body for safe inline display.
 *
 * Pipeline: strip control bytes, drop <head>/<script> blocks and the
 * <html>/<body> wrapper tags with regexes, parse the remainder with
 * DOMDocument, run email_sanitize_walk() over the tree, then serialise
 * the children of the synthetic wrapper <div>.
 *
 * If the document cannot be parsed, the body is returned as escaped
 * plain text inside <pre> and images_blocked is 0.
 *
 * @param string $html       Raw HTML from the email
 * @param bool   $load_images If true, allow remote images; if false, block
 * @return array{html: string, images_blocked: int}
 */
function email_sanitize_html(string $html, bool $load_images = false): array {
    if ($html === '') return ['html' => '', 'images_blocked' => 0];

    // 1. Strip nulls and control chars (some clients smuggle these).
    //    Deliberately NOT using the /u modifier: the class is pure ASCII, and
    //    with /u preg_replace() returns null for any body containing invalid
    //    UTF-8, which would silently blank the whole message.
    $html = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', '', $html) ?? $html;

    // 2. Strip <head>…</head> (we don't want its content) and <script>…</script>
    //    (security) before DOM loading. If a regex fails (e.g. backtrack limit
    //    on a huge body) we keep the input: the tree walk below still removes
    //    every element that is not on its whitelist.
    $html = preg_replace('#<head\b[^>]*>.*?</head>#is', '', $html) ?? $html;
    $html = preg_replace('#<script\b[^>]*>.*?</script>#is', '', $html) ?? $html;

    // 3. Strip the <html>/<body> wrapper TAGS but keep their content. They are
    //    not on the walker's whitelist, so leaving them in would make the
    //    walker drop them together with everything inside.
    $html = preg_replace('#</?(html|body)\b[^>]*>#i', '', $html) ?? $html;

    // 4. Parse with DOMDocument — much safer than regex for the remaining HTML.
    //    libxml error collection is a process-wide switch, so remember the
    //    caller's setting and put it back afterwards.
    $previous_errors = libxml_use_internal_errors(true);
    $dom = new DOMDocument('1.0', 'UTF-8');
    // Prefix an XML encoding declaration so the parser treats the input as
    // UTF-8, and wrap in a <div> so there is a single root to unwrap later.
    $wrapped = '<?xml encoding="UTF-8"?><div>' . $html . '</div>';
    $loaded = $dom->loadHTML($wrapped, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_errors);

    if (!$loaded || $dom->documentElement === null) {
        // Couldn't parse — fall back to stripping all tags as a last resort.
        // Explicit flags: older PHP defaults return '' on invalid UTF-8.
        $text = htmlspecialchars(strip_tags($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        return ['html' => '<pre>' . $text . '</pre>', 'images_blocked' => 0];
    }

    // 5. Walk the tree, strip dangerous tags and attributes
    $blocked_images = email_sanitize_walk($dom->documentElement, $load_images);

    // 6. Serialize back. Skip the wrapper div by emitting only its children.
    $out = '';
    foreach ($dom->documentElement->childNodes as $child) {
        $out .= $dom->saveHTML($child);
    }

    return ['html' => $out, 'images_blocked' => $blocked_images];
}

/**
 * Recursive sanitiser walker.
 *
 * Visits every child of $node and, in place:
 *  - removes comments, processing instructions and any element whose tag is
 *    not whitelisted (together with its contents);
 *  - for <style>, drops every attribute except media/type and scrubs the CSS;
 *  - for other elements, removes non-whitelisted attributes, removes href/src
 *    values with a javascript:/vbscript:/disallowed data: scheme, scrubs
 *    inline style, and (when $load_images is false) swaps remote and cid:
 *    image sources for a 1x1 placeholder, remembering the original URL in
 *    data-blocked-src / data-cid-src;
 *  - forces target="_blank" rel="noopener noreferrer" on every <a>.
 *
 * All attribute writes are deferred until after the attribute scan. The
 * attribute list is live, so an attribute added mid-scan would be visited,
 * fail the whitelist and be removed again.
 *
 * NOTE: CSS escape sequences (e.g. "@\69mport") are not decoded; this is a
 * pragmatic scrubber, not a CSS parser.
 *
 * @param DOMNode $node        Node whose children are sanitised
 * @param bool    $load_images If false, remote and cid: images are replaced
 * @param int     $counter     Running count of blocked remote images (by reference)
 * @return int Total count of blocked remote images (for the "X images
 *             blocked — click to load" banner)
 */
function email_sanitize_walk(DOMNode $node, bool $load_images, int &$counter = 0): int {
    static $allowed_tags = [
        'a','abbr','article','aside','b','blockquote','br','caption','center','cite','code',
        'col','colgroup','dd','del','details','dfn','div','dl','dt','em','figcaption','figure',
        'font','footer','h1','h2','h3','h4','h5','h6','header','hr','i','img','ins','kbd',
        'li','main','mark','nav','ol','p','pre','q','s','samp','section','small','span',
        'strike','strong','sub','summary','sup','table','tbody','td','tfoot','th','thead',
        'time','tr','u','ul','wbr',
        'style',  // we keep <style> but rewrite/scrub it below
    ];
    static $allowed_attrs = [
        '*'     => ['style','title','class','dir','lang','id'],
        'a'     => ['href','target','rel'],
        'img'   => ['src','alt','width','height'],
        'table' => ['border','cellpadding','cellspacing','width','align','bgcolor'],
        'td'    => ['colspan','rowspan','width','height','valign','align','bgcolor'],
        'th'    => ['colspan','rowspan','width','height','valign','align','bgcolor'],
        'tr'    => ['valign','align','bgcolor'],
        'col'   => ['span','width','align'],
        'font'  => ['color','face','size'],
    ];
    // 1x1 transparent GIF used in place of blocked image sources
    static $placeholder_src = 'data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==';

    // Walk a snapshot of children; we may remove nodes during the walk
    $children = iterator_to_array($node->childNodes, false);
    foreach ($children as $child) {
        // Comments and processing instructions carry nothing worth showing
        if ($child instanceof DOMComment || $child instanceof DOMProcessingInstruction) {
            $child->parentNode->removeChild($child);
            continue;
        }
        // Text nodes (and anything else non-element) are left untouched
        if (!($child instanceof DOMElement)) {
            continue;
        }

        $tag = strtolower($child->nodeName);

        // Remove disallowed tags entirely (including their contents)
        if (!in_array($tag, $allowed_tags, true)) {
            $child->parentNode->removeChild($child);
            continue;
        }

        // <style> — keep, but scrub both its attributes and its CSS
        if ($tag === 'style') {
            // Event handlers such as onload fire on <style>, so attributes
            // must be filtered here too; only media/type are harmless.
            $style_drop = [];
            foreach ($child->attributes as $attr) {
                if (!in_array(strtolower($attr->nodeName), ['media', 'type'], true)) {
                    $style_drop[] = $attr->nodeName;
                }
            }
            foreach ($style_drop as $n) $child->removeAttribute($n);

            $css = email_sanitize_css((string) $child->textContent, $load_images);
            // <style> text is serialised without escaping, so a literal "</"
            // (possibly re-assembled by the removals above) could close the
            // element early. "<\/" means the same inside CSS strings and is
            // inert to the HTML tokenizer.
            $css = str_replace('</', '<\\/', $css);

            while ($child->firstChild !== null) {
                $child->removeChild($child->firstChild);
            }
            $child->appendChild($child->ownerDocument->createTextNode($css));
            continue;
        }

        // Scrub attributes. Nothing is written to the element inside this
        // loop; removals and writes are collected and applied afterwards.
        $keep_attrs = array_merge($allowed_attrs['*'], $allowed_attrs[$tag] ?? []);
        $attrs_to_remove = [];
        $attrs_to_set = [];
        $style_name = 'style';       // actual name of the style attribute, if present
        $decorate_blocked = false;   // append the "blocked image" styling afterwards

        foreach ($child->attributes as $attr) {
            $raw_name = $attr->nodeName;
            $name = strtolower($raw_name);
            if (!in_array($name, $keep_attrs, true)) {
                $attrs_to_remove[] = $raw_name;
                continue;
            }
            $val = trim((string) $attr->nodeValue);

            if ($name === 'href' || $name === 'src') {
                $probe = email_sanitize_url_probe($val);

                // Block javascript:, vbscript: and data: URLs. Base64 raster/SVG
                // data: images stay allowed, but only as an image source.
                $bad_scheme = preg_match('/^(?:javascript|vbscript|data:(?!image\/(?:png|jpe?g|gif|svg\+xml|webp);base64,))/', $probe) === 1;
                if ($bad_scheme || ($name === 'href' && strncmp($probe, 'data:', 5) === 0)) {
                    $attrs_to_remove[] = $raw_name;
                    continue;
                }

                // Block remote images unless allowed
                if ($tag === 'img' && $name === 'src' && !$load_images) {
                    $is_cid  = strncmp($probe, 'cid:', 4) === 0;
                    $is_data = strncmp($probe, 'data:image/', 11) === 0;
                    // Anything with a scheme or a protocol-relative prefix
                    // leaves the page; plain relative paths do not.
                    $is_remote = !$is_cid && !$is_data
                        && preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#', $probe) === 1;

                    if ($is_remote) {
                        // Replace with placeholder + remember original
                        $host = parse_url($val, PHP_URL_HOST) ?: (parse_url($probe, PHP_URL_HOST) ?: 'remote');
                        $attrs_to_set['data-blocked-src'] = $val;
                        $attrs_to_set[$raw_name] = $placeholder_src;
                        // No htmlspecialchars(): the serialiser escapes attribute values
                        $attrs_to_set['alt'] = '[image blocked: ' . $host . ']';
                        $decorate_blocked = true;
                        $counter++;
                    } elseif ($is_cid) {
                        // Inline image referenced by Content-ID — we don't
                        // pull the part now, just block. Phase 2 will
                        // resolve cid: refs on demand.
                        $attrs_to_set['data-cid-src'] = $val;
                        $attrs_to_set[$raw_name] = $placeholder_src;
                        $attrs_to_set['alt'] = '[inline image]';
                    }
                }
                continue;
            }

            // Scrub inline style: neutralise expression(), @import and remote url()
            if ($name === 'style') {
                $style_name = $raw_name;
                $attrs_to_set[$raw_name] = email_sanitize_css($val, $load_images);
            }
        }

        if ($decorate_blocked) {
            $existing_style = $attrs_to_set[$style_name] ?? '';
            $attrs_to_set[$style_name] = trim($existing_style . ';border:1px dashed #cbd5e1;min-width:60px;min-height:20px;background:#f1f5f9;');
        }

        // Removals first, so a sender-supplied data-* attribute can never
        // masquerade as one of ours.
        foreach ($attrs_to_remove as $n) $child->removeAttribute($n);
        foreach ($attrs_to_set as $n => $v) $child->setAttribute((string) $n, $v);

        // Force-open links in a new tab and noopener
        if ($tag === 'a') {
            $child->setAttribute('target', '_blank');
            $child->setAttribute('rel', 'noopener noreferrer');
        }

        // Recurse
        email_sanitize_walk($child, $load_images, $counter);
    }
    return $counter;
}

/**
 * Scrub a CSS fragment (a <style> body or an inline style value).
 *
 * Removes @import rules (repeated until stable, so a removal cannot splice a
 * new @import together), renames expression( to voidexpr(, and — when
 * $load_images is false — points http(s)/protocol-relative url() references
 * at about:blank. A failed regex yields an empty string (fail closed).
 */
function email_sanitize_css(string $css, bool $load_images): string {
    do {
        $before = $css;
        // \b instead of \s+ so that `@import"…"` and `@import url(…)` written
        // without whitespace are caught as well
        $css = (string) preg_replace('/@import\b[^;]*;?/i', '', $css);
    } while ($css !== $before);

    $css = (string) preg_replace('/expression\s*\(/i', 'voidexpr(', $css);
    if (!$load_images) {
        $css = (string) preg_replace('/url\s*\(\s*[\'"]?(https?:|\/\/)[^)]+\)/i', 'url(about:blank)', $css);
    }
    return $css;
}

/**
 * Normalise a URL for scheme checks only (never for output).
 *
 * Drops every byte <= 0x20 (browsers ignore tabs/newlines inside URLs and
 * leading control characters), turns backslashes into slashes (browsers treat
 * them alike for http/https), and lowercases the result.
 */
function email_sanitize_url_probe(string $url): string {
    $stripped = preg_replace('/[\x00-\x20]+/', '', $url) ?? $url;
    return strtolower(str_replace('\\', '/', $stripped));
}

/**
 * Enclose an already-sanitised body in the `email-body-isolate` container.
 *
 * This function only adds the wrapping <div>; the input is inserted as-is
 * and must already be safe. Keeping the email's CSS away from the admin UI
 * presumably relies on stylesheet rules attached to that class elsewhere.
 */
function email_render_html(string $sanitised_html): string {
    return sprintf('<div class="email-body-isolate">%s</div>', $sanitised_html);
}

/**
 * Convert plain-text to safe HTML for display: escape, autolink
 * http(s) URLs, preserve newlines (via <pre>).
 *
 * URLs are located in the RAW text and each piece is escaped afterwards.
 * Matching on already-escaped text would let `&gt;` / `&quot;` / `&#039;`
 * leak into the link, e.g. for the very common `<http://example.com>` form.
 * Trailing sentence punctuation (. , ; : ! ?) is left outside the link.
 *
 * @param string $text Plain-text body
 * @return string HTML: a <pre class="email-body-plain"> element
 */
function email_render_plain(string $text): string {
    // Explicit flags: before PHP 8.1 the defaults neither escaped single
    // quotes nor substituted invalid UTF-8 (the call returned '' instead).
    $flags = ENT_QUOTES | ENT_SUBSTITUTE;

    // With DELIM_CAPTURE the result alternates: text, URL, text, URL, …
    $pieces = preg_split(
        '#(https?://[^\s<>"\']*[^\s<>"\'.,;:!?])#i',
        $text,
        -1,
        PREG_SPLIT_DELIM_CAPTURE
    );
    if ($pieces === false) {
        // Regex failure: render everything as plain escaped text
        $pieces = [$text];
    }

    $h = '';
    foreach ($pieces as $i => $piece) {
        $safe = htmlspecialchars($piece, $flags, 'UTF-8');
        if ($i % 2 === 1) {
            $h .= '<a href="' . $safe . '" target="_blank" rel="noopener noreferrer">' . $safe . '</a>';
        } else {
            $h .= $safe;
        }
    }

    return '<pre class="email-body-plain">' . $h . '</pre>';
}