from __future__ import annotations

import re
import urllib.error
import urllib.request
from html import unescape
from html.parser import HTMLParser


class _TextExtractor(HTMLParser):
    SKIP = frozenset({"script", "style", "noscript", "svg", "nav", "footer", "header"})

    def __init__(self) -> None:
        super().__init__()
        self._skip_depth = 0
        self.chunks: list[str] = []
        self.title = ""

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        t = tag.lower()
        if t in self.SKIP:
            self._skip_depth += 1
        if t == "title" and not self.title:
            self._in_title = True
        else:
            self._in_title = getattr(self, "_in_title", False)

    def handle_endtag(self, tag: str) -> None:
        t = tag.lower()
        if t in self.SKIP and self._skip_depth:
            self._skip_depth -= 1
        if t == "title":
            self._in_title = False

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        text = data.strip()
        if not text:
            return
        if getattr(self, "_in_title", False):
            self.title = (self.title + " " + text).strip()
        else:
            self.chunks.append(text)


def _collapse(text: str) -> str:
    return re.sub(r"\s+", " ", unescape(text)).strip()


# One whole <meta ...> tag. Attributes are parsed separately so their order
# inside the tag does not matter. (A literal ">" inside an attribute value
# still ends the match early.)
_META_TAG_RE = re.compile(r"<meta\b[^>]*>", re.I)
# name="value" or name='value'; the value may contain the *other* quote
# character, e.g. content="L'article".
_META_ATTR_RE = re.compile(r"""([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")
# Charset label in a Content-Type header, optionally quoted.
_HEADER_CHARSET_RE = re.compile(r"""charset\s*=\s*["']?([\w.:-]+)""", re.I)
# Charset label declared in the document itself (<meta charset=...> or the
# http-equiv form); searched on the undecoded bytes.
_META_CHARSET_RE = re.compile(rb"""<meta[^>]+charset\s*=\s*["']?([\w.:-]+)""", re.I)


def _meta_property(html: str, prop: str) -> str:
    """Return the raw ``content`` of the first ``<meta property=prop>`` tag.

    Tags with a missing or empty ``content`` are passed over. Returns ``""``
    when nothing matches. Entities in the value are NOT decoded here.
    """
    for tag in _META_TAG_RE.finditer(html):
        attrs: dict[str, str] = {}
        for found in _META_ATTR_RE.finditer(tag.group(0)):
            value = found.group(2) if found.group(2) is not None else found.group(3)
            # First occurrence of an attribute wins.
            attrs.setdefault(found.group(1).lower(), value)
        if attrs.get("property", "").strip().lower() == prop and attrs.get("content"):
            return attrs["content"]
    return ""


def _decode_html(raw: bytes, ctype: str) -> str:
    """Decode *raw* using the header charset, else a ``<meta>`` charset, else UTF-8."""
    header = _HEADER_CHARSET_RE.search(ctype)
    if header:
        label = header.group(1)
    else:
        # Only look near the top of the document, where the declaration belongs.
        declared = _META_CHARSET_RE.search(raw[:4096])
        label = declared.group(1).decode("ascii") if declared else "utf-8"
    try:
        return raw.decode(label, errors="replace")
    except (LookupError, UnicodeError):
        # Unknown or non-text codec label: fall back instead of crashing.
        return raw.decode("utf-8", errors="replace")


def fetch_article_source(url: str, max_chars: int = 12000) -> dict[str, str]:
    """Download *url* and extract its title, description and visible text.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of body text to keep; longer
            text is cut at this length and an ellipsis ("…") is appended.

    Returns:
        A dict with the keys ``"url"``, ``"title"`` (``og:title`` when
        present, otherwise the ``<title>`` text), ``"description"``
        (``og:description`` or ``""``) and ``"text"`` (whitespace-normalised
        body text).

    Raises:
        RuntimeError: If the server answers with an HTTP error status, or the
            URL cannot be reached or read (including read timeouts).
    """
    headers = {
        "User-Agent": "arnaudmerigeau-article-studio/1.0 (+local)",
        "Accept": "text/html,application/xhtml+xml",
    }
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=45) as resp:
            ctype = resp.headers.get("Content-Type", "")
            raw = resp.read()
    except urllib.error.HTTPError as e:
        raise RuntimeError(f"HTTP {e.code} en récupérant {url}") from e
    except urllib.error.URLError as e:
        raise RuntimeError(f"URL inaccessible : {url} — {e.reason}") from e
    except OSError as e:
        # Timeouts / resets while reading the body are plain OSError, not
        # URLError; report them the same way as other network failures.
        raise RuntimeError(f"URL inaccessible : {url} — {e}") from e

    html = _decode_html(raw, ctype)

    parser = _TextExtractor()
    parser.feed(html)
    parser.close()  # flush any text still buffered at end of input

    # The parser has already decoded character references, so only whitespace
    # is normalised here; unescaping again would corrupt literal "&lt;" etc.
    body = " ".join(" ".join(parser.chunks).split())
    title = " ".join(parser.title.split())

    # Meta values come straight from the markup, so they still need entity
    # decoding, which _collapse performs.
    og_title = _meta_property(html, "og:title")
    if og_title:
        title = _collapse(og_title) or title

    description = _collapse(_meta_property(html, "og:description"))

    if len(body) > max_chars:
        body = body[:max_chars] + "…"

    return {
        "url": url,
        "title": title,
        "description": description,
        "text": body,
    }
