# ParamManagerScripts/backend/script_groups/EmailCrono/utils/email_parser.py

# utils/email_parser.py
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime
import re
from pathlib import Path
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto, guardar_imagen
import os


def _get_payload_safely(parte):
    """
    Obtiene el payload de una parte del email de forma segura
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or "utf-8"
        return payload.decode(charset, errors="ignore")
    except Exception as e:
        print(f"Error getting payload: {str(e)}")
        return None


def _extract_subject_from_text(text):
    """
    Extrae el asunto de un texto dados diferentes formatos de cabecera
    """
    subject_headers = {
        "Oggetto: ": 9,  # Italian
        "Subject: ": 9,  # English
        "Asunto: ": 8,  # Spanish
        "Sujet: ": 7,  # French
        "Betreff: ": 9,  # German
    }

    for line in text.split("\n"):
        line = line.strip()
        for header, offset in subject_headers.items():
            if line.startswith(header):
                return line[offset:].strip()
    return None


def _should_skip_line(line):
    """
    Determina si una línea debe ser omitida por ser una cabecera de email
    """
    headers_to_skip = [
        "Da: ",
        "Inviato: ",
        "A: ",  # Italian
        "From: ",
        "Sent: ",
        "To: ",  # English
        "De: ",
        "Enviado: ",
        "Para: ",  # Spanish
        "Von: ",
        "Gesendet: ",
        "An: ",  # German
        "De : ",
        "Envoyé : ",
        "À : ",  # French
    ]
    return any(line.strip().startswith(header) for header in headers_to_skip)


def _find_vault_root(start_path):
    """
    Busca hacia arriba un directorio que contenga la carpeta hermana '.obsidian'.
    Devuelve la ruta del directorio raíz del vault o None si no se encuentra.
    """
    current = os.path.abspath(start_path)
    if os.path.isfile(current):
        current = os.path.dirname(current)
    while True:
        obsidian_dir = os.path.join(current, ".obsidian")
        if os.path.isdir(obsidian_dir):
            return current
        parent = os.path.dirname(current)
        if parent == current:
            return None
        current = parent


def _ruta_relativa_vault(abs_path):
    """
    Express a path relative to the root of the enclosing Obsidian vault.

    The vault root is located with ``_find_vault_root``. When a root is found
    the relative path is returned with ``/`` separators; otherwise (or if the
    relative path cannot be computed) the absolute path is returned.
    """
    absolute = os.path.abspath(abs_path)
    root = _find_vault_root(absolute)
    if root:
        try:
            # Always use '/' as separator in the returned path.
            return os.path.relpath(absolute, root).replace("\\", "/")
        except Exception:
            pass
    return absolute


def _html_a_markdown(html, cid_to_link=None):
    """
    Convierte contenido HTML a texto markdown, extrayendo el asunto si está
    presente
    """
    if html is None:
        return (None, "")

    try:
        # Limpieza básica
        html = html.replace("\xa0", " ")  # NBSP a espacio normal
        html = html.replace("\r\n", "\n")  # CRLF a LF
        html = html.replace("\r", "\n")  # CR a LF

        soup = BeautifulSoup(html, "html.parser")

        # Reemplazar imágenes inline referenciadas por cid en su lugar,
        # conservando un ancho máximo de 800px
        if cid_to_link:
            for img in soup.find_all("img"):
                src = img.get("src", "")
                if src.startswith("cid:"):
                    cid = src[4:].strip("<>")
                    embed_path = cid_to_link.get(cid)
                    if embed_path:
                        # Determinar ancho solicitado
                        width_attr = img.get("width")
                        style_attr = img.get("style", "")
                        width_px = None
                        if width_attr:
                            try:
                                digits = "".join(
                                    ch for ch in str(width_attr) if ch.isdigit()
                                )
                                if digits:
                                    width_px = int(digits)
                            except Exception:
                                width_px = None
                        if width_px is None and style_attr:
                            m = re.search(r"width\s*:\s*(\d+)px", style_attr, re.I)
                            if m:
                                try:
                                    width_px = int(m.group(1))
                                except Exception:
                                    width_px = None
                            if width_px is None:
                                m2 = re.search(
                                    r"max-width\s*:\s*(\d+)px", style_attr, re.I
                                )
                                if m2:
                                    try:
                                        width_px = int(m2.group(1))
                                    except Exception:
                                        width_px = None

                        if width_px is None:
                            width_px = 800
                        else:
                            width_px = min(width_px, 800)

                        # Obsidian embed con modificador de ancho
                        img.replace_with(
                            soup.new_string(f"![[{embed_path}|{width_px}]]")
                        )

        # Procesar tablas
        for table in soup.find_all("table"):
            try:
                rows = table.find_all("tr")
                if not rows:
                    continue

                # Matriz para almacenar la tabla procesada
                table_matrix = []
                max_cols = 0

                # Primera pasada: crear matriz y procesar rowspans/colspans
                row_idx = 0
                while row_idx < len(rows):
                    row = rows[row_idx]
                    cells = row.find_all(["th", "td"])
                    if not cells:
                        row_idx += 1
                        continue

                    # Expandir matriz si es necesario
                    while len(table_matrix) <= row_idx:
                        table_matrix.append([])

                    col_idx = 0
                    for cell in cells:
                        # Encontrar la siguiente columna disponible
                        while (
                            col_idx < len(table_matrix[row_idx])
                            and table_matrix[row_idx][col_idx] is not None
                        ):
                            col_idx += 1

                        # Obtener rowspan y colspan
                        rowspan = int(cell.get("rowspan", 1))
                        colspan = int(cell.get("colspan", 1))

                        # Procesar texto de la celda reemplazando saltos de
                        # línea por <br>
                        cell_text = cell.get_text().strip()
                        cell_text = cell_text.replace("\n", "<br>")
                        cell_text = re.sub(
                            r"\s*<br>\s*<br>\s*", "<br>", cell_text
                        )  # Eliminar <br> múltiples
                        cell_text = cell_text.strip()

                        # Rellenar la matriz con el texto y None para celdas
                        # combinadas
                        for r in range(rowspan):
                            current_row = row_idx + r
                            # Expandir matriz si es necesario
                            while len(table_matrix) <= current_row:
                                table_matrix.append([])
                        # Expandir fila si es necesario
                        while len(table_matrix[current_row]) <= col_idx + colspan - 1:
                            table_matrix[current_row].append(None)

                            for c in range(colspan):
                                if r == 0 and c == 0:
                                    table_matrix[current_row][col_idx + c] = cell_text
                                else:
                                    table_matrix[current_row][col_idx + c] = ""

                        col_idx += colspan

                    max_cols = max(max_cols, col_idx)
                    row_idx += 1

                # Asegurar que todas las filas tengan el mismo número de columnas
                for row in table_matrix:
                    while len(row) < max_cols:
                        row.append("")

                # Calcular anchos máximos por columna
                col_widths = [0] * max_cols
                for row in table_matrix:
                    for col_idx, cell in enumerate(row):
                        if cell is not None:
                            col_widths[col_idx] = max(
                                col_widths[col_idx], len(str(cell))
                            )

                # Generar tabla Markdown
                markdown_table = []

                # Cabecera
                if table_matrix:
                    header = "|"
                    for col_idx, width in enumerate(col_widths):
                        cell = str(table_matrix[0][col_idx] or "")
                        header += f" {cell.ljust(width)} |"
                    markdown_table.append(header)

                    # Separador
                    separator = "|"
                    for width in col_widths:
                        separator += "-" * (width + 2) + "|"
                    markdown_table.append(separator)

                    # Contenido
                    for row_idx in range(1, len(table_matrix)):
                        row_text = "|"
                        for col_idx, width in enumerate(col_widths):
                            cell = str(table_matrix[row_idx][col_idx] or "")
                            row_text += f" {cell.ljust(width)} |"
                        markdown_table.append(row_text)

                # Reemplazar la tabla HTML con la versión Markdown
                if markdown_table:
                    replacement = "\n" + "\n".join(markdown_table) + "\n"
                    table.replace_with(soup.new_string(replacement))

            except Exception as e:
                print(f"Error procesando tabla: {str(e)}")
                continue

        # Procesar saltos de línea
        for br in soup.find_all("br"):
            br.replace_with("\n")

        # Obtener texto limpio
        text = soup.get_text()

        # Procesar líneas
        cleaned_lines = []
        subject = None

        for line in text.split("\n"):
            if not subject:
                subject = _extract_subject_from_text(line)

            if not _should_skip_line(line):
                cleaned_lines.append(line)

        final_text = "\n".join(cleaned_lines).strip()
        return (subject, final_text)

    except Exception as e:
        print(f"Error en html_a_markdown: {str(e)}")
        return (None, html if html else "")


def _mensajes_adjuntos_directos(parte):
    """
    Yield the emails attached directly under ``parte``.

    Unlike ``Message.walk()`` the traversal stops at every ``message/rfc822``
    part. Emails nested inside an attached email are left to the recursive
    ``procesar_eml_interno`` call for that email, so they are not processed
    twice.
    """
    if parte.get_content_type() == "message/rfc822":
        payload = parte.get_payload()
        if isinstance(payload, list):
            yield from payload
        elif isinstance(payload, email.message.Message):
            yield payload
    elif parte.is_multipart():
        for subparte in parte.get_payload():
            yield from _mensajes_adjuntos_directos(subparte)


def _procesar_email_adjunto(parte, dir_adjuntos, dir_adjuntos_cronologia=None):
    """
    Process an email that travels as an attachment of another email.

    Args:
        parte: The part holding the attached email: either an already parsed
            ``message/rfc822`` container or a plain attachment (for example a
            ``.eml`` file sent as ``application/octet-stream``).
        dir_adjuntos: Passed through to ``procesar_eml_interno``.
        dir_adjuntos_cronologia: Passed through to ``procesar_eml_interno``.

    Returns:
        The list of messages produced by ``procesar_eml_interno`` for the
        attached email(s); an empty list if nothing could be extracted or an
        error occurred.
    """
    try:
        if parte.is_multipart():
            candidatos = _mensajes_adjuntos_directos(parte)
        else:
            payload = parte.get_payload()
            if isinstance(payload, list):
                candidatos = payload
            elif isinstance(payload, email.message.Message):
                candidatos = [payload]
            else:
                # The attachment was not parsed as a message by the email
                # package (its payload is still raw text), so decode the
                # transfer encoding and parse the bytes as an email.
                raw = parte.get_payload(decode=True)
                if raw and raw.strip():
                    candidatos = [BytesParser(policy=policy.default).parsebytes(raw)]
                else:
                    candidatos = []

        mensajes = []
        for msg in candidatos:
            mensajes.extend(
                procesar_eml_interno(msg, dir_adjuntos, dir_adjuntos_cronologia)
            )
        return mensajes
    except Exception as e:
        print(f"Error procesando email adjunto: {str(e)}")
        return []


def procesar_eml(ruta_archivo, dir_adjuntos, dir_adjuntos_cronologia=None):
    """
    Main entry point: parse a ``.eml`` file and extract its messages.

    The file is read in binary mode, parsed with the default email policy and
    handed to ``procesar_eml_interno``. Returns the list it produces, or an
    empty list if the file cannot be read or processed.
    """
    try:
        print(f"  📧 Abriendo archivo: {ruta_archivo}")
        with open(ruta_archivo, "rb") as archivo_eml:
            mensaje_raiz = email.message_from_binary_file(
                archivo_eml, policy=policy.default
            )

        extraidos = procesar_eml_interno(
            mensaje_raiz, dir_adjuntos, dir_adjuntos_cronologia
        )
        print(f"  📧 Procesamiento completado: {len(extraidos)} mensajes extraídos")
    except Exception as e:
        print(f"❌ Error al abrir el archivo {ruta_archivo}: {str(e)}")
        return []
    return extraidos


def _iterar_partes_propias(mensaje):
    """
    Yield ``mensaje`` and its sub-parts depth-first, like ``Message.walk()``,
    but without descending into attached emails.

    A ``message/rfc822`` part is yielded and then left alone: the parts of the
    attached email belong to that email, not to ``mensaje``.
    """
    yield mensaje
    if mensaje.get_content_type() == "message/rfc822" or not mensaje.is_multipart():
        return
    for subparte in mensaje.get_payload():
        yield from _iterar_partes_propias(subparte)


def procesar_eml_interno(mensaje, dir_adjuntos, dir_adjuntos_cronologia=None):
    """
    Process one email message, whether read from a file or attached to
    another email.

    Args:
        mensaje: The parsed email message.
        dir_adjuntos: Directory handed to ``guardar_adjunto`` /
            ``guardar_imagen`` for regular attachments.
        dir_adjuntos_cronologia: Optional directory handed to
            ``guardar_imagen`` for inline images referenced by Content-ID.
            When it is not given, inline images are not saved or embedded.

    Returns:
        A list of ``MensajeEmail`` objects: the messages extracted from
        attached emails, followed by this message itself when it has content,
        a subject or attachments. Errors are printed, never raised.
    """
    mensajes = []

    try:
        remitente = mensaje.get("from", "")
        fecha = _parsear_fecha(mensaje.get("date", ""))

        # The header subject wins over any subject found in the body.
        subject = mensaje.get("subject", "")
        if subject:
            try:
                decoded = email.header.decode_header(subject)
                subject = str(email.header.make_header(decoded))
            except Exception as e:
                # A subject that cannot be decoded must not discard the whole
                # message; keep the raw header text.
                print(f"Error decodificando asunto: {str(e)}")
                subject = str(subject)

        contenido = ""
        adjuntos = []

        if mensaje.is_multipart():
            # Only the parts that belong to this message: the parts of an
            # attached email are handled when that email is processed, so its
            # body and attachments do not leak into this one.
            partes = list(_iterar_partes_propias(mensaje))

            # An HTML file sent as an attachment is not an HTML body.
            tiene_html = any(
                parte.get_content_type() == "text/html"
                and parte.get_content_disposition() != "attachment"
                for parte in partes
            )

            # Map content id -> embed path for inline images.
            cid_to_link = {}
            if dir_adjuntos_cronologia:
                for parte in partes:
                    if not parte.get_content_type().startswith("image/"):
                        continue
                    if parte.get_content_disposition() not in (None, "inline"):
                        continue
                    # Header lookup is case-insensitive, so this also finds
                    # "Content-Id".
                    cid_header = parte.get("Content-ID", "")
                    if not cid_header:
                        continue
                    # Inline images are stored ONLY in the chronology folder.
                    ruta_img = guardar_imagen(parte, dir_adjuntos_cronologia)
                    if ruta_img:
                        cid_clean = cid_header.strip().strip("<>")
                        cid_to_link[cid_clean] = _ruta_relativa_vault(ruta_img)

            # The main body is captured once and never overwritten.
            contenido_set = False
            for parte in partes:
                content_type = parte.get_content_type()

                try:
                    es_adjunto = parte.get_content_disposition() == "attachment"
                    es_cuerpo = not es_adjunto and (
                        content_type == "text/html"
                        or (content_type == "text/plain" and not tiene_html)
                    )

                    # 1. MAIN CONTENT: first usable body part
                    if not contenido_set and es_cuerpo:
                        if content_type == "text/html":
                            html_content = _get_payload_safely(parte)
                            if html_content:
                                part_subject, text = _html_a_markdown(
                                    html_content, cid_to_link
                                )
                                if not subject and part_subject:
                                    subject = part_subject
                                if text:
                                    contenido = text
                                    contenido_set = True
                        else:  # text/plain
                            text = _get_payload_safely(parte)
                            if text:
                                contenido = text
                                contenido_set = True

                    # 2. ATTACHED RFC822 EMAILS
                    elif content_type == "message/rfc822":
                        mensajes.extend(
                            _procesar_email_adjunto(
                                parte, dir_adjuntos, dir_adjuntos_cronologia
                            )
                        )

                    # 3. OTHER ATTACHMENTS
                    elif es_adjunto:
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith(".eml"):
                            mensajes.extend(
                                _procesar_email_adjunto(
                                    parte, dir_adjuntos, dir_adjuntos_cronologia
                                )
                            )
                        elif content_type.startswith("image/"):
                            # Attached (not inline) image: stored with the
                            # regular attachments only.
                            ruta_img = guardar_imagen(parte, dir_adjuntos)
                            if ruta_img:
                                adjuntos.append(Path(ruta_img).name)
                        else:
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)

                    # 4. INLINE IMAGES need nothing here: they were saved and
                    #    mapped above and are not listed as attachments.

                except Exception as e:
                    print(f"Error procesando parte del mensaje: {str(e)}")
        else:
            if mensaje.get_content_type() == "text/html":
                html_content = _get_payload_safely(mensaje)
                if html_content:
                    # A non-multipart message has no inline cid to resolve.
                    part_subject, contenido = _html_a_markdown(html_content, {})
                    if not subject and part_subject:
                        subject = part_subject
            else:
                contenido = _get_payload_safely(mensaje) or ""

        # Only keep the message if it carries something useful.
        if contenido or subject or adjuntos:
            mensaje_nuevo = MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos,
            )
            print("    ✉️ Mensaje extraído:")
            print(f"      - Subject: {subject}")
            print(f"      - Remitente: {remitente}")
            print(f"      - Fecha: {fecha}")
            print(f"      - Adjuntos: {len(adjuntos)} archivos")
            print(f"      - Contenido: {len(contenido)} caracteres")
            print(f"      - Hash generado: {mensaje_nuevo.hash}")
            mensajes.append(mensaje_nuevo)
        else:
            print("    ⚠️ Mensaje vacío o sin contenido útil - no se agregará")

    except Exception as e:
        print(f"Error procesando mensaje: {str(e)}")

    return mensajes


def _parsear_fecha(fecha_str):
    try:
        fecha = parsedate_to_datetime(fecha_str)
        return fecha.replace(tzinfo=None)  # Remove timezone info
    except Exception:
        try:
            fecha_match = re.search(
                r"venerd=EC (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})", fecha_str
            )
            if fecha_match:
                dia, mes, año, hora, minuto = fecha_match.groups()
                meses_it = {
                    "gennaio": 1,
                    "febbraio": 2,
                    "marzo": 3,
                    "aprile": 4,
                    "maggio": 5,
                    "giugno": 6,
                    "luglio": 7,
                    "agosto": 8,
                    "settembre": 9,
                    "ottobre": 10,
                    "novembre": 11,
                    "dicembre": 12,
                }
                mes_num = meses_it.get(mes.lower(), 1)
                return datetime(int(año), mes_num, int(dia), int(hora), int(minuto))
        except Exception:
            pass
        return datetime.now()