ParamManagerScripts/backend/script_groups/EmailCrono/utils/email_parser.py

# utils/email_parser.py
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime
import re
from pathlib import Path
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto
import tempfile
import os


def _get_payload_safely(parte):
    """
    Obtiene el payload de una parte del email de forma segura
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or "utf-8"
        return payload.decode(charset, errors="ignore")
    except Exception as e:
        print(f"Error getting payload: {str(e)}")
        return None


def _extract_subject_from_text(text):
    """
    Extrae el asunto de un texto dados diferentes formatos de cabecera
    """
    subject_headers = {
        "Oggetto: ": 9,  # Italian
        "Subject: ": 9,  # English
        "Asunto: ": 8,  # Spanish
        "Sujet: ": 7,  # French
        "Betreff: ": 9,  # German
    }

    for line in text.split("\n"):
        line = line.strip()
        for header, offset in subject_headers.items():
            if line.startswith(header):
                return line[offset:].strip()
    return None


def _should_skip_line(line):
    """
    Determina si una línea debe ser omitida por ser una cabecera de email
    """
    headers_to_skip = [
        "Da: ",
        "Inviato: ",
        "A: ",  # Italian
        "From: ",
        "Sent: ",
        "To: ",  # English
        "De: ",
        "Enviado: ",
        "Para: ",  # Spanish
        "Von: ",
        "Gesendet: ",
        "An: ",  # German
        "De : ",
        "Envoyé : ",
        "À : ",  # French
    ]
    return any(line.strip().startswith(header) for header in headers_to_skip)


def _html_a_markdown(html):
    """
    Convierte contenido HTML a texto markdown, extrayendo el asunto si está presente
    """
    if html is None:
        return (None, "")

    try:
        # Limpieza básica
        html = html.replace("\xa0", " ")  # NBSP a espacio normal
        html = html.replace("\r\n", "\n")  # CRLF a LF
        html = html.replace("\r", "\n")  # CR a LF

        soup = BeautifulSoup(html, "html.parser")

        # Procesar tablas
        for table in soup.find_all("table"):
            try:
                rows = table.find_all("tr")
                if not rows:
                    continue

                # Matriz para almacenar la tabla procesada
                table_matrix = []
                max_cols = 0

                # Primera pasada: crear matriz y procesar rowspans/colspans
                row_idx = 0
                while row_idx < len(rows):
                    row = rows[row_idx]
                    cells = row.find_all(["th", "td"])
                    if not cells:
                        row_idx += 1
                        continue

                    # Expandir matriz si es necesario
                    while len(table_matrix) <= row_idx:
                        table_matrix.append([])

                    col_idx = 0
                    for cell in cells:
                        # Encontrar la siguiente columna disponible
                        while (
                            col_idx < len(table_matrix[row_idx])
                            and table_matrix[row_idx][col_idx] is not None
                        ):
                            col_idx += 1

                        # Obtener rowspan y colspan
                        rowspan = int(cell.get("rowspan", 1))
                        colspan = int(cell.get("colspan", 1))

                        # Procesar el texto de la celda reemplazando saltos de línea por <br>
                        cell_text = cell.get_text().strip()
                        cell_text = cell_text.replace("\n", "<br>")
                        cell_text = re.sub(
                            r"\s*<br>\s*<br>\s*", "<br>", cell_text
                        )  # Eliminar <br> múltiples
                        cell_text = cell_text.strip()

                        # Rellenar la matriz con el texto y None para las celdas combinadas
                        for r in range(rowspan):
                            current_row = row_idx + r
                            # Expandir matriz si es necesario
                            while len(table_matrix) <= current_row:
                                table_matrix.append([])
                            # Expandir fila si es necesario
                            while (
                                len(table_matrix[current_row]) <= col_idx + colspan - 1
                            ):
                                table_matrix[current_row].append(None)

                            for c in range(colspan):
                                if r == 0 and c == 0:
                                    table_matrix[current_row][col_idx + c] = cell_text
                                else:
                                    table_matrix[current_row][col_idx + c] = ""

                        col_idx += colspan

                    max_cols = max(max_cols, col_idx)
                    row_idx += 1

                # Asegurar que todas las filas tengan el mismo número de columnas
                for row in table_matrix:
                    while len(row) < max_cols:
                        row.append("")

                # Calcular anchos máximos por columna
                col_widths = [0] * max_cols
                for row in table_matrix:
                    for col_idx, cell in enumerate(row):
                        if cell is not None:
                            col_widths[col_idx] = max(
                                col_widths[col_idx], len(str(cell))
                            )

                # Generar tabla Markdown
                markdown_table = []

                # Cabecera
                if table_matrix:
                    header = "|"
                    for col_idx, width in enumerate(col_widths):
                        cell = str(table_matrix[0][col_idx] or "")
                        header += f" {cell.ljust(width)} |"
                    markdown_table.append(header)

                    # Separador
                    separator = "|"
                    for width in col_widths:
                        separator += "-" * (width + 2) + "|"
                    markdown_table.append(separator)

                    # Contenido
                    for row_idx in range(1, len(table_matrix)):
                        row_text = "|"
                        for col_idx, width in enumerate(col_widths):
                            cell = str(table_matrix[row_idx][col_idx] or "")
                            row_text += f" {cell.ljust(width)} |"
                        markdown_table.append(row_text)

                # Reemplazar la tabla HTML con la versión Markdown
                if markdown_table:
                    table.replace_with(
                        soup.new_string("\n" + "\n".join(markdown_table) + "\n")
                    )

            except Exception as e:
                print(f"Error procesando tabla: {str(e)}")
                continue

        # Procesar saltos de línea
        for br in soup.find_all("br"):
            br.replace_with("\n")

        # Obtener texto limpio
        text = soup.get_text()

        # Procesar líneas
        cleaned_lines = []
        subject = None

        for line in text.split("\n"):
            if not subject:
                subject = _extract_subject_from_text(line)

            if not _should_skip_line(line):
                cleaned_lines.append(line)

        final_text = "\n".join(cleaned_lines).strip()
        return (subject, final_text)

    except Exception as e:
        print(f"Error en html_a_markdown: {str(e)}")
        return (None, html if html else "")


def _procesar_email_adjunto(parte, dir_adjuntos):
    """
    Procesa un email que viene como adjunto dentro de otro email.
    """
    try:
        mensajes = []
        if parte.is_multipart():
            # Si es multipart, procesar cada subparte
            for subparte in parte.walk():
                if subparte.get_content_type() == "message/rfc822":
                    # Si es un mensaje RFC822, obtener el payload como lista
                    payload = subparte.get_payload()
                    if isinstance(payload, list):
                        for msg in payload:
                            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
                    elif isinstance(payload, email.message.Message):
                        mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
        else:
            # Si no es multipart, intentar procesar como mensaje único
            payload = parte.get_payload()
            if isinstance(payload, list):
                for msg in payload:
                    mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
            elif isinstance(payload, email.message.Message):
                mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))

        return mensajes
    except Exception as e:
        print(f"Error procesando email adjunto: {str(e)}")
        return []


def procesar_eml(ruta_archivo, dir_adjuntos):
    """
    Main entry point for processing a .eml file.

    Parses the file at `ruta_archivo` and returns the list of messages
    extracted from it by `procesar_eml_interno`. Any failure is reported on
    stdout and yields an empty list.
    """
    try:
        print(f"  📧 Abriendo archivo: {ruta_archivo}")
        parser = BytesParser(policy=policy.default)
        with open(ruta_archivo, "rb") as fichero:
            raiz = parser.parse(fichero)

        extraidos = procesar_eml_interno(raiz, dir_adjuntos)
        total = len(extraidos)
        print(f"  📧 Procesamiento completado: {total} mensajes extraídos")
        return extraidos
    except Exception as exc:
        print(f"❌ Error al abrir el archivo {ruta_archivo}: {str(exc)}")
        return []


def _iter_partes_propias(mensaje):
    """Yield mensaje and its sub-parts depth-first, without entering attached emails."""
    yield mensaje
    # message/rfc822 parts are yielded but not entered: their body and
    # attachments belong to the attached email, not to this one.
    if mensaje.is_multipart() and mensaje.get_content_type() != "message/rfc822":
        for subparte in mensaje.get_payload():
            yield from _iter_partes_propias(subparte)


def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Process one email message, whether read from a file or found attached
    to another message.

    Args:
        mensaje: the parsed ``email.message.Message`` to process.
        dir_adjuntos: directory passed to ``guardar_adjunto`` for the
            message's attachments.

    Returns:
        A list of ``MensajeEmail``: first the messages extracted from attached
        emails (``message/rfc822`` parts and ``.eml`` attachments), then this
        message itself if it has content, a subject or attachments. Errors
        are reported on stdout and whatever was collected so far is returned.
    """
    mensajes = []

    try:
        remitente = mensaje.get("from", "")
        fecha_str = mensaje.get("date", "")
        fecha = _parsear_fecha(fecha_str)

        # Get subject from email headers first
        subject = mensaje.get("subject", "")
        if subject:
            try:
                # Try to decode if it's encoded
                subject = str(
                    email.header.make_header(email.header.decode_header(subject))
                )
            except (email.errors.MessageError, LookupError, ValueError):
                # A malformed encoded subject must not discard the message.
                subject = str(subject)

        contenido = ""
        adjuntos = []

        if mensaje.is_multipart():
            # Only this message's own parts; attached emails are handled as
            # separate messages by _procesar_email_adjunto.
            partes = list(_iter_partes_propias(mensaje))

            # HTML is preferred over plain text, but an attached .html file
            # is not a body candidate.
            tiene_html = any(
                parte.get_content_type() == "text/html"
                and parte.get_content_disposition() != "attachment"
                for parte in partes
            )

            # The main body is captured once and never overwritten.
            contenido_set = False
            for parte in partes:
                content_type = parte.get_content_type()

                try:
                    es_adjunto = parte.get_content_disposition() == "attachment"
                    es_cuerpo = (not es_adjunto) and (
                        content_type == "text/html"
                        or (content_type == "text/plain" and not tiene_html)
                    )

                    # -----------------------------
                    # 1. MAIN CONTENT
                    # -----------------------------
                    if es_cuerpo and not contenido_set:
                        if content_type == "text/html":
                            html_content = _get_payload_safely(parte)
                            if html_content:
                                part_subject, text = _html_a_markdown(html_content)
                                if not subject and part_subject:
                                    subject = part_subject
                                if text:
                                    contenido = text
                                    contenido_set = True
                        else:  # text/plain
                            text = _get_payload_safely(parte)
                            if text:
                                contenido = text
                                contenido_set = True

                    # -----------------------------
                    # 2. ATTACHED RFC822 EMAILS
                    # -----------------------------
                    elif content_type == "message/rfc822":
                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                        mensajes.extend(mensajes_adjuntos)

                    # -----------------------------
                    # 3. OTHER ATTACHMENTS
                    # -----------------------------
                    elif es_adjunto:
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith(".eml"):
                            mensajes_adjuntos = _procesar_email_adjunto(
                                parte, dir_adjuntos
                            )
                            mensajes.extend(mensajes_adjuntos)
                        else:
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)

                except Exception as e:
                    # One bad part must not prevent processing the others.
                    print(f"Error procesando parte del mensaje: {str(e)}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                html_content = _get_payload_safely(mensaje)
                if html_content:
                    part_subject, contenido = _html_a_markdown(html_content)
                    if not subject and part_subject:
                        subject = part_subject
            else:
                contenido = _get_payload_safely(mensaje) or ""

        # Only add the message if it carries something useful
        if contenido or subject or adjuntos:
            mensaje_nuevo = MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos,
            )
            print(f"    ✉️ Mensaje extraído:")
            print(f"      - Subject: {subject}")
            print(f"      - Remitente: {remitente}")
            print(f"      - Fecha: {fecha}")
            print(f"      - Adjuntos: {len(adjuntos)} archivos")
            print(f"      - Contenido: {len(contenido)} caracteres")
            print(f"      - Hash generado: {mensaje_nuevo.hash}")
            mensajes.append(mensaje_nuevo)
        else:
            print(f"    ⚠️ Mensaje vacío o sin contenido útil - no se agregará")

    except Exception as e:
        print(f"Error procesando mensaje: {str(e)}")

    return mensajes


def _parsear_fecha(fecha_str):
    try:
        fecha = parsedate_to_datetime(fecha_str)
        return fecha.replace(tzinfo=None)  # Remove timezone info
    except:
        try:
            fecha_match = re.search(
                r"venerd=EC (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})", fecha_str
            )
            if fecha_match:
                dia, mes, año, hora, minuto = fecha_match.groups()
                meses_it = {
                    "gennaio": 1,
                    "febbraio": 2,
                    "marzo": 3,
                    "aprile": 4,
                    "maggio": 5,
                    "giugno": 6,
                    "luglio": 7,
                    "agosto": 8,
                    "settembre": 9,
                    "ottobre": 10,
                    "novembre": 11,
                    "dicembre": 12,
                }
                mes_num = meses_it.get(mes.lower(), 1)
                return datetime(int(año), mes_num, int(dia), int(hora), int(minuto))
        except:
            pass
        return datetime.now()