# EmailCrono/utils/email_parser.py

# utils/email_parser.py
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime
import re
from pathlib import Path
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto
from utils.forward_handler import extract_forwarded_messages
import tempfile
import os

def _get_payload_safely(parte):
    """
    Return the decoded text payload of a single (non-multipart) email part.

    The transfer encoding (base64, quoted-printable, ...) is undone by
    ``get_payload(decode=True)`` and the resulting bytes are decoded with the
    charset declared by the part, defaulting to UTF-8 when none is declared.
    Undecodable bytes are dropped (``errors='ignore'``).

    Args:
        parte: an ``email.message.Message`` (or subclass) instance.

    Returns:
        The payload as ``str``, or ``None`` when the part is multipart, has no
        payload, or any unexpected error occurs (the error is printed).
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='ignore')
        except LookupError:
            # The part declares a charset label Python does not know.
            # Losing the whole body for that reason is worse than a
            # best-effort decode, so fall back to UTF-8.
            return payload.decode('utf-8', errors='ignore')
    except Exception as e:
        print(f"Error getting payload: {str(e)}")
        return None

def _extract_subject_from_text(text):
    """
    Find the first subject header line in ``text`` and return its value.

    Each line is stripped and compared against the subject prefixes used in
    Italian, English, Spanish, French and German. The text after the prefix
    is returned stripped; ``None`` is returned when no line matches.
    """
    prefixes = (
        'Oggetto: ',   # Italian
        'Subject: ',   # English
        'Asunto: ',    # Spanish
        'Sujet: ',     # French
        'Betreff: ',   # German
    )

    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        matched = next((p for p in prefixes if candidate.startswith(p)), None)
        if matched is not None:
            return candidate[len(matched):].strip()
    return None

def _should_skip_line(line):
    """
    Tell whether ``line`` is an email header line that should be dropped.

    The line is stripped and checked against the sender / sent-date /
    recipient prefixes used in Italian, English, Spanish, German and French.
    Returns ``True`` when it starts with any of them.
    """
    header_prefixes = (
        # Italian
        'Da: ', 'Inviato: ', 'A: ',
        # English
        'From: ', 'Sent: ', 'To: ',
        # Spanish
        'De: ', 'Enviado: ', 'Para: ',
        # German
        'Von: ', 'Gesendet: ', 'An: ',
        # French
        'De : ', 'Envoyé : ', 'À : ',
    )
    # str.startswith accepts a tuple and returns True on the first match.
    return line.strip().startswith(header_prefixes)

def _html_a_markdown(html):
    """
    Convert an HTML body into plain markdown-like text.

    Tables are rewritten as pipe-delimited markdown tables, ``<br>`` tags
    become newlines, and header lines recognised by ``_should_skip_line``
    are removed from the resulting text. The first subject header found in
    the text (see ``_extract_subject_from_text``) is returned alongside it.

    Returns:
        A ``(subject, text)`` tuple. ``subject`` is ``None`` when no subject
        line is found. ``(None, "")`` is returned for ``None`` input, and
        ``(None, html)`` when the conversion fails (the error is printed).
    """
    if html is None:
        return (None, "")

    try:
        # Basic cleanup: NBSP -> space, then CRLF and lone CR -> LF (in that order).
        for antes, despues in (('\xa0', ' '), ('\r\n', '\n'), ('\r', '\n')):
            html = html.replace(antes, despues)

        soup = BeautifulSoup(html, 'html.parser')

        # Tables are handled before <br> tags so cell text is taken as-is.
        for table in soup.find_all('table'):
            try:
                rows = table.find_all('tr')
                if not rows:
                    continue

                # Stripped text of every cell, row by row.
                grid = [
                    [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
                    for row in rows
                ]

                # Widest text seen in each column; rows may have different lengths.
                widths = []
                for texts in grid:
                    widths.extend([0] * (len(texts) - len(widths)))
                    for col, cell_text in enumerate(texts):
                        widths[col] = max(widths[col], len(cell_text))

                # Without any cell there is nothing to render; leave the table alone.
                if not widths:
                    continue

                def render(texts):
                    padded = (t.ljust(widths[col]) for col, t in enumerate(texts))
                    return '| ' + ' | '.join(padded) + ' |'

                separator = '|' + '|'.join('-' * (w + 2) for w in widths) + '|'

                # The first row is always used as the markdown header row.
                markdown_lines = [render(grid[0]), separator]
                markdown_lines.extend(render(texts) for texts in grid[1:])

                table.replace_with(soup.new_string('\n' + '\n'.join(markdown_lines)))
            except Exception as e:
                print(f"Error procesando tabla: {str(e)}")
                continue

        # Turn explicit line breaks into newlines.
        for br in soup.find_all('br'):
            br.replace_with('\n')

        lines = soup.get_text().split('\n')

        # First truthy subject found, scanning the lines in order.
        subject = next(
            (found for found in map(_extract_subject_from_text, lines) if found),
            None,
        )
        kept = [line for line in lines if not _should_skip_line(line)]

        return (subject, '\n'.join(kept).strip())

    except Exception as e:
        print(f"Error en html_a_markdown: {str(e)}")
        return (None, html if html else "")

def _procesar_email_adjunto(parte, dir_adjuntos):
    """
    Process an email that arrives attached inside another email.

    Embedded messages are located as follows:

    * a ``message/rfc822`` part whose payload was already parsed by the
      ``email`` package contributes its payload message(s);
    * any other multipart container is searched for ``message/rfc822``
      parts, without entering the embedded messages themselves (each
      embedded message is handed to ``procesar_eml_interno``, which takes
      care of whatever is nested inside it, so entering it here would
      process nested emails twice);
    * a leaf part carrying the raw bytes of an ``.eml`` file (for example
      ``application/octet-stream``) is decoded and parsed.

    Args:
        parte: the email part holding the attached message.
        dir_adjuntos: directory passed through to ``procesar_eml_interno``.

    Returns:
        The list of messages produced by ``procesar_eml_interno`` for every
        embedded message found; ``[]`` on any error (the error is printed).
    """
    try:
        embebidos = []
        pendientes = [parte]
        while pendientes:
            actual = pendientes.pop()
            payload = actual.get_payload()
            es_rfc822 = actual.get_content_type() == "message/rfc822"

            if isinstance(payload, list):
                if es_rfc822:
                    embebidos.extend(payload)
                else:
                    # Reversed so children are visited in document order.
                    pendientes.extend(reversed(payload))
            elif isinstance(payload, email.message.Message) and (es_rfc822 or actual is parte):
                embebidos.append(payload)

        if not embebidos and not parte.is_multipart():
            # The attachment was not parsed as a message by the email package
            # (its payload is the encoded file content), so parse it here.
            datos = parte.get_payload(decode=True)
            if datos:
                embebidos.append(BytesParser(policy=policy.default).parsebytes(datos))

        mensajes = []
        for msg in embebidos:
            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
        return mensajes
    except Exception as e:
        print(f"Error procesando email adjunto: {str(e)}")
        return []

def procesar_eml(ruta_archivo, dir_adjuntos):
    """
    Main entry point: parse an .eml file and process the resulting message.

    The file is read in binary mode and parsed with the ``default`` email
    policy; the parsed message is handed to ``procesar_eml_interno``.
    Returns that function's result, or ``[]`` if anything fails (the error
    is printed).
    """
    try:
        with open(ruta_archivo, 'rb') as flujo:
            # Equivalent to BytesParser(policy=policy.default).parse(flujo).
            raiz = email.message_from_binary_file(flujo, policy=policy.default)
        resultado = procesar_eml_interno(raiz, dir_adjuntos)
    except Exception as e:
        print(f"Error al abrir el archivo {ruta_archivo}: {str(e)}")
        resultado = []
    return resultado


# Italian month names as they appear in localized date strings.
_MESES_IT = {
    'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
    'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
    'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12
}

# Italian weekday followed by "D month YYYY HH:MM". The final accented "ì" of
# the weekday may arrive as-is, unaccented, or as undecoded quoted-printable
# ("=EC" for Latin-1, "=C3=AC" for UTF-8).
_FECHA_IT_RE = re.compile(
    r'(?:(?:luned|marted|mercoled|gioved|venerd)(?:=EC|=C3=AC|ì|i)|sabato|domenica)'
    r' (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})',
    re.IGNORECASE,
)


def _parsear_fecha(fecha_str):
    """
    Parse an email date string into a naive ``datetime``.

    Parsing is attempted in this order:

    1. RFC 2822 format via ``email.utils.parsedate_to_datetime``; the
       timezone is dropped without conversion, so the wall-clock time of the
       header is kept.
    2. Italian long format, e.g. ``"venerdì 5 gennaio 2024 10:30"``, for any
       weekday (also when the accent is quoted-printable encoded).
    3. Otherwise the current local time is returned.

    Args:
        fecha_str: value of the ``Date`` header (may be empty or ``None``).

    Returns:
        A naive ``datetime``; this function never raises for bad input.
    """
    try:
        return parsedate_to_datetime(fecha_str).replace(tzinfo=None)
    except Exception:
        # Not an RFC 2822 date (TypeError/ValueError depending on the Python
        # version); try the localized format below.
        pass

    try:
        coincidencia = _FECHA_IT_RE.search(fecha_str)
        if coincidencia:
            dia, mes, anio, hora, minuto = coincidencia.groups()
            # NOTE(review): an unrecognized month name silently becomes
            # January; kept for backward compatibility.
            mes_num = _MESES_IT.get(mes.lower(), 1)
            return datetime(int(anio), mes_num, int(dia), int(hora), int(minuto))
    except Exception:
        # Non-string input or out-of-range date components.
        pass

    return datetime.now()

def _iterar_partes_propias(mensaje):
    """
    Yield the MIME parts that belong to ``mensaje`` itself, depth-first.

    Unlike ``Message.walk()``, a ``message/rfc822`` part is yielded but not
    entered: the parts of an attached email belong to that email, not to the
    message that carries it.
    """
    pendientes = [mensaje]
    while pendientes:
        parte = pendientes.pop()
        yield parte
        if parte.get_content_type() == "message/rfc822":
            continue
        if parte.is_multipart():
            # Reversed so children are popped in document order.
            pendientes.extend(reversed(parte.get_payload()))


def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Process one email message, whether read from a file or found as an
    attachment of another message.

    The body is taken from the message's own ``text/html`` part (converted
    with ``_html_a_markdown``) or, when no HTML was seen yet, from its
    ``text/plain`` part. Attached emails (``message/rfc822`` parts and
    ``.eml`` attachments) are processed recursively through
    ``_procesar_email_adjunto``; other attachments are stored with
    ``guardar_adjunto``. Forwarded messages found in the body text are split
    out with ``extract_forwarded_messages``.

    Args:
        mensaje: the parsed email message.
        dir_adjuntos: directory handed to ``guardar_adjunto`` and to the
            recursive processing of attached emails.

    Returns:
        A list of messages: the main ``MensajeEmail`` first (only when it has
        content, a subject or attachments), followed by attached and
        forwarded messages. On error the messages collected so far are
        returned and the error is printed.
    """
    mensajes = []

    try:
        remitente = mensaje.get('from', '')
        fecha_str = mensaje.get('date', '')
        fecha = _parsear_fecha(fecha_str)

        # The header subject takes precedence over one found in the body.
        subject = mensaje.get('subject', '')
        if subject:
            subject = str(email.header.make_header(email.header.decode_header(subject)))

        contenido = ""
        adjuntos = []
        contenido_html = None

        if mensaje.is_multipart():
            # Only this message's own parts are visited. Walking into an
            # attached email would let its body overwrite this message's
            # body, assign its attachments to this message and process
            # nested emails more than once.
            for parte in _iterar_partes_propias(mensaje):
                try:
                    content_type = parte.get_content_type()
                    es_adjunto = parte.get_content_disposition() == 'attachment'

                    if content_type == "message/rfc822":
                        mensajes.extend(_procesar_email_adjunto(parte, dir_adjuntos))
                    elif es_adjunto:
                        # Checked before the text branches so that attached
                        # .txt/.html files are stored, not taken as the body.
                        nombre = parte.get_filename()
                        mensajes_adjuntos = []
                        if nombre and nombre.lower().endswith('.eml'):
                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                            mensajes.extend(mensajes_adjuntos)
                        if not mensajes_adjuntos:
                            # Regular attachment, or an .eml that produced no
                            # messages: keep the file instead of dropping it.
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)
                    elif content_type == "text/html":
                        html_content = _get_payload_safely(parte)
                        if html_content:
                            contenido_html = html_content
                    elif content_type == "text/plain" and not contenido_html:
                        contenido = _get_payload_safely(parte) or ""
                except Exception as e:
                    print(f"Error procesando parte del mensaje: {str(e)}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                contenido_html = _get_payload_safely(mensaje)
            else:
                contenido = _get_payload_safely(mensaje) or ""

        # HTML, when present, replaces any plain-text body.
        if contenido_html:
            part_subject, text = _html_a_markdown(contenido_html)
            if not subject and part_subject:
                subject = part_subject
            contenido = text

        # Split forwarded messages out of the body text.
        if contenido:
            print(f"\nBuscando mensajes reenviados en contenido ({len(contenido)} chars)")
            print("Primeros 200 chars:", contenido[:200])
            contenido_principal, mensajes_reenviados = extract_forwarded_messages(contenido)
            print(f"Encontrados {len(mensajes_reenviados)} mensajes reenviados")
            contenido = contenido_principal
            mensajes.extend(mensajes_reenviados)

        # Only add the main message when it carries something useful.
        if contenido or subject or adjuntos:
            mensajes.insert(0, MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos
            ))

    except Exception as e:
        print(f"Error procesando mensaje: {str(e)}")

    return mensajes