Intento fallido de separar los emails reenviados

2025-02-05 15:04:43 +01:00 · 2025-02-05 15:04:43 +01:00 · 220e911731
parent b9c3024e04
commit 220e911731
5 changed files with 230 additions and 1408 deletions
--- a/TEST.eml
+++ b/TEST.eml
--- a/utils/pycache/email_parser.cpython-310.pyc
+++ b/utils/pycache/email_parser.cpython-310.pyc
--- a/utils/pycache/forward_handler.cpython-310.pyc
+++ b/utils/pycache/forward_handler.cpython-310.pyc
--- a/utils/email_parser.py
+++ b/utils/email_parser.py
@ -9,6 +9,7 @@ from bs4 import BeautifulSoup
 from email.utils import parsedate_to_datetime
 from models.mensaje_email import MensajeEmail
 from utils.attachment_handler import guardar_adjunto
 from utils.forward_handler import extract_forwarded_messages
 import tempfile
 import os
@ -183,96 +184,6 @@ def procesar_eml(ruta_archivo, dir_adjuntos):
        print(f"Error al abrir el archivo {ruta_archivo}: {str(e)}")
        return []
 def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Convert one parsed email (read from a file or attached to another email)
    into a list of MensajeEmail objects.

    Messages obtained from attached emails are added while the parts are
    visited; the message itself is appended last, and only when it has a
    body, a subject or at least one saved attachment. Errors are printed
    and never raised to the caller.
    """
    mensajes = []
    try:
        remitente = mensaje.get('from', '')
        fecha = _parsear_fecha(mensaje.get('date', ''))
        # The header subject has priority; decode it in case it is encoded.
        subject = mensaje.get('subject', '')
        if subject:
            cabecera = email.header.decode_header(subject)
            subject = str(email.header.make_header(cabecera))
        contenido = ""
        adjuntos = []
        if not mensaje.is_multipart():
            # Single-part message: the payload is the whole body.
            if mensaje.get_content_type() == "text/html":
                html = _get_payload_safely(mensaje)
                if html:
                    asunto_html, contenido = _html_a_markdown(html)
                    if not subject and asunto_html:
                        subject = asunto_html
            else:
                contenido = _get_payload_safely(mensaje) or ""
        else:
            # Plain text is only used when no HTML part exists anywhere.
            tiene_html = any(
                p.get_content_type() == "text/html" for p in mensaje.walk()
            )
            # NOTE(review): walk() also yields the parts inside attached
            # message/rfc822 emails, so their text and attachments are seen
            # here as well - confirm this is intended.
            for parte in mensaje.walk():
                tipo = parte.get_content_type()
                try:
                    if tipo == "text/html":
                        html = _get_payload_safely(parte)
                        if html:
                            asunto_html, texto = _html_a_markdown(html)
                            if not subject and asunto_html:
                                subject = asunto_html
                            if texto:
                                contenido = texto
                    elif tipo == "text/plain" and not tiene_html:
                        texto = _get_payload_safely(parte)
                        if texto:
                            contenido = texto
                    elif tipo == "message/rfc822":
                        # Attached email
                        mensajes.extend(_procesar_email_adjunto(parte, dir_adjuntos))
                    elif parte.get_content_disposition() == 'attachment':
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith('.eml'):
                            # Attached .eml file: treat it as an email
                            mensajes.extend(_procesar_email_adjunto(parte, dir_adjuntos))
                        else:
                            # Any other attachment is saved to disk
                            ruta = guardar_adjunto(parte, dir_adjuntos)
                            if ruta:
                                adjuntos.append(Path(ruta).name)
                except Exception as e:
                    # A broken part must not stop the remaining parts.
                    print(f"Error procesando parte del mensaje: {str(e)}")
        # Only add the message when it carries something useful
        tiene_datos = bool(contenido or subject or adjuntos)
        if tiene_datos:
            principal = MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos
            )
            mensajes.append(principal)
    except Exception as e:
        print(f"Error procesando mensaje: {str(e)}")
    return mensajes
 def _parsear_fecha(fecha_str):
    try:
@ -292,4 +203,88 @@ def _parsear_fecha(fecha_str):
                return datetime(int(año), mes_num, int(dia), int(hora), int(minuto))
        except:
            pass
-        return datetime.now()
+        return datetime.now()
 def _iterar_partes_propias(mensaje):
    """
    Yield ``mensaje`` and its sub-parts depth-first without opening attached emails.

    ``Message.walk()`` also descends into ``message/rfc822`` parts, which would
    mix the body and attachments of an attached email into the enclosing
    message. Here such a part is yielded as a whole and not entered.
    """
    yield mensaje
    if not mensaje.is_multipart():
        return
    for sub in mensaje.get_payload():
        if sub.get_content_type() == "message/rfc822":
            yield sub
        else:
            yield from _iterar_partes_propias(sub)


 def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Process one email message, whether read from a file or attached to another.

    Args:
        mensaje: parsed message object (accessed through ``get``,
            ``is_multipart``, ``get_content_type`` and related methods).
        dir_adjuntos: directory passed on to ``guardar_adjunto`` and
            ``_procesar_email_adjunto``.

    Returns:
        list: the message itself first (only when it has a body, a subject or
        saved attachments), followed by the messages obtained from attached
        emails and then the forwarded messages found in the body text.

    Errors are printed and never raised; on failure the messages collected so
    far are returned.
    """
    mensajes = []
    try:
        remitente = mensaje.get('from', '')
        fecha_str = mensaje.get('date', '')
        fecha = _parsear_fecha(fecha_str)
        # The header subject has priority; decode it in case it is encoded.
        subject = mensaje.get('subject', '')
        if subject:
            subject = str(email.header.make_header(email.header.decode_header(subject)))
        contenido = ""
        adjuntos = []
        contenido_html = None
        if mensaje.is_multipart():
            # Visit only the parts that belong to this message: attached
            # emails are delegated as a whole to _procesar_email_adjunto.
            for parte in _iterar_partes_propias(mensaje):
                content_type = parte.get_content_type()
                try:
                    if content_type == "text/html":
                        html_content = _get_payload_safely(parte)
                        if html_content:
                            contenido_html = html_content
                    elif content_type == "text/plain" and not contenido_html:
                        # Plain text is a fallback until an HTML part is seen.
                        contenido = _get_payload_safely(parte) or ""
                    elif content_type == "message/rfc822":
                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                        mensajes.extend(mensajes_adjuntos)
                    elif parte.get_content_disposition() == 'attachment':
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith('.eml'):
                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                            mensajes.extend(mensajes_adjuntos)
                        else:
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)
                except Exception as e:
                    # A broken part must not stop the remaining parts.
                    print(f"Error procesando parte del mensaje: {str(e)}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                contenido_html = _get_payload_safely(mensaje)
            else:
                contenido = _get_payload_safely(mensaje) or ""
        # Convert the HTML body, if any, to markdown
        if contenido_html:
            part_subject, text = _html_a_markdown(contenido_html)
            if not subject and part_subject:
                subject = part_subject
            # Keep the plain-text body when the HTML converts to nothing.
            if text:
                contenido = text
        # Split forwarded messages out of the body text
        if contenido:
            print(f"\nBuscando mensajes reenviados en contenido ({len(contenido)} chars)")
            print("Primeros 200 chars:", contenido[:200])
            contenido_principal, mensajes_reenviados = extract_forwarded_messages(contenido)
            print(f"Encontrados {len(mensajes_reenviados)} mensajes reenviados")
            contenido = contenido_principal
            mensajes.extend(mensajes_reenviados)
        # Only add the main message when it carries something useful; it goes
        # first so that attached and forwarded messages follow it.
        if contenido or subject or adjuntos:
            mensajes.insert(0, MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos
            ))
    except Exception as e:
        print(f"Error procesando mensaje: {str(e)}")
    return mensajes
--- a/utils/forward_handler.py
+++ b/utils/forward_handler.py
@ -0,0 +1,144 @@
 # utils/forward_handler.py
 import re
 import os
 from datetime import datetime
 from email.utils import parseaddr
 from models.mensaje_email import MensajeEmail
 # Markers that open a forwarded/quoted message, in several languages.
 # extract_forwarded_messages joins these into one alternation, so a pattern
 # must not begin with a global inline flag such as (?m): once joined, the flag
 # is no longer at the start of the expression, which Python 3.11+ rejects.
 # Use the scoped form (?m:...) instead.
 FORWARD_PATTERNS = [
    r"[-]{3,}\s*Messaggio originale\s*[-]{3,}",     # Italian
    r"[-]{3,}\s*Original Message\s*[-]{3,}",        # English
    r"[-]{3,}\s*Mensaje original\s*[-]{3,}",        # Spanish
    r"[-]{3,}\s*Message d'origine\s*[-]{3,}",       # French
    r"[-]{3,}\s*Ursprüngliche Nachricht\s*[-]{3,}", # German
    # More flexible variants
    r"[-]{3,}\s*Forwarded message\s*[-]{3,}",
    r"[-]{3,}\s*Mensaje reenviado\s*[-]{3,}",
    r"[-]{3,}\s*Messaggio inoltrato\s*[-]{3,}",
    # Quoted with a leading ">" at the start of a line, usual in plain text
    r"(?m:^)>\s*[-]{3,}\s*Messaggio originale\s*[-]{3,}",
    r"(?m:^)>\s*[-]{3,}\s*Original Message\s*[-]{3,}"
 ]
 # Header line patterns in several languages; group 1 captures the value
 HEADER_PATTERNS = {
    'from': [
        r"Da:\s*(.*)",       # Italian
        r"From:\s*(.*)",     # English
        r"De:\s*(.*)",       # Spanish
        r"Von:\s*(.*)",      # German
        r"De :\s*(.*)"       # French
    ],
    'date': [
        r"Inviato:\s*(.*)",  # Italian
        r"Sent:\s*(.*)",     # English
        r"Enviado:\s*(.*)",  # Spanish
        r"Gesendet:\s*(.*)", # German
        r"Envoyé :\s*(.*)"   # French
    ],
    'subject': [
        r"Oggetto:\s*(.*)",  # Italian
        r"Subject:\s*(.*)",  # English
        r"Asunto:\s*(.*)",   # Spanish
        r"Betreff:\s*(.*)",  # German
        r"Sujet :\s*(.*)"    # French
    ]
 }
 def extract_forwarded_messages(contenido):
    """
    Split an email body at the forwarded-message markers.

    Returns a tuple ``(contenido_original, mensajes)``: the stripped text that
    precedes the first marker, and a list with one MensajeEmail for every
    non-empty forwarded section that could be processed.
    """
    # Non-capturing alternation of every marker: re.split then returns only
    # the text found between markers.
    separador = '|'.join(f"(?:{patron})" for patron in FORWARD_PATTERNS)
    principal, *reenviados = re.split(separador, contenido)
    mensajes = []
    for fragmento in reenviados:
        texto = fragmento.strip()
        if not texto:
            continue
        mensaje = _procesar_contenido_reenviado(texto)
        if mensaje:
            mensajes.append(mensaje)
    return principal.strip(), mensajes
 def _procesar_contenido_reenviado(contenido):
    """
    Build a MensajeEmail from the text that follows a forwarded-message marker.

    Header lines (sender, date, subject) are recognised with HEADER_PATTERNS.
    The body is the text after the last header line that was found, or the
    whole text when no header is found.

    Args:
        contenido: text of the forwarded section, without the marker.

    Returns:
        A MensajeEmail, or None when there is neither sender, date nor body,
        or when the MensajeEmail cannot be created.
    """
    lineas = contenido.split('\n')
    campos = ('from', 'date', 'subject')
    valores = dict.fromkeys(campos)
    headers_encontrados = 0
    # Index of the first line after the last header found. Tracking it keeps
    # the body when fewer than three headers are present, instead of consuming
    # every line while looking for the missing ones.
    fin_headers = 0
    for indice, linea_original in enumerate(lineas):
        if headers_encontrados >= len(campos):
            break
        linea = linea_original.strip()
        for campo in campos:
            if valores[campo]:
                continue
            for pattern in HEADER_PATTERNS[campo]:
                match = re.match(pattern, linea)
                if match:
                    valores[campo] = match.group(1).strip()
                    headers_encontrados += 1
                    fin_headers = indice + 1
                    break
    remitente = valores['from']
    fecha_str = valores['date']
    subject = valores['subject']
    # When headers were found, the body starts after the last of them
    cuerpo = contenido
    if headers_encontrados > 0:
        cuerpo = '\n'.join(lineas[fin_headers:]).strip()
    # Without the minimum information there is nothing to return
    if not (remitente or fecha_str or cuerpo):
        return None
    try:
        # NOTE(review): fecha is the raw header text when a date header was
        # found and a datetime otherwise - confirm MensajeEmail accepts both.
        return MensajeEmail(
            remitente=remitente or "Remitente Desconocido",
            fecha=fecha_str or datetime.now(),
            contenido=cuerpo,
            subject=subject,
            adjuntos=[]
        )
    except Exception as e:
        print(f"Error creando mensaje reenviado: {str(e)}")
        return None