Intento fallido de separar los emails reenviados

2025-02-05 15:04:43 +01:00 · 2025-02-05 15:04:43 +01:00 · 220e911731
parent b9c3024e04
commit 220e911731
5 changed files with 230 additions and 1408 deletions
--- a/TEST.eml
+++ b/TEST.eml
--- a/utils/__pycache__/email_parser.cpython-310.pyc
+++ b/utils/__pycache__/email_parser.cpython-310.pyc
--- a/utils/__pycache__/forward_handler.cpython-310.pyc
+++ b/utils/__pycache__/forward_handler.cpython-310.pyc
--- a/utils/email_parser.py
+++ b/utils/email_parser.py
@@ -9,6 +9,7 @@ from bs4 import BeautifulSoup
 from email.utils import parsedate_to_datetime
 from models.mensaje_email import MensajeEmail
 from utils.attachment_handler import guardar_adjunto
+from utils.forward_handler import extract_forwarded_messages
 import tempfile
 import os

@@ -183,96 +184,6 @@ def procesar_eml(ruta_archivo, dir_adjuntos):
        print(f"Error al abrir el archivo {ruta_archivo}: {str(e)}")
        return []

-def procesar_eml_interno(mensaje, dir_adjuntos):
-    """
-    Procesa un mensaje de email, ya sea desde archivo o adjunto
-    """
-    mensajes = []
-    
-    try:
-        remitente = mensaje.get('from', '')
-        fecha_str = mensaje.get('date', '')
-        fecha = _parsear_fecha(fecha_str)
-        
-        # Get subject from email headers first
-        subject = mensaje.get('subject', '')
-        if subject:
-            # Try to decode if it's encoded
-            subject = str(email.header.make_header(email.header.decode_header(subject)))
-        
-        contenido = ""
-        adjuntos = []
-        tiene_html = False
-        
-        # First pass: check for HTML content
-        if mensaje.is_multipart():
-            for parte in mensaje.walk():
-                if parte.get_content_type() == "text/html":
-                    tiene_html = True
-                    break
-        else:
-            tiene_html = mensaje.get_content_type() == "text/html"
-        
-        # Second pass: process content and attachments
-        if mensaje.is_multipart():
-            for parte in mensaje.walk():
-                content_type = parte.get_content_type()
-                
-                try:
-                    if content_type == "text/html":
-                        html_content = _get_payload_safely(parte)
-                        if html_content:
-                            part_subject, text = _html_a_markdown(html_content)
-                            if not subject and part_subject:
-                                subject = part_subject
-                            if text:
-                                contenido = text
-                    elif content_type == "text/plain" and not tiene_html:
-                        text = _get_payload_safely(parte)
-                        if text:
-                            contenido = text
-                    elif content_type == "message/rfc822":
-                        # Procesar email adjunto
-                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
-                        mensajes.extend(mensajes_adjuntos)
-                    elif parte.get_content_disposition() == 'attachment':
-                        nombre = parte.get_filename()
-                        if nombre and nombre.lower().endswith('.eml'):
-                            # Si es un archivo .eml adjunto
-                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
-                            mensajes.extend(mensajes_adjuntos)
-                        else:
-                            # Otros tipos de adjuntos
-                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
-                            if ruta_adjunto:
-                                adjuntos.append(Path(ruta_adjunto).name)
-                except Exception as e:
-                    print(f"Error procesando parte del mensaje: {str(e)}")
-                    continue
-        else:
-            if mensaje.get_content_type() == "text/html":
-                html_content = _get_payload_safely(mensaje)
-                if html_content:
-                    part_subject, contenido = _html_a_markdown(html_content)
-                    if not subject and part_subject:
-                        subject = part_subject
-            else:
-                contenido = _get_payload_safely(mensaje) or ""
-        
-        # Solo agregar el mensaje si tiene contenido útil
-        if contenido or subject or adjuntos:
-            mensajes.append(MensajeEmail(
-                remitente=remitente,
-                fecha=fecha,
-                contenido=contenido,
-                subject=subject,
-                adjuntos=adjuntos
-            ))
-        
-    except Exception as e:
-        print(f"Error procesando mensaje: {str(e)}")
-    
-    return mensajes

 def _parsear_fecha(fecha_str):
    try:
@@ -292,4 +203,88 @@ def _parsear_fecha(fecha_str):
                return datetime(int(año), mes_num, int(dia), int(hora), int(minuto))
        except:
            pass
-        return datetime.now()
+        return datetime.now()
+    
+def procesar_eml_interno(mensaje, dir_adjuntos):
+    """
+    Process one email message, whether it comes from a file or an attachment.
+
+    Collects the body from the MIME parts, saves regular attachments through
+    guardar_adjunto, processes attached emails through
+    _procesar_email_adjunto and finally splits forwarded messages out of the
+    body text with extract_forwarded_messages.
+
+    Args:
+        mensaje: message object to process; it is queried through get(),
+            is_multipart(), walk() and get_content_type().
+        dir_adjuntos: directory passed on to guardar_adjunto and
+            _procesar_email_adjunto.
+
+    Returns:
+        List of MensajeEmail objects. When the main message has a body, a
+        subject or attachments it is placed first, followed by the messages
+        from attached emails and then the forwarded messages. Any exception
+        is printed and the messages collected so far are returned.
+    """
+    mensajes = []
+
+    try:
+        remitente = mensaje.get('from', '')
+        fecha_str = mensaje.get('date', '')
+        fecha = _parsear_fecha(fecha_str)
+
+        # Take the subject from the headers first, decoding encoded words.
+        subject = mensaje.get('subject', '')
+        if subject:
+            subject = str(email.header.make_header(email.header.decode_header(subject)))
+
+        contenido = ""
+        adjuntos = []
+        contenido_html = None
+
+        if mensaje.is_multipart():
+            # NOTE(review): walk() also descends into attached message/rfc822
+            # parts, so body parts of an attached email may overwrite the
+            # body captured here - confirm against real samples.
+            for parte in mensaje.walk():
+                content_type = parte.get_content_type()
+
+                try:
+                    # Text parts explicitly marked as attachments are files,
+                    # not the message body: they must be saved, not read.
+                    es_adjunto = parte.get_content_disposition() == 'attachment'
+
+                    if content_type == "text/html" and not es_adjunto:
+                        html_content = _get_payload_safely(parte)
+                        if html_content:
+                            contenido_html = html_content
+                    elif content_type == "text/plain" and not es_adjunto and not contenido_html:
+                        contenido = _get_payload_safely(parte) or ""
+                    elif content_type == "message/rfc822":
+                        # Attached email
+                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
+                        mensajes.extend(mensajes_adjuntos)
+                    elif es_adjunto:
+                        nombre = parte.get_filename()
+                        if nombre and nombre.lower().endswith('.eml'):
+                            # Attached .eml file
+                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
+                            mensajes.extend(mensajes_adjuntos)
+                        else:
+                            # Any other kind of attachment
+                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
+                            if ruta_adjunto:
+                                adjuntos.append(Path(ruta_adjunto).name)
+                except Exception as e:
+                    # Best effort: one broken part must not discard the rest.
+                    print(f"Error procesando parte del mensaje: {str(e)}")
+                    continue
+        else:
+            if mensaje.get_content_type() == "text/html":
+                contenido_html = _get_payload_safely(mensaje)
+            else:
+                contenido = _get_payload_safely(mensaje) or ""
+
+        # Prefer the HTML body converted to markdown, but keep the plain-text
+        # body when the conversion yields nothing.
+        if contenido_html:
+            part_subject, text = _html_a_markdown(contenido_html)
+            if not subject and part_subject:
+                subject = part_subject
+            if text:
+                contenido = text
+
+        # Split forwarded messages out of the body text.
+        if contenido:
+            # Debug output
+            print(f"\nBuscando mensajes reenviados en contenido ({len(contenido)} chars)")
+            print("Primeros 200 chars:", contenido[:200])
+            contenido_principal, mensajes_reenviados = extract_forwarded_messages(contenido)
+            print(f"Encontrados {len(mensajes_reenviados)} mensajes reenviados")
+            contenido = contenido_principal
+            mensajes.extend(mensajes_reenviados)
+
+        # Only add the main message when it carries something useful; it goes
+        # first so that attached and forwarded messages follow it.
+        if contenido or subject or adjuntos:
+            mensajes.insert(0, MensajeEmail(
+                remitente=remitente,
+                fecha=fecha,
+                contenido=contenido,
+                subject=subject,
+                adjuntos=adjuntos
+            ))
+
+    except Exception as e:
+        print(f"Error procesando mensaje: {str(e)}")
+
+    return mensajes
--- a/utils/forward_handler.py
+++ b/utils/forward_handler.py
@ -0,0 +1,144 @@
+# utils/forward_handler.py
+import re
+import os
+from datetime import datetime
+from email.utils import parseaddr
+from models.mensaje_email import MensajeEmail
+
+# Regular expressions that mark the start of a forwarded/quoted message in
+# several languages. Each one matches a separator line such as
+# "-----Original Message-----" (three or more dashes on both sides).
+# NOTE(review): the last two entries start with the global inline flag
+# "(?m)". Python 3.11+ rejects global flags that are not at the very start
+# of an expression, so these entries cannot be embedded verbatim inside a
+# larger alternation.
+FORWARD_PATTERNS = [
+    r"[-]{3,}\s*Messaggio originale\s*[-]{3,}",     # Italian
+    r"[-]{3,}\s*Original Message\s*[-]{3,}",        # English
+    r"[-]{3,}\s*Mensaje original\s*[-]{3,}",        # Spanish
+    r"[-]{3,}\s*Message d'origine\s*[-]{3,}",       # French
+    r"[-]{3,}\s*Ursprüngliche Nachricht\s*[-]{3,}", # German
+    # "Forwarded message" variants (English, Spanish, Italian)
+    r"[-]{3,}\s*Forwarded message\s*[-]{3,}",
+    r"[-]{3,}\s*Mensaje reenviado\s*[-]{3,}",
+    r"[-]{3,}\s*Messaggio inoltrato\s*[-]{3,}",
+    # Separators prefixed with ">" at the start of a line, as usually seen
+    # in quoted plain text
+    r"(?m)^>\s*[-]{3,}\s*Messaggio originale\s*[-]{3,}",
+    r"(?m)^>\s*[-]{3,}\s*Original Message\s*[-]{3,}"
+]
+
+# Header-line patterns in several languages, keyed by the field they feed
+# ('from', 'date', 'subject'). Group 1 of each pattern captures the header
+# value, i.e. everything after the label and optional whitespace.
+HEADER_PATTERNS = {
+    'from': [
+        r"Da:\s*(.*)",       # Italian
+        r"From:\s*(.*)",     # English
+        r"De:\s*(.*)",       # Spanish
+        r"Von:\s*(.*)",      # German
+        r"De :\s*(.*)"       # French
+    ],
+    'date': [
+        r"Inviato:\s*(.*)",  # Italian
+        r"Sent:\s*(.*)",     # English
+        r"Enviado:\s*(.*)",  # Spanish
+        r"Gesendet:\s*(.*)", # German
+        r"Envoyé :\s*(.*)"   # French
+    ],
+    'subject': [
+        r"Oggetto:\s*(.*)",  # Italian
+        r"Subject:\s*(.*)",  # English
+        r"Asunto:\s*(.*)",   # Spanish
+        r"Betreff:\s*(.*)",  # German
+        r"Sujet :\s*(.*)"    # French
+    ]
+}
+
+def extract_forwarded_messages(contenido):
+    """
+    Split forwarded messages out of an email body.
+
+    The text is cut at every separator listed in FORWARD_PATTERNS. The text
+    before the first separator is the main body; each later fragment is
+    handed to _procesar_contenido_reenviado.
+
+    Args:
+        contenido: body text to scan. None or an empty string yields
+            ("", []).
+
+    Returns:
+        Tuple (contenido_original, mensajes): the stripped main body and the
+        list of messages built from the forwarded fragments. Empty fragments
+        and fragments that produce no message are skipped.
+    """
+    if not contenido:
+        return "", []
+
+    # Some patterns start with the global inline flag "(?m)". Python 3.11+
+    # rejects global flags that are not at the very start of an expression,
+    # so the flag is removed from each alternative and MULTILINE is applied
+    # once to the combined expression instead.
+    alternativas = [
+        patron[len("(?m)"):] if patron.startswith("(?m)") else patron
+        for patron in FORWARD_PATTERNS
+    ]
+    # Non-capturing groups keep re.split() from interleaving the matched
+    # separators (and None placeholders) with the text fragments.
+    separador = re.compile(
+        "|".join(f"(?:{alternativa})" for alternativa in alternativas),
+        re.MULTILINE,
+    )
+
+    # First fragment: the main email body; the rest: one per separator.
+    contenido_original, *fragmentos = separador.split(contenido)
+
+    mensajes = []
+    for fragmento in fragmentos:
+        fragmento = fragmento.strip()
+        if not fragmento:
+            continue
+        mensaje = _procesar_contenido_reenviado(fragmento)
+        if mensaje:
+            mensajes.append(mensaje)
+
+    return contenido_original.strip(), mensajes
+
+def _procesar_contenido_reenviado(contenido):
+    """
+    Build a MensajeEmail from the text of one forwarded message.
+
+    Lines are scanned from the top for the header kinds listed in
+    HEADER_PATTERNS ('from', 'date', 'subject'); scanning stops as soon as
+    every kind has been found. The body is whatever follows the last header
+    line that was recognised, so a message with only some of the headers
+    still keeps its body.
+
+    Args:
+        contenido: text of the forwarded message, without its separator.
+
+    Returns:
+        A MensajeEmail, or None when there is neither sender, date nor body,
+        or when MensajeEmail cannot be created (the error is printed).
+    """
+    lineas = contenido.split('\n')
+    encontrados = {}   # header kind -> captured value
+    fin_headers = 0    # index of the first line after the last header found
+
+    for indice, linea_cruda in enumerate(lineas):
+        if len(encontrados) == len(HEADER_PATTERNS):
+            break
+        linea = linea_cruda.strip()
+
+        for campo, patrones in HEADER_PATTERNS.items():
+            if campo in encontrados:
+                continue
+            for patron in patrones:
+                coincidencia = re.match(patron, linea)
+                if coincidencia:
+                    encontrados[campo] = coincidencia.group(1).strip()
+                    fin_headers = indice + 1
+                    break
+
+    remitente = encontrados.get('from')
+    fecha_str = encontrados.get('date')
+    subject = encontrados.get('subject')
+
+    # When headers were found the body starts right after the last of them;
+    # otherwise the whole text is the body.
+    if encontrados:
+        cuerpo = '\n'.join(lineas[fin_headers:]).strip()
+    else:
+        cuerpo = contenido
+
+    # Without the minimum information there is nothing worth returning.
+    if not (remitente or fecha_str or cuerpo):
+        return None
+
+    # NOTE(review): fecha receives the raw header string when a date header
+    # was found and a datetime otherwise - confirm MensajeEmail accepts both.
+    try:
+        return MensajeEmail(
+            remitente=remitente or "Remitente Desconocido",
+            fecha=fecha_str or datetime.now(),
+            contenido=cuerpo,
+            subject=subject,
+            adjuntos=[]
+        )
+    except Exception as e:
+        print(f"Error creando mensaje reenviado: {str(e)}")
+        return None