Eliminado de líneas en blanco

2025-02-05 12:25:43 +01:00 · 2025-02-05 12:25:43 +01:00 · 9be8f227cb
parent b0150a58dd
commit 9be8f227cb
4 changed files with 26 additions and 6 deletions
--- a/models/__pycache__/mensaje_email.cpython-310.pyc
+++ b/models/__pycache__/mensaje_email.cpython-310.pyc
--- a/models/mensaje_email.py
+++ b/models/mensaje_email.py
@@ -30,9 +30,12 @@ class MensajeEmail:
        # Unir las líneas
        text = '\n'.join(cleaned_lines)
        
+        # Primero limpiamos la combinación específica de CRLF+NBSP+CRLF
+        text = re.sub(r'\r?\n\xa0\r?\n', '\n', text)
+
        # Reemplazar 3 o más saltos de línea por dos
        text = re.sub(r'\n{3,}', '\n\n', text)
-        
+
        return text.strip()
    
    def to_markdown(self):
--- a/utils/__pycache__/email_parser.cpython-310.pyc
+++ b/utils/__pycache__/email_parser.cpython-310.pyc
--- a/utils/email_parser.py
+++ b/utils/email_parser.py
@@ -11,6 +11,11 @@ from models.mensaje_email import MensajeEmail
 from utils.attachment_handler import guardar_adjunto

 def _html_a_markdown(html):
+    # Primero limpiamos los caracteres especiales en el HTML
+    html = html.replace('\xa0', ' ')  # NBSP a espacio normal
+    html = html.replace('\r\n', '\n') # CRLF a LF
+    html = html.replace('\r', '\n')   # CR a LF
+    
    soup = BeautifulSoup(html, 'html.parser')
    
    # Convert tables, keeping all newlines
@@ -88,18 +93,30 @@ def procesar_eml(ruta_archivo, dir_adjuntos):
    contenido = ""
    subject = None
    adjuntos = []
+    tiene_html = False
    
+    # Primera pasada: verificar si hay contenido HTML
    if mensaje.is_multipart():
        for parte in mensaje.walk():
-            if parte.get_content_type() == "text/plain":
-                text = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
-                contenido += text
-            elif parte.get_content_type() == "text/html":
+            if parte.get_content_type() == "text/html":
+                tiene_html = True
+                break
+    else:
+        tiene_html = mensaje.get_content_type() == "text/html"
+    
+    # Segunda pasada: procesar el contenido
+    if mensaje.is_multipart():
+        for parte in mensaje.walk():
+            if parte.get_content_type() == "text/html":
                html_content = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
                part_subject, text = _html_a_markdown(html_content)
                if part_subject and not subject:
                    subject = part_subject
-                contenido += text
+                contenido = text  # Reemplazar en lugar de concatenar
+            elif parte.get_content_type() == "text/plain" and not tiene_html:
+                # Solo usar texto plano si no hay HTML
+                text = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
+                contenido = text
            elif parte.get_content_disposition() == 'attachment':
                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                if ruta_adjunto: