Eliminado de líneas en blanco

2025-02-05 12:25:43 +01:00 · 2025-02-05 12:25:43 +01:00 · 9be8f227cb
parent b0150a58dd
commit 9be8f227cb
4 changed files with 26 additions and 6 deletions
--- a/models/pycache/mensaje_email.cpython-310.pyc
+++ b/models/pycache/mensaje_email.cpython-310.pyc
--- a/models/mensaje_email.py
+++ b/models/mensaje_email.py
@ -30,6 +30,9 @@ class MensajeEmail:
        # Unir las líneas
        text = '\n'.join(cleaned_lines)
        # Primero limpiamos la combinación específica de CRLF+NBSP+CRLF
        text = re.sub(r'\r?\n\xa0\r?\n', '\n', text)
        # Reemplazar 3 o más saltos de línea por dos
        text = re.sub(r'\n{3,}', '\n\n', text)
--- a/utils/pycache/email_parser.cpython-310.pyc
+++ b/utils/pycache/email_parser.cpython-310.pyc
--- a/utils/email_parser.py
+++ b/utils/email_parser.py
@ -11,6 +11,11 @@ from models.mensaje_email import MensajeEmail
 from utils.attachment_handler import guardar_adjunto
 def _html_a_markdown(html):
    # Primero limpiamos los caracteres especiales en el HTML
    html = html.replace('\xa0', ' ')  # NBSP a espacio normal
    html = html.replace('\r\n', '\n') # CRLF a LF
    html = html.replace('\r', '\n')   # CR a LF
    soup = BeautifulSoup(html, 'html.parser')
    # Convert tables, keeping all newlines
@ -88,18 +93,30 @@ def procesar_eml(ruta_archivo, dir_adjuntos):
    contenido = ""
    subject = None
    adjuntos = []
    tiene_html = False
    # Primera pasada: verificar si hay contenido HTML
    if mensaje.is_multipart():
        for parte in mensaje.walk():
-            if parte.get_content_type() == "text/plain":
+            if parte.get_content_type() == "text/html":
-                text = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
+                tiene_html = True
-                contenido += text
+                break
-            elif parte.get_content_type() == "text/html":
+    else:
        tiene_html = mensaje.get_content_type() == "text/html"
    # Segunda pasada: procesar el contenido
    if mensaje.is_multipart():
        for parte in mensaje.walk():
            if parte.get_content_type() == "text/html":
                html_content = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
                part_subject, text = _html_a_markdown(html_content)
                if part_subject and not subject:
                    subject = part_subject
-                contenido += text
+                contenido = text  # Reemplazar en lugar de concatenar
            elif parte.get_content_type() == "text/plain" and not tiene_html:
                # Solo usar texto plano si no hay HTML
                text = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
                contenido = text
            elif parte.get_content_disposition() == 'attachment':
                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                if ruta_adjunto: