Funcionando mejor

2025-02-05 12:58:41 +01:00 · 2025-02-05 12:58:41 +01:00 · b9c3024e04
parent 9be8f227cb
commit b9c3024e04
6 changed files with 311 additions and 119 deletions
--- a/config.json
+++ b/config.json
@@ -1,5 +1,5 @@
 {
-    "input_dir": "D:\\Proyectos\\Scripts\\EmailCrono",
+    "input_dir": "C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\Emails",
    "output_dir": "C:\\Users\\migue\\OneDrive\\Miguel\\Obsidean\\Trabajo\\VM\\04-InLavoro\\HENKEL\\93040 - HENKEL - BowlingGreen\\Description\\HENKEL - ALPLA - AUTEFA - Batch Data",
    "cronologia_file": "cronologia.md",
    "attachments_dir": "adjuntos"
--- a/main.py
+++ b/main.py
@@ -4,6 +4,7 @@ from pathlib import Path
 from utils.email_parser import procesar_eml
 from utils.markdown_handler import cargar_cronologia_existente
 from config.config import Config
+import hashlib

 def main():
    config = Config()
@@ -31,19 +32,35 @@ def main():
    mensajes = []
    print(f"Loaded {len(mensajes)} existing messages")
    mensajes_hash = {msg.hash for msg in mensajes}
+    
+    total_procesados = 0
+    total_nuevos = 0
+    mensajes_duplicados = 0

    for archivo in eml_files:
-        print(f"Processing {archivo}")
+        print(f"\nProcessing {archivo}")
        nuevos_mensajes = procesar_eml(archivo, config.get_attachments_dir())
+        total_procesados += len(nuevos_mensajes)
+        
+        # Verificar duplicados
        for msg in nuevos_mensajes:
            if msg.hash not in mensajes_hash:
                mensajes.append(msg)
                mensajes_hash.add(msg.hash)
+                total_nuevos += 1
+            else:
+                mensajes_duplicados += 1

+    print(f"\nEstadísticas de procesamiento:")
+    print(f"- Total mensajes encontrados: {total_procesados}")
+    print(f"- Mensajes únicos añadidos: {total_nuevos}")
+    print(f"- Mensajes duplicados ignorados: {mensajes_duplicados}")
+
+    # Ordenar todos los mensajes por fecha
    mensajes.sort(key=lambda x: x.fecha)

    output_file = config.get_cronologia_file()
-    print(f"Writing to {output_file}")
+    print(f"\nWriting {len(mensajes)} messages to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for msg in mensajes:
            f.write(msg.to_markdown())
--- a/models/__pycache__/mensaje_email.cpython-310.pyc
+++ b/models/__pycache__/mensaje_email.cpython-310.pyc
--- a/models/mensaje_email.py
+++ b/models/mensaje_email.py
@@ -25,17 +25,26 @@ class MensajeEmail:
            # Skip metadata lines
            if line.strip().startswith(('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')) or line.strip().startswith('Oggetto: '):
                continue
-            cleaned_lines.append(line)
+            # Limpiar espacios múltiples dentro de cada línea, pero mantener la línea completa
+            cleaned_line = re.sub(r' +', ' ', line)
+            cleaned_lines.append(cleaned_line)
        
-        # Unir las líneas
+        # Unir las líneas preservando los saltos de línea
        text = '\n'.join(cleaned_lines)
        
-        # Primero limpiamos la combinación específica de CRLF+NBSP+CRLF
+        # Limpiar la combinación específica de CRLF+NBSP+CRLF
        text = re.sub(r'\r?\n\xa0\r?\n', '\n', text)
-
+        
+        # Reemplazar CRLF por LF
+        text = text.replace('\r\n', '\n')
+        
+        # Reemplazar CR por LF
+        text = text.replace('\r', '\n')
+        
        # Reemplazar 3 o más saltos de línea por dos
        text = re.sub(r'\n{3,}', '\n\n', text)
-
+        
+        # Eliminar espacios al inicio y final del texto completo
        return text.strip()
    
    def to_markdown(self):
@@ -80,5 +89,27 @@ class MensajeEmail:
        return fecha

    def _generar_hash(self):
-        texto = f"{self.remitente}{self.fecha.isoformat()}{self.contenido}"
-        return hashlib.md5(texto.encode()).hexdigest()
+        """
+        Genera un hash único para el mensaje basado en una combinación de campos
+        que identifican únicamente el mensaje
+        """
+        # Limpiar y normalizar el contenido para el hash
+        # Para el hash, sí normalizamos completamente los espacios
+        contenido_hash = re.sub(r'\s+', ' ', self.contenido).strip()
+        
+        # Normalizar el subject
+        subject_normalizado = re.sub(r'\s+', ' ', self.subject if self.subject else '').strip()
+        
+        # Crear una cadena con los elementos clave del mensaje
+        elementos_hash = [
+            self.remitente.strip(),
+            self.fecha.strftime('%Y%m%d%H%M'),  # Solo hasta minutos para permitir pequeñas variaciones
+            subject_normalizado,
+            contenido_hash[:500]  # Usar solo los primeros 500 caracteres del contenido normalizado
+        ]
+        
+        # Unir todos los elementos con un separador único
+        texto_hash = '|'.join(elementos_hash)
+        
+        # Generar el hash
+        return hashlib.md5(texto_hash.encode()).hexdigest()
--- a/utils/__pycache__/email_parser.cpython-310.pyc
+++ b/utils/__pycache__/email_parser.cpython-310.pyc
--- a/utils/email_parser.py
+++ b/utils/email_parser.py
@ -9,126 +9,270 @@ from bs4 import BeautifulSoup
 from email.utils import parsedate_to_datetime
 from models.mensaje_email import MensajeEmail
 from utils.attachment_handler import guardar_adjunto
+import tempfile
+import os
+
+def _get_payload_safely(parte):
+    """
+    Obtiene el payload de una parte del email de forma segura
+    """
+    try:
+        if parte.is_multipart():
+            return None
+        payload = parte.get_payload(decode=True)
+        if payload is None:
+            return None
+        charset = parte.get_content_charset() or 'utf-8'
+        return payload.decode(charset, errors='ignore')
+    except Exception as e:
+        print(f"Error getting payload: {str(e)}")
+        return None
+
+def _extract_subject_from_text(text):
+    """
+    Extrae el asunto de un texto dados diferentes formatos de cabecera
+    """
+    subject_headers = {
+        'Oggetto: ': 9,      # Italian
+        'Subject: ': 9,      # English
+        'Asunto: ': 8,       # Spanish
+        'Sujet: ': 7,        # French
+        'Betreff: ': 9       # German
+    }
+    
+    for line in text.split('\n'):
+        line = line.strip()
+        for header, offset in subject_headers.items():
+            if line.startswith(header):
+                return line[offset:].strip()
+    return None
+
+def _should_skip_line(line):
+    """
+    Determina si una línea debe ser omitida por ser una cabecera de email
+    """
+    headers_to_skip = [
+        'Da: ', 'Inviato: ', 'A: ',           # Italian
+        'From: ', 'Sent: ', 'To: ',           # English
+        'De: ', 'Enviado: ', 'Para: ',        # Spanish
+        'Von: ', 'Gesendet: ', 'An: ',        # German
+        'De : ', 'Envoyé : ', 'À : '          # French
+    ]
+    return any(line.strip().startswith(header) for header in headers_to_skip)

 def _html_a_markdown(html):
-    # Primero limpiamos los caracteres especiales en el HTML
-    html = html.replace('\xa0', ' ')  # NBSP a espacio normal
-    html = html.replace('\r\n', '\n') # CRLF a LF
-    html = html.replace('\r', '\n')   # CR a LF
-    
-    soup = BeautifulSoup(html, 'html.parser')
-    
-    # Convert tables, keeping all newlines
-    for table in soup.find_all('table'):
-        rows = table.find_all('tr')
+    """
+    Convierte contenido HTML a texto markdown, extrayendo el asunto si está presente
+    """
+    if html is None:
+        return (None, "")
        
-        if rows:
-            markdown_table = []
-            # Get maximum width for each column
-            max_widths = []
-            for row in rows:
-                cells = row.find_all(['th', 'td'])
-                while len(max_widths) < len(cells):
-                    max_widths.append(0)
-                for i, cell in enumerate(cells):
-                    max_widths[i] = max(max_widths[i], len(cell.get_text().strip()))
-            
-            # Build table rows
-            header_row = rows[0].find_all(['th', 'td'])
-            header = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i]) 
-                                     for i, cell in enumerate(header_row)) + ' |'
-            separator = '|' + '|'.join('-' * (width + 2) for width in max_widths) + '|'
-            
-            markdown_table.append(header)
-            markdown_table.append(separator)
-            
-            for row in rows[1:]:
-                cells = row.find_all(['td', 'th'])
-                row_text = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i]) 
-                                           for i, cell in enumerate(cells)) + ' |'
-                markdown_table.append(row_text)
-            
-            # Join with newlines and replace
-            new_text = '\n' + '\n'.join(markdown_table)
-            table.replace_with(soup.new_string(new_text))
-    
-    # Handle basic HTML elements
-    for br in soup.find_all('br'):
-        br.replace_with('\n')
-    
-    # Get text content
-    text = soup.get_text()
-    
-    # Only extract subject and remove basic email headers
-    lines = text.split('\n')
-    cleaned_lines = []
-    subject = None
-    
-    for line in lines:
-        # Extract subject if present
-        if line.startswith('Oggetto: '):
-            subject = line[9:].strip()
-            continue
+    try:
+        # Limpieza básica
+        html = html.replace('\xa0', ' ')  # NBSP a espacio normal
+        html = html.replace('\r\n', '\n') # CRLF a LF
+        html = html.replace('\r', '\n')   # CR a LF
        
-        # Skip only the most basic email headers
-        if line.startswith(('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')):
-            continue
+        soup = BeautifulSoup(html, 'html.parser')
+        
+        # Procesar tablas
+        for table in soup.find_all('table'):
+            try:
+                rows = table.find_all('tr')
+                if not rows:
+                    continue
+                    
+                markdown_table = []
+                max_widths = []
+                
+                # Calcular anchos máximos
+                for row in rows:
+                    cells = row.find_all(['th', 'td'])
+                    while len(max_widths) < len(cells):
+                        max_widths.append(0)
+                    for i, cell in enumerate(cells):
+                        cell_text = cell.get_text().strip()
+                        max_widths[i] = max(max_widths[i], len(cell_text))
+                
+                # Construir tabla markdown
+                if max_widths:  # Solo si tenemos celdas válidas
+                    header_row = rows[0].find_all(['th', 'td'])
+                    header = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i]) 
+                                             for i, cell in enumerate(header_row)) + ' |'
+                    separator = '|' + '|'.join('-' * (width + 2) for width in max_widths) + '|'
+                    
+                    markdown_table.append(header)
+                    markdown_table.append(separator)
+                    
+                    for row in rows[1:]:
+                        cells = row.find_all(['td', 'th'])
+                        row_text = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i]) 
+                                                   for i, cell in enumerate(cells)) + ' |'
+                        markdown_table.append(row_text)
+                    
+                    table.replace_with(soup.new_string('\n' + '\n'.join(markdown_table)))
+            except Exception as e:
+                print(f"Error procesando tabla: {str(e)}")
+                continue
+        
+        # Procesar saltos de línea
+        for br in soup.find_all('br'):
+            br.replace_with('\n')
+        
+        # Obtener texto limpio
+        text = soup.get_text()
+        
+        # Procesar líneas
+        cleaned_lines = []
+        subject = None
+        
+        for line in text.split('\n'):
+            if not subject:
+                subject = _extract_subject_from_text(line)
            
-        # Keep the line as is, with all its spacing
-        cleaned_lines.append(line)
-    
-    # Join lines preserving all newlines
-    text = '\n'.join(cleaned_lines)
-    
-    return subject, text
+            if not _should_skip_line(line):
+                cleaned_lines.append(line)
+        
+        final_text = '\n'.join(cleaned_lines).strip()
+        return (subject, final_text)
+        
+    except Exception as e:
+        print(f"Error en html_a_markdown: {str(e)}")
+        return (None, html if html else "")
+
+def _procesar_email_adjunto(parte, dir_adjuntos):
+    """
+    Procesa un email que viene como adjunto dentro de otro email.
+    """
+    try:
+        mensajes = []
+        if parte.is_multipart():
+            # Si es multipart, procesar cada subparte
+            for subparte in parte.walk():
+                if subparte.get_content_type() == "message/rfc822":
+                    # Si es un mensaje RFC822, obtener el payload como lista
+                    payload = subparte.get_payload()
+                    if isinstance(payload, list):
+                        for msg in payload:
+                            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
+                    elif isinstance(payload, email.message.Message):
+                        mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
+        else:
+            # Si no es multipart, intentar procesar como mensaje único
+            payload = parte.get_payload()
+            if isinstance(payload, list):
+                for msg in payload:
+                    mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
+            elif isinstance(payload, email.message.Message):
+                mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
+            
+        return mensajes
+    except Exception as e:
+        print(f"Error procesando email adjunto: {str(e)}")
+        return []

 def procesar_eml(ruta_archivo, dir_adjuntos):
-    with open(ruta_archivo, 'rb') as eml:
-        mensaje = BytesParser(policy=policy.default).parse(eml)
+    """
+    Punto de entrada principal para procesar archivos .eml
+    """
+    try:
+        with open(ruta_archivo, 'rb') as eml:
+            mensaje = BytesParser(policy=policy.default).parse(eml)
+        return procesar_eml_interno(mensaje, dir_adjuntos)
+    except Exception as e:
+        print(f"Error al abrir el archivo {ruta_archivo}: {str(e)}")
+        return []

-    remitente = mensaje.get('from', '')
-    fecha_str = mensaje.get('date', '')
-    fecha = _parsear_fecha(fecha_str)
+def procesar_eml_interno(mensaje, dir_adjuntos):
+    """
+    Procesa un mensaje de email, ya sea desde archivo o adjunto
+    """
+    mensajes = []
    
-    contenido = ""
-    subject = None
-    adjuntos = []
-    tiene_html = False
-    
-    # Primera pasada: verificar si hay contenido HTML
-    if mensaje.is_multipart():
-        for parte in mensaje.walk():
-            if parte.get_content_type() == "text/html":
-                tiene_html = True
-                break
-    else:
-        tiene_html = mensaje.get_content_type() == "text/html"
-    
-    # Segunda pasada: procesar el contenido
-    if mensaje.is_multipart():
-        for parte in mensaje.walk():
-            if parte.get_content_type() == "text/html":
-                html_content = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
-                part_subject, text = _html_a_markdown(html_content)
-                if part_subject and not subject:
-                    subject = part_subject
-                contenido = text  # Reemplazar en lugar de concatenar
-            elif parte.get_content_type() == "text/plain" and not tiene_html:
-                # Solo usar texto plano si no hay HTML
-                text = parte.get_payload(decode=True).decode(parte.get_content_charset() or 'utf-8', errors='ignore')
-                contenido = text
-            elif parte.get_content_disposition() == 'attachment':
-                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
-                if ruta_adjunto:
-                    adjuntos.append(Path(ruta_adjunto).name)
-    else:
-        if mensaje.get_content_type() == "text/html":
-            html_content = mensaje.get_payload(decode=True).decode(mensaje.get_content_charset() or 'utf-8', errors='ignore')
-            subject, contenido = _html_a_markdown(html_content)
+    try:
+        remitente = mensaje.get('from', '')
+        fecha_str = mensaje.get('date', '')
+        fecha = _parsear_fecha(fecha_str)
+        
+        # Get subject from email headers first
+        subject = mensaje.get('subject', '')
+        if subject:
+            # Try to decode if it's encoded
+            subject = str(email.header.make_header(email.header.decode_header(subject)))
+        
+        contenido = ""
+        adjuntos = []
+        tiene_html = False
+        
+        # First pass: check for HTML content
+        if mensaje.is_multipart():
+            for parte in mensaje.walk():
+                if parte.get_content_type() == "text/html":
+                    tiene_html = True
+                    break
        else:
-            contenido = mensaje.get_payload(decode=True).decode(mensaje.get_content_charset() or 'utf-8', errors='ignore')
+            tiene_html = mensaje.get_content_type() == "text/html"
+        
+        # Second pass: process content and attachments
+        if mensaje.is_multipart():
+            for parte in mensaje.walk():
+                content_type = parte.get_content_type()
+                
+                try:
+                    if content_type == "text/html":
+                        html_content = _get_payload_safely(parte)
+                        if html_content:
+                            part_subject, text = _html_a_markdown(html_content)
+                            if not subject and part_subject:
+                                subject = part_subject
+                            if text:
+                                contenido = text
+                    elif content_type == "text/plain" and not tiene_html:
+                        text = _get_payload_safely(parte)
+                        if text:
+                            contenido = text
+                    elif content_type == "message/rfc822":
+                        # Procesar email adjunto
+                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
+                        mensajes.extend(mensajes_adjuntos)
+                    elif parte.get_content_disposition() == 'attachment':
+                        nombre = parte.get_filename()
+                        if nombre and nombre.lower().endswith('.eml'):
+                            # Si es un archivo .eml adjunto
+                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
+                            mensajes.extend(mensajes_adjuntos)
+                        else:
+                            # Otros tipos de adjuntos
+                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
+                            if ruta_adjunto:
+                                adjuntos.append(Path(ruta_adjunto).name)
+                except Exception as e:
+                    print(f"Error procesando parte del mensaje: {str(e)}")
+                    continue
+        else:
+            if mensaje.get_content_type() == "text/html":
+                html_content = _get_payload_safely(mensaje)
+                if html_content:
+                    part_subject, contenido = _html_a_markdown(html_content)
+                    if not subject and part_subject:
+                        subject = part_subject
+            else:
+                contenido = _get_payload_safely(mensaje) or ""
+        
+        # Solo agregar el mensaje si tiene contenido útil
+        if contenido or subject or adjuntos:
+            mensajes.append(MensajeEmail(
+                remitente=remitente,
+                fecha=fecha,
+                contenido=contenido,
+                subject=subject,
+                adjuntos=adjuntos
+            ))
+        
+    except Exception as e:
+        print(f"Error procesando mensaje: {str(e)}")
    
-    return [MensajeEmail(remitente=remitente, fecha=fecha, contenido=contenido, subject=subject, adjuntos=adjuntos)]
+    return mensajes

 def _parsear_fecha(fecha_str):
    try: