Intento fallido de separar los emails reenviados

Funcionando mejor
Eliminado de líneas en blanco
2025-02-05 15:04:43 +01:00 · 2025-02-05 12:58:41 +01:00 · 2025-02-05 12:25:43 +01:00
9 changed files with 454 additions and 1420 deletions
--- a/TEST.eml
+++ b/TEST.eml
--- a/config.json
+++ b/config.json
@ -1,5 +1,5 @@
 {
-    "input_dir": "D:\\Proyectos\\Scripts\\EmailCrono",
+    "input_dir": "C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\Emails",
    "output_dir": "C:\\Users\\migue\\OneDrive\\Miguel\\Obsidean\\Trabajo\\VM\\04-InLavoro\\HENKEL\\93040 - HENKEL - BowlingGreen\\Description\\HENKEL - ALPLA - AUTEFA - Batch Data",
    "cronologia_file": "cronologia.md",
    "attachments_dir": "adjuntos"
--- a/main.py
+++ b/main.py
@ -4,6 +4,7 @@ from pathlib import Path
 from utils.email_parser import procesar_eml
 from utils.markdown_handler import cargar_cronologia_existente
 from config.config import Config
 import hashlib
 def main():
    config = Config()
@ -32,18 +33,34 @@ def main():
    print(f"Loaded {len(mensajes)} existing messages")
    mensajes_hash = {msg.hash for msg in mensajes}
    total_procesados = 0
    total_nuevos = 0
    mensajes_duplicados = 0
    for archivo in eml_files:
-        print(f"Processing {archivo}")
+        print(f"\nProcessing {archivo}")
        nuevos_mensajes = procesar_eml(archivo, config.get_attachments_dir())
        total_procesados += len(nuevos_mensajes)
        # Verificar duplicados
        for msg in nuevos_mensajes:
            if msg.hash not in mensajes_hash:
                mensajes.append(msg)
                mensajes_hash.add(msg.hash)
                total_nuevos += 1
            else:
                mensajes_duplicados += 1
    print(f"\nEstadísticas de procesamiento:")
    print(f"- Total mensajes encontrados: {total_procesados}")
    print(f"- Mensajes únicos añadidos: {total_nuevos}")
    print(f"- Mensajes duplicados ignorados: {mensajes_duplicados}")
    # Ordenar todos los mensajes por fecha
    mensajes.sort(key=lambda x: x.fecha)
    output_file = config.get_cronologia_file()
-    print(f"Writing to {output_file}")
+    print(f"\nWriting {len(mensajes)} messages to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for msg in mensajes:
            f.write(msg.to_markdown())
--- a/models/pycache/mensaje_email.cpython-310.pyc
+++ b/models/pycache/mensaje_email.cpython-310.pyc
--- a/models/mensaje_email.py
+++ b/models/mensaje_email.py
@ -25,14 +25,26 @@ class MensajeEmail:
            # Skip metadata lines
            if line.strip().startswith(('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')) or line.strip().startswith('Oggetto: '):
                continue
-            cleaned_lines.append(line)
+            # Limpiar espacios múltiples dentro de cada línea, pero mantener la línea completa
            cleaned_line = re.sub(r' +', ' ', line)
            cleaned_lines.append(cleaned_line)
-        # Unir las líneas
+        # Unir las líneas preservando los saltos de línea
        text = '\n'.join(cleaned_lines)
        # Limpiar la combinación específica de CRLF+NBSP+CRLF
        text = re.sub(r'\r?\n\xa0\r?\n', '\n', text)
        # Reemplazar CRLF por LF
        text = text.replace('\r\n', '\n')
        # Reemplazar CR por LF
        text = text.replace('\r', '\n')
        # Reemplazar 3 o más saltos de línea por dos
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Eliminar espacios al inicio y final del texto completo
        return text.strip()
    def to_markdown(self):
@ -77,5 +89,27 @@ class MensajeEmail:
        return fecha
    def _generar_hash(self):
-        texto = f"{self.remitente}{self.fecha.isoformat()}{self.contenido}"
+        """
-        return hashlib.md5(texto.encode()).hexdigest()
+        Genera un hash único para el mensaje basado en una combinación de campos
        que identifican únicamente el mensaje
        """
        # Limpiar y normalizar el contenido para el hash
        # Para el hash, sí normalizamos completamente los espacios
        contenido_hash = re.sub(r'\s+', ' ', self.contenido).strip()
        # Normalizar el subject
        subject_normalizado = re.sub(r'\s+', ' ', self.subject if self.subject else '').strip()
        # Crear una cadena con los elementos clave del mensaje
        elementos_hash = [
            self.remitente.strip(),
            self.fecha.strftime('%Y%m%d%H%M'),  # Solo hasta minutos para permitir pequeñas variaciones
            subject_normalizado,
            contenido_hash[:500]  # Usar solo los primeros 500 caracteres del contenido normalizado
        ]
        # Unir todos los elementos con un separador único
        texto_hash = '|'.join(elementos_hash)
        # Generar el hash
        return hashlib.md5(texto_hash.encode()).hexdigest()
--- a/utils/pycache/email_parser.cpython-310.pyc
+++ b/utils/pycache/email_parser.cpython-310.pyc
--- a/utils/pycache/forward_handler.cpython-310.pyc
+++ b/utils/pycache/forward_handler.cpython-310.pyc
--- a/utils/email_parser.py
+++ b/utils/email_parser.py
@ -9,26 +9,94 @@ from bs4 import BeautifulSoup
 from email.utils import parsedate_to_datetime
 from models.mensaje_email import MensajeEmail
 from utils.attachment_handler import guardar_adjunto
 from utils.forward_handler import extract_forwarded_messages
 import tempfile
 import os
 def _get_payload_safely(parte):
    """
    Obtiene el payload de una parte del email de forma segura
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or 'utf-8'
        return payload.decode(charset, errors='ignore')
    except Exception as e:
        print(f"Error getting payload: {str(e)}")
        return None
 def _extract_subject_from_text(text):
    """
    Extrae el asunto de un texto dados diferentes formatos de cabecera
    """
    subject_headers = {
        'Oggetto: ': 9,      # Italian
        'Subject: ': 9,      # English
        'Asunto: ': 8,       # Spanish
        'Sujet: ': 7,        # French
        'Betreff: ': 9       # German
    }
    for line in text.split('\n'):
        line = line.strip()
        for header, offset in subject_headers.items():
            if line.startswith(header):
                return line[offset:].strip()
    return None
 def _should_skip_line(line):
    """
    Determina si una línea debe ser omitida por ser una cabecera de email
    """
    headers_to_skip = [
        'Da: ', 'Inviato: ', 'A: ',           # Italian
        'From: ', 'Sent: ', 'To: ',           # English
        'De: ', 'Enviado: ', 'Para: ',        # Spanish
        'Von: ', 'Gesendet: ', 'An: ',        # German
        'De : ', 'Envoyé : ', 'À : '          # French
    ]
    return any(line.strip().startswith(header) for header in headers_to_skip)
 def _html_a_markdown(html):
    """
    Convierte contenido HTML a texto markdown, extrayendo el asunto si está presente
    """
    if html is None:
        return (None, "")
    try:
        # Limpieza básica
        html = html.replace('\xa0', ' ')  # NBSP a espacio normal
        html = html.replace('\r\n', '\n') # CRLF a LF
        html = html.replace('\r', '\n')   # CR a LF
        soup = BeautifulSoup(html, 'html.parser')
-    # Convert tables, keeping all newlines
+        # Procesar tablas
        for table in soup.find_all('table'):
            try:
                rows = table.find_all('tr')
                if not rows:
                    continue
        if rows:
                markdown_table = []
            # Get maximum width for each column
                max_widths = []
                # Calcular anchos máximos
                for row in rows:
                    cells = row.find_all(['th', 'td'])
                    while len(max_widths) < len(cells):
                        max_widths.append(0)
                    for i, cell in enumerate(cells):
-                    max_widths[i] = max(max_widths[i], len(cell.get_text().strip()))
+                        cell_text = cell.get_text().strip()
                        max_widths[i] = max(max_widths[i], len(cell_text))
-            # Build table rows
+                # Construir tabla markdown
                if max_widths:  # Solo si tenemos celdas válidas
                    header_row = rows[0].find_all(['th', 'td'])
                    header = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i]) 
                                             for i, cell in enumerate(header_row)) + ' |'
@ -43,75 +111,79 @@ def _html_a_markdown(html):
                                                   for i, cell in enumerate(cells)) + ' |'
                        markdown_table.append(row_text)
-            # Join with newlines and replace
+                    table.replace_with(soup.new_string('\n' + '\n'.join(markdown_table)))
-            new_text = '\n' + '\n'.join(markdown_table)
+            except Exception as e:
-            table.replace_with(soup.new_string(new_text))
+                print(f"Error procesando tabla: {str(e)}")
                continue
-    # Handle basic HTML elements
+        # Procesar saltos de línea
        for br in soup.find_all('br'):
            br.replace_with('\n')
-    # Get text content
+        # Obtener texto limpio
        text = soup.get_text()
-    # Only extract subject and remove basic email headers
+        # Procesar líneas
    lines = text.split('\n')
        cleaned_lines = []
        subject = None
-    for line in lines:
+        for line in text.split('\n'):
-        # Extract subject if present
+            if not subject:
-        if line.startswith('Oggetto: '):
+                subject = _extract_subject_from_text(line)
            subject = line[9:].strip()
            continue
-        # Skip only the most basic email headers
+            if not _should_skip_line(line):
        if line.startswith(('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')):
            continue
        # Keep the line as is, with all its spacing
                cleaned_lines.append(line)
-    # Join lines preserving all newlines
+        final_text = '\n'.join(cleaned_lines).strip()
-    text = '\n'.join(cleaned_lines)
+        return (subject, final_text)
-    return subject, text
+    except Exception as e:
        print(f"Error en html_a_markdown: {str(e)}")
        return (None, html if html else "")
 def _procesar_email_adjunto(parte, dir_adjuntos):
    """
    Procesa un email que viene como adjunto dentro de otro email.
    """
    try:
        mensajes = []
        if parte.is_multipart():
            # Si es multipart, procesar cada subparte
            for subparte in parte.walk():
                if subparte.get_content_type() == "message/rfc822":
                    # Si es un mensaje RFC822, obtener el payload como lista
                    payload = subparte.get_payload()
                    if isinstance(payload, list):
                        for msg in payload:
                            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
                    elif isinstance(payload, email.message.Message):
                        mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
        else:
            # Si no es multipart, intentar procesar como mensaje único
            payload = parte.get_payload()
            if isinstance(payload, list):
                for msg in payload:
                    mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
            elif isinstance(payload, email.message.Message):
                mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
        return mensajes
    except Exception as e:
        print(f"Error procesando email adjunto: {str(e)}")
        return []
 def procesar_eml(ruta_archivo, dir_adjuntos):
    """
    Main entry point for processing a .eml file.

    The file is parsed with BytesParser (default policy) and the resulting
    message is handed to procesar_eml_interno.

    Args:
        ruta_archivo: Path of the .eml file to read.
        dir_adjuntos: Attachment directory forwarded to procesar_eml_interno.

    Returns:
        Whatever procesar_eml_interno returns, or an empty list when the file
        cannot be opened or parsed (the error is printed, never raised).
    """
    # The previous inline implementation that followed the try/except was
    # unreachable (both paths return) and has been removed.
    try:
        with open(ruta_archivo, 'rb') as eml:
            mensaje = BytesParser(policy=policy.default).parse(eml)
        return procesar_eml_interno(mensaje, dir_adjuntos)
    except Exception as e:
        print(f"Error al abrir el archivo {ruta_archivo}: {str(e)}")
        return []
 def _parsear_fecha(fecha_str):
    try:
@ -132,3 +204,87 @@ def _parsear_fecha(fecha_str):
        except:
            pass
        return datetime.now()
 def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Process one email message, whether it came from a file or an attachment.

    Args:
        mensaje: Parsed email message object.
        dir_adjuntos: Directory passed to guardar_adjunto for saved files.

    Returns:
        A list of MensajeEmail objects. The message itself comes first (only
        when it has content, a subject or attachments), followed by the
        messages taken from attached emails and from forwarded blocks found
        in the body. Errors are printed and whatever was collected so far is
        returned.
    """
    mensajes = []
    try:
        remitente = mensaje.get('from', '')
        fecha_str = mensaje.get('date', '')
        fecha = _parsear_fecha(fecha_str)
        # The real header wins; a subject found in the body is only a fallback.
        subject = mensaje.get('subject', '')
        if subject:
            subject = str(email.header.make_header(email.header.decode_header(subject)))
        contenido = ""
        adjuntos = []
        contenido_html = None
        if mensaje.is_multipart():
            # Message.walk() also descends into attached emails, which let an
            # attached message's body and attachments leak into this message.
            # Only this message's own parts are visited here; attached emails
            # are handled as a whole by _procesar_email_adjunto.
            for parte in _iter_partes_propias(mensaje):
                content_type = parte.get_content_type()
                try:
                    if content_type == "text/html":
                        html_content = _get_payload_safely(parte)
                        if html_content:
                            contenido_html = html_content
                    elif content_type == "text/plain" and not contenido_html:
                        # Plain text is only kept while no HTML body is known.
                        contenido = _get_payload_safely(parte) or ""
                    elif content_type == "message/rfc822":
                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                        mensajes.extend(mensajes_adjuntos)
                    elif parte.get_content_disposition() == 'attachment':
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith('.eml'):
                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                            mensajes.extend(mensajes_adjuntos)
                        else:
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                # Only the file name is recorded.
                                adjuntos.append(Path(ruta_adjunto).name)
                except Exception as e:
                    # One broken part must not abort the rest of the message.
                    print(f"Error procesando parte del mensaje: {str(e)}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                contenido_html = _get_payload_safely(mensaje)
            else:
                contenido = _get_payload_safely(mensaje) or ""
        # The HTML body, when present, replaces any plain-text body.
        if contenido_html:
            part_subject, text = _html_a_markdown(contenido_html)
            if not subject and part_subject:
                subject = part_subject
            contenido = text
        # Split forwarded messages out of the converted body.
        if contenido:
            print(f"\nBuscando mensajes reenviados en contenido ({len(contenido)} chars)")
            print("Primeros 200 chars:", contenido[:200])
            contenido_principal, mensajes_reenviados = extract_forwarded_messages(contenido)
            print(f"Encontrados {len(mensajes_reenviados)} mensajes reenviados")
            contenido = contenido_principal
            mensajes.extend(mensajes_reenviados)
        # Only add the main message when it carries something useful.
        if contenido or subject or adjuntos:
            mensajes.insert(0, MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos
            ))
    except Exception as e:
        print(f"Error procesando mensaje: {str(e)}")
    return mensajes
 def _iter_partes_propias(mensaje):
    """
    Yield mensaje and its sub-parts depth-first without entering attached emails.

    A message/rfc822 part is yielded itself, but the parts of the email it
    contains are not.
    """
    yield mensaje
    if mensaje.get_content_type() == "message/rfc822":
        return
    if mensaje.is_multipart():
        for subparte in mensaje.get_payload():
            yield from _iter_partes_propias(subparte)
--- a/utils/forward_handler.py
+++ b/utils/forward_handler.py
@ -0,0 +1,144 @@
 # utils/forward_handler.py
 import re
 import os
 from datetime import datetime
 from email.utils import parseaddr
 from models.mensaje_email import MensajeEmail
 # Separator lines that introduce a forwarded/original message, per language.
 # Patterns anchored with ^ rely on re.MULTILINE, which is applied once when
 # the combined expression is compiled below. An inline (?m) would land in
 # the middle of the joined alternation, which Python 3.11+ rejects.
 FORWARD_PATTERNS = [
    r"[-]{3,}\s*Messaggio originale\s*[-]{3,}",     # Italian
    r"[-]{3,}\s*Original Message\s*[-]{3,}",        # English
    r"[-]{3,}\s*Mensaje original\s*[-]{3,}",        # Spanish
    r"[-]{3,}\s*Message d'origine\s*[-]{3,}",       # French
    r"[-]{3,}\s*Ursprüngliche Nachricht\s*[-]{3,}", # German
    # More flexible variants
    r"[-]{3,}\s*Forwarded message\s*[-]{3,}",
    r"[-]{3,}\s*Mensaje reenviado\s*[-]{3,}",
    r"[-]{3,}\s*Messaggio inoltrato\s*[-]{3,}",
    # Variants quoted with > as they usually appear in plain text
    r"^>\s*[-]{3,}\s*Messaggio originale\s*[-]{3,}",
    r"^>\s*[-]{3,}\s*Original Message\s*[-]{3,}"
 ]
 # One compiled splitter for all separators. The alternation is non-capturing
 # so that split() returns only the text segments between separators.
 _FORWARD_SPLIT_RE = re.compile(
    '|'.join(f"(?:{pattern})" for pattern in FORWARD_PATTERNS),
    re.MULTILINE,
 )
 # Header line patterns per field, in several languages
 HEADER_PATTERNS = {
    'from': [
        r"Da:\s*(.*)",       # Italian
        r"From:\s*(.*)",     # English
        r"De:\s*(.*)",       # Spanish
        r"Von:\s*(.*)",      # German
        r"De :\s*(.*)"       # French
    ],
    'date': [
        r"Inviato:\s*(.*)",  # Italian
        r"Sent:\s*(.*)",     # English
        r"Enviado:\s*(.*)",  # Spanish
        r"Gesendet:\s*(.*)", # German
        r"Envoyé :\s*(.*)"   # French
    ],
    'subject': [
        r"Oggetto:\s*(.*)",  # Italian
        r"Subject:\s*(.*)",  # English
        r"Asunto:\s*(.*)",   # Spanish
        r"Betreff:\s*(.*)",  # German
        r"Sujet :\s*(.*)"    # French
    ]
 }
 def extract_forwarded_messages(contenido):
    """
    Split an email body into its own text and the forwarded messages in it.

    Args:
        contenido: Body text to scan for forward separators.

    Returns:
        A tuple (contenido_original, mensajes): the stripped text before the
        first separator, and the list of objects returned by
        _procesar_contenido_reenviado for each non-empty block that follows
        a separator.
    """
    partes = _FORWARD_SPLIT_RE.split(contenido)
    # Everything before the first separator belongs to the email itself.
    contenido_original = partes[0].strip()
    mensajes = []
    for fragmento in partes[1:]:
        fragmento = fragmento.strip()
        if not fragmento:
            continue
        mensaje = _procesar_contenido_reenviado(fragmento)
        if mensaje:
            mensajes.append(mensaje)
    return contenido_original, mensajes
 def _procesar_contenido_reenviado(contenido):
    """
    Build a MensajeEmail from the text that follows a forward separator.

    Lines are scanned for sender, date and subject headers (HEADER_PATTERNS).
    The body is the text after the last header line that matched; when no
    header matches, the whole text is the body.

    Args:
        contenido: Text of one forwarded block, without the separator line.

    Returns:
        A MensajeEmail, or None when there is no sender, date or body, or
        when MensajeEmail cannot be built (the error is printed).
    """
    lineas = contenido.split('\n')
    encontrados = {}
    ultima_cabecera = -1
    for indice, linea_cruda in enumerate(lineas):
        if len(encontrados) >= len(HEADER_PATTERNS):
            break
        linea = linea_cruda.strip()
        for campo, patrones in HEADER_PATTERNS.items():
            if campo in encontrados:
                continue
            for patron in patrones:
                match = re.match(patron, linea)
                if match:
                    encontrados[campo] = match.group(1).strip()
                    ultima_cabecera = indice
                    break
    remitente = encontrados.get('from')
    fecha_str = encontrados.get('date')
    subject = encontrados.get('subject')
    cuerpo = contenido
    if encontrados:
        # Start the body right after the last header that was actually found.
        # Using the position where scanning stopped would discard the whole
        # body whenever fewer than three headers are present, because the
        # scan then runs to the end of the text.
        cuerpo = '\n'.join(lineas[ultima_cabecera + 1:]).strip()
    # Without any minimum information there is nothing worth keeping.
    if not (remitente or fecha_str or cuerpo):
        return None
    try:
        # NOTE(review): fecha is the raw header string when a date was found
        # and a datetime otherwise; presumably MensajeEmail normalises
        # strings. Confirm against the model.
        return MensajeEmail(
            remitente=remitente or "Remitente Desconocido",
            fecha=fecha_str or datetime.now(),
            contenido=cuerpo,
            subject=subject,
            adjuntos=[]
        )
    except Exception as e:
        print(f"Error creando mensaje reenviado: {str(e)}")
        return None
Author	SHA1	Message	Date
Miguel	220e911731	Intento fallido de separar los emails reenviados	2025-02-05 15:04:43 +01:00
Miguel	b9c3024e04	Funcionando mejor	2025-02-05 12:58:41 +01:00
Miguel	9be8f227cb	Eliminado de líneas en blanco	2025-02-05 12:25:43 +01:00