# utils/email_parser.py
"""Parse .eml files (and emails nested inside them) into MensajeEmail objects."""
import email
import email.errors
import email.header
import email.message
import os
import re
import tempfile
from datetime import datetime
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
from pathlib import Path

from bs4 import BeautifulSoup

from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto

# "Subject" header labels as they appear in quoted/forwarded bodies.
_SUBJECT_PREFIXES = (
    "Oggetto: ",  # Italian
    "Subject: ",  # English
    "Asunto: ",  # Spanish
    "Sujet: ",  # French
    "Betreff: ",  # German
)

# Quoted-header labels whose lines are dropped from the extracted body text.
_SKIPPED_PREFIXES = (
    "Da: ", "Inviato: ", "A: ",  # Italian
    "From: ", "Sent: ", "To: ",  # English
    "De: ", "Enviado: ", "Para: ",  # Spanish
    "Von: ", "Gesendet: ", "An: ",  # German
    "De : ", "Envoyé : ", "À : ",  # French
)

# Italian month names used by the fallback date parser.
_MESES_IT = {
    "gennaio": 1,
    "febbraio": 2,
    "marzo": 3,
    "aprile": 4,
    "maggio": 5,
    "giugno": 6,
    "luglio": 7,
    "agosto": 8,
    "settembre": 9,
    "ottobre": 10,
    "novembre": 11,
    "dicembre": 12,
}

# "<day> <month name> <year> <hour>:<minute>", with or without a weekday before it.
_FECHA_IT_RE = re.compile(r"(\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})")


def _get_payload_safely(parte):
    """
    Return the decoded text payload of a non-multipart part, or None.

    None is returned for multipart parts, for parts without a payload and
    when decoding fails. Undecodable bytes are dropped (errors="ignore").
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="ignore")
        except LookupError:
            # Unknown or misspelled charset label: keep the body instead of
            # losing it by falling back to UTF-8.
            return payload.decode("utf-8", errors="ignore")
    except Exception as e:
        print(f"Error getting payload: {e}")
        return None


def _extract_subject_from_text(text):
    """
    Return the subject found in the first "Subject: ..." style line of text.

    The Italian, English, Spanish, French and German labels are recognised.
    Returns None when no line starts with one of them.
    """
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        for prefix in _SUBJECT_PREFIXES:
            if line.startswith(prefix):
                return line[len(prefix):].strip()
    return None


def _should_skip_line(line):
    """
    Return True when the line is a quoted From/Sent/To style header line.
    """
    return line.strip().startswith(_SKIPPED_PREFIXES)


def _leer_span(cell, atributo):
    """Return the rowspan/colspan of a cell as an int >= 1 (1 if missing or malformed)."""
    try:
        return max(1, int(cell.get(atributo, 1)))
    except (TypeError, ValueError):
        return 1


def _tabla_a_markdown(table):
    """
    Render a BeautifulSoup <table> element as a Markdown table string.

    Cells covered by a rowspan/colspan are emitted as empty cells, and line
    breaks inside a cell become "<br>". Returns None when the table has no
    rows or no cells.
    """
    rows = table.find_all("tr")
    if not rows:
        return None

    table_matrix = []
    max_cols = 0

    # First pass: lay the cells out on a grid, expanding rowspans/colspans.
    for row_idx, row in enumerate(rows):
        cells = row.find_all(["th", "td"])
        if not cells:
            continue

        while len(table_matrix) <= row_idx:
            table_matrix.append([])

        col_idx = 0
        for cell in cells:
            # Skip the slots already taken by a rowspan from a previous row.
            fila_actual = table_matrix[row_idx]
            while col_idx < len(fila_actual) and fila_actual[col_idx] is not None:
                col_idx += 1

            rowspan = _leer_span(cell, "rowspan")
            colspan = _leer_span(cell, "colspan")

            # A Markdown cell must stay on one line, so newlines become <br>
            # and runs of consecutive <br> collapse into a single one.
            cell_text = cell.get_text().strip().replace("\n", "<br>")
            cell_text = re.sub(r"\s*(?:<br>\s*){2,}", "<br>", cell_text).strip()

            for r in range(rowspan):
                current_row = row_idx + r
                while len(table_matrix) <= current_row:
                    table_matrix.append([])
                destino = table_matrix[current_row]
                while len(destino) <= col_idx + colspan - 1:
                    destino.append(None)
                for c in range(colspan):
                    # Only the top-left slot carries the text; the rest of
                    # the merged area is filled with empty placeholders.
                    destino[col_idx + c] = cell_text if r == 0 and c == 0 else ""

            col_idx += colspan
            max_cols = max(max_cols, col_idx)

    if not table_matrix:
        return None

    # Make every row the same length.
    for fila in table_matrix:
        fila.extend([""] * (max_cols - len(fila)))

    col_widths = [0] * max_cols
    for fila in table_matrix:
        for idx, celda in enumerate(fila):
            if celda is not None:
                col_widths[idx] = max(col_widths[idx], len(str(celda)))

    def _fila_md(fila):
        celdas = (f" {str(celda or '').ljust(ancho)} |" for celda, ancho in zip(fila, col_widths))
        return "|" + "".join(celdas)

    separador = "|" + "".join("-" * (ancho + 2) + "|" for ancho in col_widths)
    lineas = [_fila_md(table_matrix[0]), separador]
    lineas.extend(_fila_md(fila) for fila in table_matrix[1:])
    return "\n".join(lineas)


def _html_a_markdown(html):
    """
    Convert HTML content to plain text with Markdown tables.

    Returns a (subject, text) tuple. subject is taken from the first
    "Subject: ..." style line found in the text (None if there is none);
    quoted From/Sent/To header lines are removed from text. If html is None
    the result is (None, ""); if the conversion fails the (normalised) input
    HTML is returned as the text.
    """
    if html is None:
        return (None, "")

    try:
        # Basic clean-up: NBSP to plain space, CRLF/CR to LF.
        html = html.replace("\xa0", " ")
        html = html.replace("\r\n", "\n")
        html = html.replace("\r", "\n")

        soup = BeautifulSoup(html, "html.parser")

        # Replace each HTML table with its Markdown rendering. A failure in
        # one table only skips that table.
        for table in soup.find_all("table"):
            try:
                markdown = _tabla_a_markdown(table)
                if markdown:
                    table.replace_with(soup.new_string("\n" + markdown + "\n"))
            except Exception as e:
                print(f"Error procesando tabla: {e}")
                continue

        for br in soup.find_all("br"):
            br.replace_with("\n")

        text = soup.get_text()

        cleaned_lines = []
        subject = None
        for line in text.split("\n"):
            if not subject:
                subject = _extract_subject_from_text(line)
            if not _should_skip_line(line):
                cleaned_lines.append(line)

        final_text = "\n".join(cleaned_lines).strip()
        return (subject, final_text)

    except Exception as e:
        print(f"Error en html_a_markdown: {e}")
        return (None, html if html else "")


def _iter_partes(mensaje):
    """
    Yield mensaje and its MIME subparts depth-first, like Message.walk(),
    but without descending into message/rfc822 parts.

    An attached email is a message of its own: its body and attachments must
    not be mistaken for those of the enclosing message.
    """
    yield mensaje
    if mensaje.is_multipart() and mensaje.get_content_type() != "message/rfc822":
        for subparte in mensaje.get_payload():
            yield from _iter_partes(subparte)


def _procesar_email_adjunto(parte, dir_adjuntos):
    """
    Process an email that arrives as an attachment inside another email.

    Handles message/rfc822 parts (already parsed by the email package) and
    .eml files attached with another content type, which are parsed from
    their decoded bytes. Returns the list of messages produced by
    procesar_eml_interno, or [] on error.
    """
    try:
        embebidos = []

        if parte.is_multipart():
            for subparte in _iter_partes(parte):
                if subparte.get_content_type() != "message/rfc822":
                    continue
                payload = subparte.get_payload()
                if isinstance(payload, list):
                    embebidos.extend(payload)
                elif isinstance(payload, email.message.Message):
                    embebidos.append(payload)
        else:
            payload = parte.get_payload()
            if isinstance(payload, email.message.Message):
                embebidos.append(payload)
            else:
                # Typically an .eml sent as application/octet-stream: the
                # payload is raw (possibly base64-encoded) message source.
                datos = parte.get_payload(decode=True)
                if datos:
                    embebidos.append(
                        BytesParser(policy=policy.default).parsebytes(datos)
                    )

        mensajes = []
        for msg in embebidos:
            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
        return mensajes

    except Exception as e:
        print(f"Error procesando email adjunto: {e}")
        return []


def procesar_eml(ruta_archivo, dir_adjuntos):
    """
    Main entry point: parse the .eml file at ruta_archivo.

    Returns the list of extracted messages (the message itself plus any
    emails attached to it), or [] if the file cannot be opened or parsed.
    Attachments are stored through guardar_adjunto using dir_adjuntos.
    """
    try:
        print(f" 📧 Abriendo archivo: {ruta_archivo}")
        with open(ruta_archivo, "rb") as eml:
            mensaje = BytesParser(policy=policy.default).parse(eml)
        mensajes = procesar_eml_interno(mensaje, dir_adjuntos)
        print(f" 📧 Procesamiento completado: {len(mensajes)} mensajes extraídos")
        return mensajes
    except Exception as e:
        print(f"❌ Error al abrir el archivo {ruta_archivo}: {e}")
        return []


def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Process one parsed email message, coming from a file or from an attachment.

    Returns a list with the messages found in attached emails followed by the
    message itself (only when it has content, a subject or attachments). The
    first HTML body part is preferred; text/plain is used only when the
    message has no HTML body.
    """
    mensajes = []

    try:
        remitente = mensaje.get("from", "")
        fecha_str = mensaje.get("date", "")
        fecha = _parsear_fecha(fecha_str)

        # Subject from the email headers first; the body is only a fallback.
        subject = mensaje.get("subject", "")
        if subject:
            try:
                subject = str(
                    email.header.make_header(email.header.decode_header(subject))
                )
            except (LookupError, UnicodeError, ValueError,
                    email.errors.HeaderParseError):
                # A badly encoded subject must not discard the whole message.
                subject = str(subject)

        contenido = ""
        adjuntos = []

        if mensaje.is_multipart():
            partes = list(_iter_partes(mensaje))

            # First pass: is there an HTML body (not an attached .html file)?
            tiene_html = any(
                parte.get_content_type() == "text/html"
                and parte.get_content_disposition() != "attachment"
                for parte in partes
            )

            # Second pass: body, attached emails and other attachments.
            contenido_set = False  # the main body is captured only once
            for parte in partes:
                content_type = parte.get_content_type()
                try:
                    es_adjunto = parte.get_content_disposition() == "attachment"
                    es_cuerpo = not es_adjunto and (
                        content_type == "text/html"
                        or (content_type == "text/plain" and not tiene_html)
                    )

                    # 1. Main content: first usable body part.
                    if not contenido_set and es_cuerpo:
                        if content_type == "text/html":
                            html_content = _get_payload_safely(parte)
                            if html_content:
                                part_subject, text = _html_a_markdown(html_content)
                                if not subject and part_subject:
                                    subject = part_subject
                                if text:
                                    contenido = text
                                    contenido_set = True
                        else:  # text/plain
                            text = _get_payload_safely(parte)
                            if text:
                                contenido = text
                                contenido_set = True

                    # 2. Attached RFC822 emails.
                    elif content_type == "message/rfc822":
                        mensajes.extend(_procesar_email_adjunto(parte, dir_adjuntos))

                    # 3. Other attachments.
                    elif es_adjunto:
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith(".eml"):
                            mensajes.extend(
                                _procesar_email_adjunto(parte, dir_adjuntos)
                            )
                        else:
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)

                except Exception as e:
                    print(f"Error procesando parte del mensaje: {e}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                html_content = _get_payload_safely(mensaje)
                if html_content:
                    part_subject, contenido = _html_a_markdown(html_content)
                    if not subject and part_subject:
                        subject = part_subject
            else:
                contenido = _get_payload_safely(mensaje) or ""

        # Only keep the message when it carries something useful.
        if contenido or subject or adjuntos:
            mensaje_nuevo = MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos,
            )
            print(" ✉️ Mensaje extraído:")
            print(f" - Subject: {subject}")
            print(f" - Remitente: {remitente}")
            print(f" - Fecha: {fecha}")
            print(f" - Adjuntos: {len(adjuntos)} archivos")
            print(f" - Contenido: {len(contenido)} caracteres")
            print(f" - Hash generado: {mensaje_nuevo.hash}")
            mensajes.append(mensaje_nuevo)
        else:
            print(" ⚠️ Mensaje vacío o sin contenido útil - no se agregará")

    except Exception as e:
        print(f"Error procesando mensaje: {e}")

    return mensajes


def _parsear_fecha(fecha_str):
    """
    Parse an email Date header into a naive datetime.

    RFC 2822 dates are parsed with parsedate_to_datetime and the timezone
    information is removed without converting the time. As a fallback,
    Italian dates such as "venerdì 12 marzo 2021 10:30" are recognised, with
    any weekday in front. When nothing can be parsed, the current local time
    is returned.
    """
    try:
        return parsedate_to_datetime(fecha_str).replace(tzinfo=None)
    except Exception:
        # Best effort: the exception type for an unparsable date differs
        # between Python versions, so fall through to the manual parser.
        pass

    for coincidencia in _FECHA_IT_RE.finditer(str(fecha_str or "")):
        dia, mes, anio, hora, minuto = coincidencia.groups()
        mes_num = _MESES_IT.get(mes.lower())
        if mes_num is None:
            # Not an Italian month name: do not invent a month.
            continue
        try:
            return datetime(int(anio), mes_num, int(dia), int(hora), int(minuto))
        except ValueError:
            # Impossible date or time (e.g. day 31 in a 30-day month).
            continue

    return datetime.now()