# utils/email_parser.py
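#
# Parses .eml files into MensajeEmail objects: it extracts sender, date and
# subject, converts HTML bodies to markdown-style text, recurses into attached
# message/rfc822 / .eml messages, and saves other attachments to disk via
# utils.attachment_handler.guardar_adjunto.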

import email
import email.header
import email.message
import os
import re
import tempfile
from datetime import datetime
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
from pathlib import Path

from bs4 import BeautifulSoup

from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto


def _get_payload_safely(parte):
    """
    Safely extracts and decodes the payload of an email part.
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or 'utf-8'
        return payload.decode(charset, errors='ignore')
    except Exception as e:
        print(f"Error getting payload: {str(e)}")
        return None


def _extract_subject_from_text(text):
    """
    Extracts the subject from plain text, checking several header formats.
    """
    subject_headers = (
        'Oggetto: ',   # Italian
        'Subject: ',   # English
        'Asunto: ',    # Spanish
        'Sujet: ',     # French
        'Betreff: ',   # German
    )

    for line in text.split('\n'):
        line = line.strip()
        for header in subject_headers:
            if line.startswith(header):
                return line[len(header):].strip()
    return None


def _should_skip_line(line):
    """
    Returns True if the line is a quoted email header and should be skipped.
    """
    headers_to_skip = [
        'Da: ', 'Inviato: ', 'A: ',      # Italian
        'From: ', 'Sent: ', 'To: ',      # English
        'De: ', 'Enviado: ', 'Para: ',   # Spanish
        'Von: ', 'Gesendet: ', 'An: ',   # German
        'De : ', 'Envoyé : ', 'À : '     # French
    ]
    return any(line.strip().startswith(header) for header in headers_to_skip)


def _html_a_markdown(html):
    """
    Converts HTML content to markdown-style text, extracting the subject if present.
    Returns a (subject, text) tuple; subject is None when no subject header is found.
    """
    if html is None:
        return (None, "")

    try:
        # Basic clean-up
        html = html.replace('\xa0', ' ')   # NBSP to regular space
        html = html.replace('\r\n', '\n')  # CRLF to LF
        html = html.replace('\r', '\n')    # CR to LF

        soup = BeautifulSoup(html, 'html.parser')

        # Convert tables to markdown
        for table in soup.find_all('table'):
            try:
                rows = table.find_all('tr')
                if not rows:
                    continue

                markdown_table = []
                max_widths = []

                # Compute the maximum width of each column
                for row in rows:
                    cells = row.find_all(['th', 'td'])
                    while len(max_widths) < len(cells):
                        max_widths.append(0)
                    for i, cell in enumerate(cells):
                        cell_text = cell.get_text().strip()
                        max_widths[i] = max(max_widths[i], len(cell_text))

                # Build the markdown table
                if max_widths:  # Only if there are valid cells
                    header_row = rows[0].find_all(['th', 'td'])
                    header = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i])
                                               for i, cell in enumerate(header_row)) + ' |'
                    separator = '|' + '|'.join('-' * (width + 2) for width in max_widths) + '|'

                    markdown_table.append(header)
                    markdown_table.append(separator)

                    for row in rows[1:]:
                        cells = row.find_all(['td', 'th'])
                        row_text = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i])
                                                     for i, cell in enumerate(cells)) + ' |'
                        markdown_table.append(row_text)

                table.replace_with(soup.new_string('\n' + '\n'.join(markdown_table)))
            except Exception as e:
                print(f"Error processing table: {str(e)}")
                continue

        # Convert line breaks
        for br in soup.find_all('br'):
            br.replace_with('\n')

        # Get the clean text
        text = soup.get_text()

        # Process lines: pick up the subject and drop quoted header lines
        cleaned_lines = []
        subject = None

        for line in text.split('\n'):
            if not subject:
                subject = _extract_subject_from_text(line)

            if not _should_skip_line(line):
                cleaned_lines.append(line)

        final_text = '\n'.join(cleaned_lines).strip()
        return (subject, final_text)

    except Exception as e:
        print(f"Error in _html_a_markdown: {str(e)}")
        return (None, html if html else "")


def _procesar_email_adjunto(parte, dir_adjuntos):
    """
    Processes an email that arrives as an attachment inside another email.
    """
    try:
        mensajes = []
        if parte.is_multipart():
            # Multipart: walk every subpart
            for subparte in parte.walk():
                if subparte.get_content_type() == "message/rfc822":
                    # RFC 822 message: the payload may be a list of messages
                    payload = subparte.get_payload()
                    if isinstance(payload, list):
                        for msg in payload:
                            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
                    elif isinstance(payload, email.message.Message):
                        mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
        else:
            # Not multipart: try to process it as a single message
            payload = parte.get_payload()
            if isinstance(payload, list):
                for msg in payload:
                    mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
            elif isinstance(payload, email.message.Message):
                mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))

        return mensajes
    except Exception as e:
        print(f"Error processing attached email: {str(e)}")
        return []


def procesar_eml(ruta_archivo, dir_adjuntos):
    """
    Main entry point for processing .eml files.
    """
    try:
        with open(ruta_archivo, 'rb') as eml:
            mensaje = BytesParser(policy=policy.default).parse(eml)
            return procesar_eml_interno(mensaje, dir_adjuntos)
    except Exception as e:
        print(f"Error opening file {ruta_archivo}: {str(e)}")
        return []


def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Processes an email message, whether it comes from a file or from an attachment.
    """
    mensajes = []

    try:
        remitente = mensaje.get('from', '')
        fecha_str = mensaje.get('date', '')
        fecha = _parsear_fecha(fecha_str)

        # Get the subject from the email headers first
        subject = mensaje.get('subject', '')
        if subject:
            # Decode it in case it is RFC 2047-encoded
            subject = str(email.header.make_header(email.header.decode_header(subject)))

        contenido = ""
        adjuntos = []
        tiene_html = False

        # First pass: check for HTML content
        if mensaje.is_multipart():
            for parte in mensaje.walk():
                if parte.get_content_type() == "text/html":
                    tiene_html = True
                    break
        else:
            tiene_html = mensaje.get_content_type() == "text/html"

        # Second pass: process content and attachments
        if mensaje.is_multipart():
            for parte in mensaje.walk():
                content_type = parte.get_content_type()

                try:
                    if content_type == "text/html":
                        html_content = _get_payload_safely(parte)
                        if html_content:
                            part_subject, text = _html_a_markdown(html_content)
                            if not subject and part_subject:
                                subject = part_subject
                            if text:
                                contenido = text
                    elif content_type == "text/plain" and not tiene_html:
                        text = _get_payload_safely(parte)
                        if text:
                            contenido = text
                    elif content_type == "message/rfc822":
                        # Process an attached email
                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                        mensajes.extend(mensajes_adjuntos)
                    elif parte.get_content_disposition() == 'attachment':
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith('.eml'):
                            # Attached .eml file
                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                            mensajes.extend(mensajes_adjuntos)
                        else:
                            # Any other kind of attachment
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)
                except Exception as e:
                    print(f"Error processing message part: {str(e)}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                html_content = _get_payload_safely(mensaje)
                if html_content:
                    part_subject, contenido = _html_a_markdown(html_content)
                    if not subject and part_subject:
                        subject = part_subject
            else:
                contenido = _get_payload_safely(mensaje) or ""

        # Only add the message if it has useful content
        if contenido or subject or adjuntos:
            mensajes.append(MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos
            ))

    except Exception as e:
        print(f"Error processing message: {str(e)}")

    return mensajes


def _parsear_fecha(fecha_str):
    """
    Parses an email date header into a naive datetime.
    Falls back to a quoted-printable Italian date pattern, then to datetime.now().
    """
    try:
        fecha = parsedate_to_datetime(fecha_str)
        return fecha.replace(tzinfo=None)  # Remove timezone info
    except Exception:
        try:
            # Fallback for quoted-printable Italian dates, e.g. "venerd=EC 31 gennaio 2025 10:30"
            fecha_match = re.search(r'venerd=EC (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})', fecha_str)
            if fecha_match:
                dia, mes, año, hora, minuto = fecha_match.groups()
                meses_it = {
                    'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
                    'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
                    'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12
                }
                mes_num = meses_it.get(mes.lower(), 1)
                return datetime(int(año), mes_num, int(dia), int(hora), int(minuto))
        except Exception:
            pass
    return datetime.now()
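

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumes MensajeEmail exposes its constructor arguments
# (remitente, fecha, subject, contenido, adjuntos) as attributes; the default
# paths below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    ruta_eml = sys.argv[1] if len(sys.argv) > 1 else "ejemplo.eml"    # hypothetical sample file
    dir_adjuntos = sys.argv[2] if len(sys.argv) > 2 else "adjuntos"   # hypothetical output dir
    os.makedirs(dir_adjuntos, exist_ok=True)

    for msg in procesar_eml(ruta_eml, dir_adjuntos):
        print(f"From:        {msg.remitente}")
        print(f"Date:        {msg.fecha}")
        print(f"Subject:     {msg.subject}")
        print(f"Attachments: {msg.adjuntos}")
        print((msg.contenido or "")[:200])
        print("-" * 40)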