# utils/email_parser.py
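#
# Parses .eml files into MensajeEmail objects: it extracts sender, date and
# subject, converts HTML bodies to markdown-style text, recurses into attached
# message/rfc822 / .eml messages, and saves other attachments to disk via
# utils.attachment_handler.guardar_adjunto.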

import email
import email.header
import email.message
import os
import re
import tempfile
from datetime import datetime
from email import policy
from email.parser import BytesParser
from email.utils import parsedate_to_datetime
from pathlib import Path

from bs4 import BeautifulSoup

from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto


def _get_payload_safely(parte):
    """
    Safely extracts and decodes the payload of an email part.
    """
    try:
        if parte.is_multipart():
            return None
        payload = parte.get_payload(decode=True)
        if payload is None:
            return None
        charset = parte.get_content_charset() or 'utf-8'
        return payload.decode(charset, errors='ignore')
    except Exception as e:
        print(f"Error getting payload: {str(e)}")
        return None


def _extract_subject_from_text(text):
    """
    Extracts the subject from plain text, checking several header formats.
    """
    subject_headers = (
        'Oggetto: ',   # Italian
        'Subject: ',   # English
        'Asunto: ',    # Spanish
        'Sujet: ',     # French
        'Betreff: ',   # German
    )

    for line in text.split('\n'):
        line = line.strip()
        for header in subject_headers:
            if line.startswith(header):
                return line[len(header):].strip()
    return None


def _should_skip_line(line):
    """
    Returns True if the line is a quoted email header and should be skipped.
    """
    headers_to_skip = [
        'Da: ', 'Inviato: ', 'A: ',      # Italian
        'From: ', 'Sent: ', 'To: ',      # English
        'De: ', 'Enviado: ', 'Para: ',   # Spanish
        'Von: ', 'Gesendet: ', 'An: ',   # German
        'De : ', 'Envoyé : ', 'À : '     # French
    ]
    return any(line.strip().startswith(header) for header in headers_to_skip)


def _html_a_markdown(html):
    """
    Converts HTML content to markdown-style text, extracting the subject if present.
    Returns a (subject, text) tuple; subject is None when no subject header is found.
    """
    if html is None:
        return (None, "")

    try:
        # Basic clean-up
        html = html.replace('\xa0', ' ')   # NBSP to regular space
        html = html.replace('\r\n', '\n')  # CRLF to LF
        html = html.replace('\r', '\n')    # CR to LF

        soup = BeautifulSoup(html, 'html.parser')

        # Convert tables to markdown
        for table in soup.find_all('table'):
            try:
                rows = table.find_all('tr')
                if not rows:
                    continue

                markdown_table = []
                max_widths = []

                # Compute the maximum width of each column
                for row in rows:
                    cells = row.find_all(['th', 'td'])
                    while len(max_widths) < len(cells):
                        max_widths.append(0)
                    for i, cell in enumerate(cells):
                        cell_text = cell.get_text().strip()
                        max_widths[i] = max(max_widths[i], len(cell_text))

                # Build the markdown table
                if max_widths:  # Only if there are valid cells
                    header_row = rows[0].find_all(['th', 'td'])
                    header = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i])
                                               for i, cell in enumerate(header_row)) + ' |'
                    separator = '|' + '|'.join('-' * (width + 2) for width in max_widths) + '|'

                    markdown_table.append(header)
                    markdown_table.append(separator)

                    for row in rows[1:]:
                        cells = row.find_all(['td', 'th'])
                        row_text = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i])
                                                     for i, cell in enumerate(cells)) + ' |'
                        markdown_table.append(row_text)

                table.replace_with(soup.new_string('\n' + '\n'.join(markdown_table)))
            except Exception as e:
                print(f"Error processing table: {str(e)}")
                continue

        # Convert line breaks
        for br in soup.find_all('br'):
            br.replace_with('\n')

        # Get the clean text
        text = soup.get_text()

        # Process lines: pick up the subject and drop quoted header lines
        cleaned_lines = []
        subject = None

        for line in text.split('\n'):
            if not subject:
                subject = _extract_subject_from_text(line)

            if not _should_skip_line(line):
                cleaned_lines.append(line)

        final_text = '\n'.join(cleaned_lines).strip()
        return (subject, final_text)

    except Exception as e:
        print(f"Error in _html_a_markdown: {str(e)}")
        return (None, html if html else "")


def _procesar_email_adjunto(parte, dir_adjuntos):
    """
    Processes an email that arrives as an attachment inside another email.
    """
    try:
        mensajes = []
        if parte.is_multipart():
            # Multipart: walk every subpart
            for subparte in parte.walk():
                if subparte.get_content_type() == "message/rfc822":
                    # RFC 822 message: the payload may be a list of messages
                    payload = subparte.get_payload()
                    if isinstance(payload, list):
                        for msg in payload:
                            mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
                    elif isinstance(payload, email.message.Message):
                        mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))
        else:
            # Not multipart: try to process it as a single message
            payload = parte.get_payload()
            if isinstance(payload, list):
                for msg in payload:
                    mensajes.extend(procesar_eml_interno(msg, dir_adjuntos))
            elif isinstance(payload, email.message.Message):
                mensajes.extend(procesar_eml_interno(payload, dir_adjuntos))

        return mensajes
    except Exception as e:
        print(f"Error processing attached email: {str(e)}")
        return []


def procesar_eml(ruta_archivo, dir_adjuntos):
    """
    Main entry point for processing .eml files.
    """
    try:
        with open(ruta_archivo, 'rb') as eml:
            mensaje = BytesParser(policy=policy.default).parse(eml)
            return procesar_eml_interno(mensaje, dir_adjuntos)
    except Exception as e:
        print(f"Error opening file {ruta_archivo}: {str(e)}")
        return []


def procesar_eml_interno(mensaje, dir_adjuntos):
    """
    Processes an email message, whether it comes from a file or from an attachment.
    """
    mensajes = []

    try:
        remitente = mensaje.get('from', '')
        fecha_str = mensaje.get('date', '')
        fecha = _parsear_fecha(fecha_str)

        # Get the subject from the email headers first
        subject = mensaje.get('subject', '')
        if subject:
            # Decode it in case it is RFC 2047-encoded
            subject = str(email.header.make_header(email.header.decode_header(subject)))

        contenido = ""
        adjuntos = []
        tiene_html = False

        # First pass: check for HTML content
        if mensaje.is_multipart():
            for parte in mensaje.walk():
                if parte.get_content_type() == "text/html":
                    tiene_html = True
                    break
        else:
            tiene_html = mensaje.get_content_type() == "text/html"

        # Second pass: process content and attachments
        if mensaje.is_multipart():
            for parte in mensaje.walk():
                content_type = parte.get_content_type()

                try:
                    if content_type == "text/html":
                        html_content = _get_payload_safely(parte)
                        if html_content:
                            part_subject, text = _html_a_markdown(html_content)
                            if not subject and part_subject:
                                subject = part_subject
                            if text:
                                contenido = text
                    elif content_type == "text/plain" and not tiene_html:
                        text = _get_payload_safely(parte)
                        if text:
                            contenido = text
                    elif content_type == "message/rfc822":
                        # Process an attached email
                        mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                        mensajes.extend(mensajes_adjuntos)
                    elif parte.get_content_disposition() == 'attachment':
                        nombre = parte.get_filename()
                        if nombre and nombre.lower().endswith('.eml'):
                            # Attached .eml file
                            mensajes_adjuntos = _procesar_email_adjunto(parte, dir_adjuntos)
                            mensajes.extend(mensajes_adjuntos)
                        else:
                            # Any other kind of attachment
                            ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                            if ruta_adjunto:
                                adjuntos.append(Path(ruta_adjunto).name)
                except Exception as e:
                    print(f"Error processing message part: {str(e)}")
                    continue
        else:
            if mensaje.get_content_type() == "text/html":
                html_content = _get_payload_safely(mensaje)
                if html_content:
                    part_subject, contenido = _html_a_markdown(html_content)
                    if not subject and part_subject:
                        subject = part_subject
            else:
                contenido = _get_payload_safely(mensaje) or ""

        # Only add the message if it has useful content
        if contenido or subject or adjuntos:
            mensajes.append(MensajeEmail(
                remitente=remitente,
                fecha=fecha,
                contenido=contenido,
                subject=subject,
                adjuntos=adjuntos
            ))

    except Exception as e:
        print(f"Error processing message: {str(e)}")

    return mensajes


def _parsear_fecha(fecha_str):
    """
    Parses an email date header into a naive datetime.
    Falls back to a quoted-printable Italian date pattern, then to datetime.now().
    """
    try:
        fecha = parsedate_to_datetime(fecha_str)
        return fecha.replace(tzinfo=None)  # Remove timezone info
    except Exception:
        try:
            # Fallback for quoted-printable Italian dates, e.g. "venerd=EC 31 gennaio 2025 10:30"
            fecha_match = re.search(r'venerd=EC (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})', fecha_str)
            if fecha_match:
                dia, mes, año, hora, minuto = fecha_match.groups()
                meses_it = {
                    'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
                    'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
                    'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12
                }
                mes_num = meses_it.get(mes.lower(), 1)
                return datetime(int(año), mes_num, int(dia), int(hora), int(minuto))
        except Exception:
            pass
    return datetime.now()
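

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumes MensajeEmail exposes its constructor arguments
# (remitente, fecha, subject, contenido, adjuntos) as attributes; the default
# paths below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    ruta_eml = sys.argv[1] if len(sys.argv) > 1 else "ejemplo.eml"    # hypothetical sample file
    dir_adjuntos = sys.argv[2] if len(sys.argv) > 2 else "adjuntos"   # hypothetical output dir
    os.makedirs(dir_adjuntos, exist_ok=True)

    for msg in procesar_eml(ruta_eml, dir_adjuntos):
        print(f"From:        {msg.remitente}")
        print(f"Date:        {msg.fecha}")
        print(f"Subject:     {msg.subject}")
        print(f"Attachments: {msg.adjuntos}")
        print((msg.contenido or "")[:200])
        print("-" * 40)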