# EmailCrono/utils/email_parser.py

# utils/email_parser.py
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime
import re
from pathlib import Path
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto

def _html_a_markdown(html):
    """Flatten an HTML e-mail body into text with Markdown-style tables.

    Every ``<table>`` is replaced by a pipe table (its first row is used as
    the header row), every ``<br>`` becomes a newline, and the remaining
    markup is dropped with ``get_text()``.  Afterwards, lines that start with
    one of the quoted-header labels ``Da:``, ``Inviato:``, ``A:``, ``From:``,
    ``Sent:`` or ``To:`` are removed.  A line starting with ``Oggetto: `` is
    removed too and its value is returned as the subject.

    Args:
        html: HTML markup of the message body.

    Returns:
        A ``(subject, text)`` tuple.  ``subject`` is ``None`` when no
        ``Oggetto: `` line exists; if several exist, the last one wins.
    """
    def _fila_md(textos, anchos):
        # Pad each cell to its column width and wrap the row in pipes.
        rellenas = [valor.ljust(anchos[col]) for col, valor in enumerate(textos)]
        return '| ' + ' | '.join(rellenas) + ' |'

    documento = BeautifulSoup(html, 'html.parser')

    for tabla in documento.find_all('table'):
        filas = tabla.find_all('tr')
        if not filas:
            continue

        # Stripped text of every cell, row by row.
        celdas = [
            [celda.get_text().strip() for celda in fila.find_all(['th', 'td'])]
            for fila in filas
        ]

        # Widest text seen in each column; rows may have different lengths.
        anchos = []
        for textos in celdas:
            anchos.extend([0] * (len(textos) - len(anchos)))
            for col, valor in enumerate(textos):
                if len(valor) > anchos[col]:
                    anchos[col] = len(valor)

        separador = '|' + '|'.join('-' * (ancho + 2) for ancho in anchos) + '|'
        lineas_tabla = [_fila_md(celdas[0], anchos), separador]
        lineas_tabla.extend(_fila_md(textos, anchos) for textos in celdas[1:])

        # A leading newline keeps the table apart from the preceding text.
        tabla.replace_with(documento.new_string('\n' + '\n'.join(lineas_tabla)))

    # Explicit line breaks become real newlines.
    for salto in documento.find_all('br'):
        salto.replace_with('\n')

    subject = None
    conservadas = []
    etiquetas_cabecera = ('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')

    for linea in documento.get_text().split('\n'):
        if linea.startswith('Oggetto: '):
            # Capture the subject and drop the line from the body.
            subject = linea[9:].strip()
        elif not linea.startswith(etiquetas_cabecera):
            # Everything else is kept untouched, spacing included.
            conservadas.append(linea)

    return subject, '\n'.join(conservadas)

def _decodificar_parte(parte):
    """Return the decoded text payload of a single, non-multipart MIME part.

    The charset declared by the part is used, defaulting to UTF-8.  If the
    declared charset is not known to Python, UTF-8 is used instead of raising
    ``LookupError``.  Undecodable bytes are ignored.
    """
    datos = parte.get_payload(decode=True) or b''
    charset = parte.get_content_charset() or 'utf-8'
    try:
        return datos.decode(charset, errors='ignore')
    except LookupError:
        # The sender declared a charset label Python has no codec for.
        return datos.decode('utf-8', errors='ignore')


def procesar_eml(ruta_archivo, dir_adjuntos):
    """Parse an ``.eml`` file into a list containing one ``MensajeEmail``.

    For multipart messages every part is visited: parts whose disposition is
    ``attachment`` are handed to ``guardar_adjunto``; ``text/plain`` parts are
    appended to the body as-is; ``text/html`` parts are converted with
    ``_html_a_markdown`` and appended.  The subject is the first non-empty
    one extracted from an HTML part (the ``Subject`` header is not read).

    The attachment check runs first so that attached ``.txt``/``.html`` files
    are saved as attachments rather than being inlined into the body.

    NOTE(review): plain and HTML parts are both appended, so a
    ``multipart/alternative`` message likely yields its text twice — confirm
    whether that is intended.

    Args:
        ruta_archivo: Path of the ``.eml`` file to read.
        dir_adjuntos: Directory passed through to ``guardar_adjunto``.

    Returns:
        A one-element list with the resulting ``MensajeEmail``.
    """
    with open(ruta_archivo, 'rb') as eml:
        mensaje = BytesParser(policy=policy.default).parse(eml)

    remitente = mensaje.get('from', '')
    fecha = _parsear_fecha(mensaje.get('date', ''))

    subject = None
    adjuntos = []
    fragmentos = []

    if mensaje.is_multipart():
        for parte in mensaje.walk():
            # Attachments first: a text/plain or text/html *attachment* must
            # not be mistaken for body text.
            if parte.get_content_disposition() == 'attachment':
                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                if ruta_adjunto:
                    adjuntos.append(Path(ruta_adjunto).name)
                continue

            tipo = parte.get_content_type()
            if tipo == "text/plain":
                fragmentos.append(_decodificar_parte(parte))
            elif tipo == "text/html":
                part_subject, texto = _html_a_markdown(_decodificar_parte(parte))
                if part_subject and not subject:
                    subject = part_subject
                fragmentos.append(texto)
    elif mensaje.get_content_type() == "text/html":
        subject, texto = _html_a_markdown(_decodificar_parte(mensaje))
        fragmentos.append(texto)
    else:
        fragmentos.append(_decodificar_parte(mensaje))

    contenido = ''.join(fragmentos)

    return [MensajeEmail(remitente=remitente, fecha=fecha, contenido=contenido,
                         subject=subject, adjuntos=adjuntos)]

def _parsear_fecha(fecha_str):
    """Parse an e-mail date string into a timezone-naive ``datetime``.

    Two formats are tried, in order:

    1. An RFC 2822 date (the usual ``Date:`` header value), parsed with
       ``email.utils.parsedate_to_datetime``.  The timezone is dropped
       without conversion, so the sender's wall-clock time is kept.
    2. An Italian long date such as ``venerdì 5 gennaio 2024 10:30``.  Any
       weekday is accepted, and the accented ``ì`` may also appear
       quoted-printable encoded (``=EC`` or ``=C3=AC``).  An unrecognised
       month name falls back to January.

    Args:
        fecha_str: Raw date text.

    Returns:
        The parsed ``datetime``; if neither format matches, the current
        local time as a best-effort fallback.
    """
    try:
        # Remove timezone info so every returned value is naive.
        return parsedate_to_datetime(fecha_str).replace(tzinfo=None)
    except (TypeError, ValueError, IndexError, OverflowError):
        # Not an RFC 2822 date (TypeError on older Pythons, ValueError on
        # newer ones); fall through to the Italian format.
        pass

    patron_it = (
        r'(?:(?:luned|marted|mercoled|gioved|venerd)(?:=EC|=C3=AC|\u00ec)'
        r'|sabato|domenica)'
        r' (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})'
    )
    meses_it = {
        'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
        'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
        'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12,
    }

    try:
        coincidencia = re.search(patron_it, fecha_str, re.IGNORECASE)
        if coincidencia:
            dia, mes, anio, hora, minuto = coincidencia.groups()
            mes_num = meses_it.get(mes.lower(), 1)
            return datetime(int(anio), mes_num, int(dia), int(hora), int(minuto))
    except (TypeError, ValueError):
        # Non-string input, or numbers that do not form a valid date.
        pass

    return datetime.now()