# utils/email_parser.py
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime
import re
from pathlib import Path
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto

def _tabla_a_markdown(table):
    """Render one parsed ``<table>`` element as Markdown table text.

    The first ``<tr>`` is used as the header row. Every column is padded to
    the widest stripped cell text found in that column. Rows that have fewer
    cells than the widest row are padded with empty cells, so the header row
    always has the same number of cells as the separator row.

    Returns the table text (prefixed with a newline), or ``None`` when the
    table contains no ``<tr>`` elements.
    """
    # Stripped text of every cell, row by row (header and data cells alike).
    filas = [
        [celda.get_text().strip() for celda in fila.find_all(['th', 'td'])]
        for fila in table.find_all('tr')
    ]
    if not filas:
        return None

    # One width per column: the longest cell text seen in that column.
    anchos = [0] * max(len(fila) for fila in filas)
    for fila in filas:
        for i, texto in enumerate(fila):
            anchos[i] = max(anchos[i], len(texto))

    def _linea(textos):
        # Pad short rows so every line has exactly len(anchos) cells; a header
        # with fewer cells than the separator is not a valid Markdown table.
        completos = textos + [''] * (len(anchos) - len(textos))
        return '| ' + ' | '.join(
            texto.ljust(ancho) for texto, ancho in zip(completos, anchos)
        ) + ' |'

    separador = '|' + '|'.join('-' * (ancho + 2) for ancho in anchos) + '|'

    lineas = [_linea(filas[0]), separador]
    lineas.extend(_linea(fila) for fila in filas[1:])
    return '\n' + '\n'.join(lineas)


def _html_a_markdown(html):
    """Convert an HTML email body to plain text with Markdown tables.

    Steps:
      1. Normalise NBSP to a regular space and CRLF / CR to LF.
      2. Replace every ``<table>`` that has rows with a Markdown table.
      3. Replace every ``<br>`` with a newline.
      4. Take the document text and drop lines that start with one of the
         basic header prefixes (``Da: ``, ``Inviato: ``, ``A: ``, ``From: ``,
         ``Sent: ``, ``To: ``). Lines starting with ``Oggetto: `` are also
         dropped, and their remainder is captured as the subject.

    Args:
        html: HTML source as ``str``.

    Returns:
        A ``(subject, text)`` tuple. ``subject`` is the stripped remainder of
        the last ``Oggetto: `` line found, or ``None`` if there is none.
        ``text`` is the remaining lines joined with ``\\n``, spacing untouched.
    """
    # Clean up special characters in the HTML first.
    html = html.replace('\xa0', ' ')   # NBSP -> regular space
    html = html.replace('\r\n', '\n')  # CRLF -> LF
    html = html.replace('\r', '\n')    # CR -> LF

    soup = BeautifulSoup(html, 'html.parser')

    # Convert tables, keeping all newlines.
    # NOTE(review): find_all('table') also returns nested tables, and the
    # outer table's rows/cells include the nested ones -- confirm whether
    # layout tables need special handling.
    for table in soup.find_all('table'):
        markdown = _tabla_a_markdown(table)
        if markdown is not None:
            table.replace_with(soup.new_string(markdown))

    # Line breaks become real newlines before the text is extracted.
    for br in soup.find_all('br'):
        br.replace_with('\n')

    text = soup.get_text()

    prefijo_asunto = 'Oggetto: '
    prefijos_cabecera = ('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')

    cleaned_lines = []
    subject = None
    for line in text.split('\n'):
        # Extract the subject if present; the line itself is not kept.
        if line.startswith(prefijo_asunto):
            subject = line[len(prefijo_asunto):].strip()
            continue

        # Skip only the most basic email headers.
        if line.startswith(prefijos_cabecera):
            continue

        # Keep the line as is, with all its spacing.
        cleaned_lines.append(line)

    # Join lines preserving all newlines.
    return subject, '\n'.join(cleaned_lines)

def _decodificar_texto(parte):
    """Decode the transfer-decoded payload of a message part to ``str``.

    Uses the charset declared by the part, defaulting to UTF-8 when none is
    declared. Undecodable bytes are dropped (``errors='ignore'``). If the
    declared charset is not a codec Python knows, UTF-8 is used instead of
    letting ``LookupError`` propagate. Returns ``""`` when the part has no
    decodable payload (``get_payload(decode=True)`` returned ``None``).
    """
    datos = parte.get_payload(decode=True)
    if datos is None:
        return ""
    charset = parte.get_content_charset() or 'utf-8'
    try:
        return datos.decode(charset, errors='ignore')
    except LookupError:
        # errors='ignore' only covers bad bytes, not an unknown codec name.
        return datos.decode('utf-8', errors='ignore')


def procesar_eml(ruta_archivo, dir_adjuntos):
    """Parse an ``.eml`` file into a single-element list of ``MensajeEmail``.

    Body selection:
      * Parts whose Content-Disposition is ``attachment`` are never used as
        body text; they are passed to ``guardar_adjunto`` and, when it
        returns a truthy path, the file name is recorded.
      * If any non-attachment ``text/html`` part exists, HTML is converted
        with ``_html_a_markdown`` and ``text/plain`` parts are ignored.
        When several HTML parts exist, the last one provides the content and
        the first subject found is kept.
      * Otherwise the (last) ``text/plain`` part is used as-is.
      * A non-multipart message is decoded directly, going through
        ``_html_a_markdown`` when its content type is ``text/html``.

    Args:
        ruta_archivo: Path of the ``.eml`` file to read.
        dir_adjuntos: Directory handed to ``guardar_adjunto`` for attachments.

    Returns:
        ``[MensajeEmail(...)]`` built from the ``From`` header, the parsed
        ``Date`` header, the body text, the subject extracted from the HTML
        body (``None`` if not found) and the list of attachment file names.
    """
    with open(ruta_archivo, 'rb') as eml:
        mensaje = BytesParser(policy=policy.default).parse(eml)

    remitente = mensaje.get('from', '')
    fecha = _parsear_fecha(mensaje.get('date', ''))

    contenido = ""
    subject = None
    adjuntos = []

    if mensaje.is_multipart():
        partes = list(mensaje.walk())

        # An attached .html file must not count as the message's HTML body.
        tiene_html = any(
            parte.get_content_type() == "text/html"
            and parte.get_content_disposition() != 'attachment'
            for parte in partes
        )

        for parte in partes:
            # Attachments are checked first so that attached text/html or
            # text/plain files are saved instead of overwriting the body.
            if parte.get_content_disposition() == 'attachment':
                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                if ruta_adjunto:
                    adjuntos.append(Path(ruta_adjunto).name)
                continue

            tipo = parte.get_content_type()
            if tipo == "text/html":
                part_subject, text = _html_a_markdown(_decodificar_texto(parte))
                if part_subject and not subject:
                    subject = part_subject
                contenido = text  # Replace rather than concatenate.
            elif tipo == "text/plain" and not tiene_html:
                # Plain text is only used when there is no HTML body.
                contenido = _decodificar_texto(parte)
    elif mensaje.get_content_type() == "text/html":
        subject, contenido = _html_a_markdown(_decodificar_texto(mensaje))
    else:
        contenido = _decodificar_texto(mensaje)

    return [MensajeEmail(remitente=remitente, fecha=fecha, contenido=contenido,
                         subject=subject, adjuntos=adjuntos)]

def _parsear_fecha(fecha_str):
    """Parse an email date string into a naive ``datetime`` (best effort).

    Parsing is attempted in this order:
      1. RFC 2822 style dates via ``email.utils.parsedate_to_datetime``; the
         timezone is discarded without converting the wall-clock time.
      2. Italian long dates of the form
         ``<weekday> <day> <month name> <year> <hour>:<minute>``, where the
         weekday may be any of the seven Italian day names, written either
         with a real ``ì`` or with the quoted-printable residue ``=EC``
         (e.g. ``venerd=EC 5 gennaio 2024 10:30``). Matching is
         case-insensitive.

    Args:
        fecha_str: Raw date text (typically the ``Date`` header value).

    Returns:
        The parsed naive ``datetime``. If nothing can be parsed, the current
        local time (``datetime.now()``) is returned instead of raising.
    """
    if not isinstance(fecha_str, str) or not fecha_str.strip():
        return datetime.now()

    try:
        # Drop the timezone info, keeping the sender's wall-clock time.
        return parsedate_to_datetime(fecha_str).replace(tzinfo=None)
    except (TypeError, ValueError, IndexError, OverflowError):
        # Unparseable input raises TypeError or ValueError depending on the
        # Python version; fall through to the Italian format.
        pass

    meses_it = {
        'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
        'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
        'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12,
    }
    # Weekdays ending in "ì" may appear decoded (\u00ec) or as "=EC".
    patron = (
        r'(?:(?:luned|marted|mercoled|gioved|venerd)(?:=EC|\u00ec)|sabato|domenica)'
        r' (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})'
    )
    coincidencia = re.search(patron, fecha_str, re.IGNORECASE)
    if coincidencia:
        dia, mes, anio, hora, minuto = coincidencia.groups()
        # NOTE: an unrecognised month name falls back to January (kept from
        # the original behaviour).
        mes_num = meses_it.get(mes.lower(), 1)
        try:
            return datetime(int(anio), mes_num, int(dia), int(hora), int(minuto))
        except ValueError:
            # Out-of-range day/hour/minute: treat as unparseable.
            pass

    return datetime.now()