EmailCrono/utils/email_parser.py

134 lines
5.2 KiB
Python

# utils/email_parser.py
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime
import re
from pathlib import Path
from bs4 import BeautifulSoup
from email.utils import parsedate_to_datetime
from models.mensaje_email import MensajeEmail
from utils.attachment_handler import guardar_adjunto
def _tabla_a_markdown(table):
    """Render a parsed ``<table>`` element as Markdown table text.

    The first ``<tr>`` is used as the header row. Every column is padded to
    the width of its longest cell, and rows with fewer cells than the widest
    row are padded with empty cells so that every line has the same number
    of columns as the separator row.

    Args:
        table: A BeautifulSoup tag for a ``<table>`` element.

    Returns:
        The Markdown text, prefixed with a newline, or ``None`` when the
        table contains no ``<tr>`` rows.
    """
    filas = [
        [celda.get_text().strip() for celda in fila.find_all(['th', 'td'])]
        for fila in table.find_all('tr')
    ]
    if not filas:
        return None

    num_columnas = max(len(fila) for fila in filas)
    anchos = [0] * num_columnas
    for fila in filas:
        for i, texto in enumerate(fila):
            anchos[i] = max(anchos[i], len(texto))

    def formatear(fila):
        # Pad ragged rows: a header/body row with fewer cells than the
        # separator row would not be a well-formed Markdown table.
        celdas = fila + [''] * (num_columnas - len(fila))
        cuerpo = ' | '.join(
            texto.ljust(ancho) for texto, ancho in zip(celdas, anchos)
        )
        return '| ' + cuerpo + ' |'

    separador = '|' + '|'.join('-' * (ancho + 2) for ancho in anchos) + '|'
    lineas = [formatear(filas[0]), separador]
    lineas.extend(formatear(fila) for fila in filas[1:])
    return '\n' + '\n'.join(lineas)


def _html_a_markdown(html):
    """Convert an HTML email body to text and pull out an inline subject.

    Tables are rewritten as Markdown tables, ``<br>`` tags become newlines,
    and the remaining markup is reduced to its text content. Lines of that
    text starting with ``'Oggetto: '`` are removed and used as the subject
    (the last such line wins); lines starting with one of the quoted-header
    prefixes ``Da:``, ``Inviato:``, ``A:``, ``From:``, ``Sent:`` or ``To:``
    (each followed by a space) are dropped. All other lines are kept
    verbatim, including their spacing.

    Args:
        html: The HTML source as a string.

    Returns:
        A ``(subject, text)`` tuple. ``subject`` is ``None`` when no
        ``'Oggetto: '`` line is present.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Convert tables first, while their cells are still parsed elements.
    for table in soup.find_all('table'):
        markdown = _tabla_a_markdown(table)
        if markdown is not None:
            table.replace_with(soup.new_string(markdown))

    # Line breaks carry no text of their own, so make them explicit.
    for br in soup.find_all('br'):
        br.replace_with('\n')

    text = soup.get_text()

    prefijo_asunto = 'Oggetto: '
    cabeceras_omitidas = ('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')
    subject = None
    cleaned_lines = []
    for line in text.split('\n'):
        if line.startswith(prefijo_asunto):
            subject = line[len(prefijo_asunto):].strip()
            continue
        if line.startswith(cabeceras_omitidas):
            continue
        # Keep the line untouched so the original spacing survives.
        cleaned_lines.append(line)

    return subject, '\n'.join(cleaned_lines)
def _decodificar_texto(parte):
    """Return the decoded text payload of a message part.

    The part's declared charset is used when present, otherwise UTF-8.
    Undecodable bytes are dropped. A charset label Python does not know
    falls back to UTF-8 instead of raising ``LookupError``. A part without
    a decodable payload yields an empty string.
    """
    datos = parte.get_payload(decode=True)
    if datos is None:
        return ""
    charset = parte.get_content_charset() or 'utf-8'
    try:
        return datos.decode(charset, errors='ignore')
    except LookupError:
        return datos.decode('utf-8', errors='ignore')


def procesar_eml(ruta_archivo, dir_adjuntos):
    """Parse an ``.eml`` file into a list holding one ``MensajeEmail``.

    For multipart messages every part is visited: parts whose content
    disposition is ``attachment`` are handed to ``guardar_adjunto`` and
    their file names collected; ``text/plain`` parts are appended to the
    content as-is; ``text/html`` parts are converted with
    ``_html_a_markdown`` and appended. For single-part messages the payload
    is converted (HTML) or decoded (anything else).

    The subject comes only from an ``'Oggetto: '`` line found in an HTML
    body; the first HTML part that provides one wins.
    NOTE(review): the message's own ``Subject`` header is never read --
    confirm this is intended.

    Args:
        ruta_archivo: Path of the ``.eml`` file to read.
        dir_adjuntos: Directory passed through to ``guardar_adjunto``.

    Returns:
        A single-element list containing the ``MensajeEmail`` built from
        the sender, date, content, subject and attachment names.
    """
    with open(ruta_archivo, 'rb') as eml:
        mensaje = BytesParser(policy=policy.default).parse(eml)

    remitente = mensaje.get('from', '')
    fecha = _parsear_fecha(mensaje.get('date', ''))

    subject = None
    adjuntos = []

    if mensaje.is_multipart():
        fragmentos = []
        for parte in mensaje.walk():
            tipo = parte.get_content_type()
            # Check the disposition first: a .txt or .html file sent as an
            # attachment must be saved, not merged into the message body.
            if parte.get_content_disposition() == 'attachment':
                # presumably returns the saved path, or a falsy value when
                # nothing was saved -- verify against guardar_adjunto
                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                if ruta_adjunto:
                    adjuntos.append(Path(ruta_adjunto).name)
            elif tipo == "text/plain":
                fragmentos.append(_decodificar_texto(parte))
            elif tipo == "text/html":
                part_subject, text = _html_a_markdown(_decodificar_texto(parte))
                if part_subject and not subject:
                    subject = part_subject
                fragmentos.append(text)
        # NOTE(review): when a message carries both a plain and an HTML
        # version of the same body, both are appended -- confirm intended.
        contenido = "".join(fragmentos)
    elif mensaje.get_content_type() == "text/html":
        subject, contenido = _html_a_markdown(_decodificar_texto(mensaje))
    else:
        contenido = _decodificar_texto(mensaje)

    return [
        MensajeEmail(
            remitente=remitente,
            fecha=fecha,
            contenido=contenido,
            subject=subject,
            adjuntos=adjuntos,
        )
    ]
# Italian month names mapped to their month number.
_MESES_IT = {
    'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
    'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
    'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12,
}

# Italian long-form date such as "venerdì 12 gennaio 2024 10:30". The final
# "ì" of a weekday name may also arrive as the raw quoted-printable escape
# "=EC", so both spellings are accepted for every weekday.
_PATRON_FECHA_IT = re.compile(
    r'(?:(?:luned|marted|mercoled|gioved|venerd)(?:=EC|ì)|sabato|domenica)'
    r' (\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})',
    re.IGNORECASE,
)


def _parsear_fecha(fecha_str):
    """Parse an email date string into a naive ``datetime``.

    Parsing is best-effort and never raises:

    1. The string is parsed as a standard email date; any timezone is
       discarded without converting the clock time.
    2. Otherwise an Italian long-form date ("<weekday> <day> <month>
       <year> <hour>:<minute>") is searched for in the string. A month name
       that is not a known Italian month is treated as January.
    3. Otherwise the current local time is returned.

    Args:
        fecha_str: The raw date text, e.g. the value of a ``Date`` header.

    Returns:
        A ``datetime`` without ``tzinfo``.
    """
    try:
        fecha = parsedate_to_datetime(fecha_str)
    except (TypeError, ValueError, IndexError, OverflowError, AttributeError):
        # Depending on the Python version an unparseable value surfaces as
        # TypeError or ValueError; the others guard against odd input.
        fecha = None
    if fecha is not None:
        return fecha.replace(tzinfo=None)  # Remove timezone info

    coincidencia = None
    if isinstance(fecha_str, str):
        coincidencia = _PATRON_FECHA_IT.search(fecha_str)
    if coincidencia:
        dia, mes, anio, hora, minuto = coincidencia.groups()
        mes_num = _MESES_IT.get(mes.lower(), 1)
        try:
            return datetime(int(anio), mes_num, int(dia), int(hora), int(minuto))
        except ValueError:
            # Impossible calendar value (e.g. day 31 in a 30-day month).
            pass

    return datetime.now()