134 lines
5.2 KiB
Python
134 lines
5.2 KiB
Python
# utils/email_parser.py
|
|
import email
|
|
from email import policy
|
|
from email.parser import BytesParser
|
|
from datetime import datetime
|
|
import re
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup
|
|
from email.utils import parsedate_to_datetime
|
|
from models.mensaje_email import MensajeEmail
|
|
from utils.attachment_handler import guardar_adjunto
|
|
|
|
def _html_a_markdown(html):
    """Convert HTML to plain text with Markdown-style tables.

    Every ``<table>`` that contains at least one ``<tr>`` is replaced by
    pipe-delimited text: the first row is rendered as the header, followed
    by a dashed separator line and then the remaining rows. Cells are
    padded to the widest stripped text found in their column. ``<br>``
    tags become newlines.

    The resulting text is then filtered line by line: a line starting with
    ``'Oggetto: '`` supplies the subject (the last such line wins) and is
    dropped; lines starting with one of the quoted-header prefixes
    (``Da``/``Inviato``/``A``/``From``/``Sent``/``To``) are dropped too.

    Args:
        html: HTML markup to convert.

    Returns:
        A ``(subject, text)`` tuple; ``subject`` is ``None`` when no
        ``'Oggetto: '`` line was found.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for table in soup.find_all('table'):
        tr_tags = table.find_all('tr')
        if not tr_tags:
            continue

        # Stripped text of every cell, row by row.
        grid = [
            [cell.get_text().strip() for cell in tr.find_all(['th', 'td'])]
            for tr in tr_tags
        ]

        # Widest cell per column; rows may have different cell counts.
        widths = [0] * max(len(fila) for fila in grid)
        for fila in grid:
            for col, texto in enumerate(fila):
                if len(texto) > widths[col]:
                    widths[col] = len(texto)

        rendered = [
            '| '
            + ' | '.join(texto.ljust(widths[col]) for col, texto in enumerate(fila))
            + ' |'
            for fila in grid
        ]
        # The first row acts as the header, so the dashed rule goes right after it.
        rendered.insert(1, '|' + '|'.join('-' * (w + 2) for w in widths) + '|')

        table.replace_with(soup.new_string('\n' + '\n'.join(rendered)))

    # Explicit line breaks become real newlines before text extraction.
    for salto in soup.find_all('br'):
        salto.replace_with('\n')

    subject = None
    kept = []
    for linea in soup.get_text().split('\n'):
        if linea.startswith('Oggetto: '):
            # Remember the subject and drop the line from the body.
            subject = linea[9:].strip()
        elif not linea.startswith(('Da: ', 'Inviato: ', 'A: ', 'From: ', 'Sent: ', 'To: ')):
            # Everything else is kept untouched, spacing included.
            kept.append(linea)

    return subject, '\n'.join(kept)
|
|
|
|
def _decodificar_texto(parte):
    """Decode the payload of a text MIME part to ``str``.

    Uses the charset declared by the part (UTF-8 when none is declared) and
    ignores undecodable bytes. If the declared charset is unknown to Python,
    falls back to UTF-8 instead of raising ``LookupError``.
    """
    datos = parte.get_payload(decode=True)
    if datos is None:
        return ''
    charset = parte.get_content_charset() or 'utf-8'
    try:
        return datos.decode(charset, errors='ignore')
    except LookupError:
        # Unknown/misspelled charset label in the message headers.
        return datos.decode('utf-8', errors='ignore')


def procesar_eml(ruta_archivo, dir_adjuntos):
    """Parse an ``.eml`` file into a single-element list of ``MensajeEmail``.

    For multipart messages every part is visited: parts whose content
    disposition is ``attachment`` are handed to ``guardar_adjunto`` (and
    their file name recorded when a path is returned), ``text/plain`` parts
    are appended to the body as-is, and ``text/html`` parts are converted
    with ``_html_a_markdown``. The subject is the first one reported by
    ``_html_a_markdown``; it stays ``None`` if no HTML part yields one.

    Args:
        ruta_archivo: Path of the ``.eml`` file to read.
        dir_adjuntos: Directory passed through to ``guardar_adjunto``.

    Returns:
        A list containing one ``MensajeEmail``.
    """
    with open(ruta_archivo, 'rb') as eml:
        mensaje = BytesParser(policy=policy.default).parse(eml)

    remitente = mensaje.get('from', '')
    fecha = _parsear_fecha(mensaje.get('date', ''))

    subject = None
    adjuntos = []

    if mensaje.is_multipart():
        fragmentos = []
        for parte in mensaje.walk():
            # Check the disposition first: a text or HTML file sent as an
            # attachment must be saved, not merged into the message body.
            if parte.get_content_disposition() == 'attachment':
                ruta_adjunto = guardar_adjunto(parte, dir_adjuntos)
                if ruta_adjunto:
                    adjuntos.append(Path(ruta_adjunto).name)
                continue

            tipo = parte.get_content_type()
            if tipo == "text/plain":
                fragmentos.append(_decodificar_texto(parte))
            elif tipo == "text/html":
                part_subject, text = _html_a_markdown(_decodificar_texto(parte))
                if part_subject and not subject:
                    subject = part_subject
                fragmentos.append(text)
        contenido = ''.join(fragmentos)
    elif mensaje.get_content_type() == "text/html":
        subject, contenido = _html_a_markdown(_decodificar_texto(mensaje))
    else:
        contenido = _decodificar_texto(mensaje)

    return [MensajeEmail(remitente=remitente, fecha=fecha, contenido=contenido,
                         subject=subject, adjuntos=adjuntos)]
|
|
|
|
# Italian month names as they appear in localized date strings.
_MESES_IT = {
    'gennaio': 1, 'febbraio': 2, 'marzo': 3, 'aprile': 4,
    'maggio': 5, 'giugno': 6, 'luglio': 7, 'agosto': 8,
    'settembre': 9, 'ottobre': 10, 'novembre': 11, 'dicembre': 12,
}

# "<day> <month name> <year> <hour>:<minute>", whatever weekday text precedes it.
_FECHA_IT_RE = re.compile(r'\b(\d{1,2}) (\w+) (\d{4}) (\d{1,2}):(\d{2})')


def _parsear_fecha(fecha_str):
    """Parse an e-mail date string into a naive ``datetime``.

    Tries, in order:

    1. ``email.utils.parsedate_to_datetime`` (RFC 2822 style dates); the
       timezone is dropped without converting the wall-clock time.
    2. An Italian long date such as ``"venerd=EC 5 aprile 2024 10:30"``.
       Any weekday prefix is accepted, not only Friday. An unrecognized
       month name is treated as January.

    Args:
        fecha_str: Raw value of the ``Date`` header (may be empty).

    Returns:
        The parsed ``datetime`` without tzinfo, or ``datetime.now()`` when
        the value cannot be parsed by either strategy.
    """
    if isinstance(fecha_str, str):
        try:
            return parsedate_to_datetime(fecha_str).replace(tzinfo=None)
        except (TypeError, ValueError, IndexError, OverflowError):
            # Not an RFC 2822 date; try the localized format below.
            pass

        coincidencia = _FECHA_IT_RE.search(fecha_str)
        if coincidencia:
            dia, mes, anio, hora, minuto = coincidencia.groups()
            mes_num = _MESES_IT.get(mes.lower(), 1)
            try:
                return datetime(int(anio), mes_num, int(dia), int(hora), int(minuto))
            except ValueError:
                # Out-of-range component (e.g. day 31 in a 30-day month).
                pass

    # Best-effort fallback kept from the original behaviour.
    return datetime.now()