103 lines
3.2 KiB
Python
103 lines
3.2 KiB
Python
"""
Script that imports HTML files and converts them into a Markdown file.
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import json
|
|
from utils.html_parser import procesar_html
|
|
from utils.markdown_handler import escribir_archivo_markdown
|
|
|
|
# Force UTF-8 on standard output (the script prints non-ASCII status markers).
# sys.stdout may have been replaced by a stream that has no ``reconfigure``
# method (e.g. io.StringIO), so only reconfigure when the method is available
# instead of failing at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
|
|
|
|
def _find_html_files(input_path):
    """Return the ``*.html`` files followed by the ``*.htm`` files in *input_path*.

    Only the directory itself is searched (no recursion). Each group is sorted
    so the resulting page order is reproducible across runs and file systems.
    """
    html_files = []
    for pattern in ("*.html", "*.htm"):
        html_files.extend(sorted(input_path.glob(pattern)))
    return html_files


def main():
    """Convert the HTML files of the working directory into one Markdown file.

    Settings are read from the ``SCRIPT_CONFIGS`` environment variable, which
    is parsed as a JSON object (an unset variable means "all defaults"):

    - ``working_directory``: directory searched for HTML files (default ``"."``).
    - ``level2.output_file``: Markdown file name (default ``"contenido.md"``).
    - ``level2.attachments_dir``: attachments sub-directory (default ``"adjuntos"``).
    - ``level3.output_directory``: where both are placed (default ``"."``).

    Each file is handed to ``procesar_html``; the pages it returns are passed
    to ``escribir_archivo_markdown``. A page whose ``contenido`` starts with
    ``"Error al procesar:"`` is counted as failed but is still written, and a
    file for which no page is returned is counted as failed as well.

    Progress and errors are reported on standard output; nothing is returned.
    Output directories are only created once there is at least one HTML file
    to convert.

    Raises:
        json.JSONDecodeError: if ``SCRIPT_CONFIGS`` is not valid JSON.
    """
    # Load the settings passed in through the environment.
    configs = json.loads(os.environ.get("SCRIPT_CONFIGS", "{}"))

    working_directory = configs.get("working_directory", ".")

    # Level 2 (group) settings.
    group_config = configs.get("level2", {})
    output_file = group_config.get("output_file", "contenido.md")
    attachments_dir = group_config.get("attachments_dir", "adjuntos")

    # Level 3 settings: the output directory.
    work_config = configs.get("level3", {})
    output_directory = work_config.get("output_directory", ".")

    # Build the paths. They stay relative when the configured values are
    # relative. The working directory doubles as the input directory.
    input_dir = working_directory
    output_path = os.path.join(output_directory, output_file)
    attachments_path = os.path.join(output_directory, attachments_dir)

    # Debug prints
    print(f"Working directory: {working_directory}")
    print(f"Input directory: {input_dir}")
    print(f"Output directory: {output_directory}")
    print(f"Output file: {output_path}")
    print(f"Attachments directory: {attachments_path}")

    # Validate the input before touching the file system, so a bad
    # configuration does not leave empty output directories behind.
    input_path = Path(input_dir)
    if not input_path.is_dir():
        print(
            f"Error: Input directory {input_path} does not exist "
            "or is not a directory"
        )
        return

    html_files = _find_html_files(input_path)
    print(f"Found {len(html_files)} HTML files")

    if not html_files:
        print("No HTML files found in the input directory.")
        return

    # Make sure the attachments directory and the directory that will hold
    # the Markdown file exist (output_file may itself contain a sub-path).
    os.makedirs(attachments_path, exist_ok=True)
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    # Process the HTML files.
    paginas = []
    total_files = len(html_files)
    successful_files = 0
    failed_files = 0

    for i, archivo in enumerate(html_files, 1):
        print(f"\nProcessing [{i}/{total_files}] {archivo}")
        pagina = procesar_html(archivo, attachments_path)

        if not pagina:
            # No page came back: count it so the summary adds up to the total.
            failed_files += 1
            print(f"❌ Failed: no page returned for {archivo}")
            continue

        paginas.append(pagina)
        # The parser reports a failure through the page content itself.
        if pagina.contenido.startswith("Error al procesar:"):
            failed_files += 1
            print(f"❌ Failed: {pagina.contenido}")
        else:
            successful_files += 1
            print(f"✅ Success: {pagina.titulo}")

    print("\nSummary:")
    print(f"- Total files: {total_files}")
    print(f"- Successfully processed: {successful_files}")
    print(f"- Failed: {failed_files}")

    if not paginas:
        print("No pages to write.")
        return

    # Write the Markdown file.
    print(f"\nWriting {len(paginas)} pages to {output_path}")
    if escribir_archivo_markdown(paginas, output_path):
        print("Markdown file created successfully.")
    else:
        print("Error creating Markdown file.")
|
|
|
|
|
|
# Run the conversion only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|