Attempt to create a RagEx

Miguel 2025-04-04 14:29:40 +02:00
parent 9d090d2db7
commit 6070938bcc
25 changed files with 841 additions and 167 deletions

6
app.py
View File

@@ -221,5 +221,11 @@ def handle_group_description(group):
return jsonify({"status": "error", "message": str(e)}), 500
@app.route("/api/directory-history/<group>")
def get_directory_history(group):
history = config_manager.get_directory_history(group)
return jsonify(history)
if __name__ == "__main__":
app.run(debug=True)
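The new /api/directory-history/<group> route simply serializes whatever get_directory_history returns. A minimal sketch of calling it from Python, assuming the Flask dev server started by app.run(debug=True) is listening on its default http://127.0.0.1:5000 and that "example" stands in for a real group name (both are assumptions, not shown in this commit):

import requests

# "example" is a hypothetical group name; the endpoint returns a JSON list of paths.
resp = requests.get("http://127.0.0.1:5000/api/directory-history/example")
resp.raise_for_status()
for path in resp.json():
    print(path)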

View File

@@ -0,0 +1,4 @@
{
"type": "object",
"properties": {}
}

View File

@@ -0,0 +1,4 @@
{
"type": "object",
"properties": {}
}

View File

@@ -0,0 +1,6 @@
{
"path": "C:\\Users\\migue\\OneDrive\\Miguel\\Obsidean\\Trabajo\\VM\\30 - 9.3941- Kosme - Portogallo (Modifica + Linea)\\Emails",
"history": [
"C:\\Users\\migue\\OneDrive\\Miguel\\Obsidean\\Trabajo\\VM\\30 - 9.3941- Kosme - Portogallo (Modifica + Linea)\\Emails"
]
}

View File

@@ -0,0 +1,303 @@
"""
Script to merge the changes generated by an LLM into a C# source code file.
"""
import os
import sys
import json
import re
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
import difflib
# Force UTF-8 on standard output
sys.stdout.reconfigure(encoding="utf-8")
@dataclass
class CodeSection:
type: str
name: str
content: str
start_line: int
end_line: int
parent: Optional['CodeSection'] = None
children: List['CodeSection'] = None
attributes: List[str] = None
original_indent: str = ""
def __post_init__(self):
if self.children is None:
self.children = []
if self.attributes is None:
self.attributes = []
class CSharpParser:
def __init__(self):
# Define the specific ordering of sections
self.section_order = [
'using',
'comment',
'attribute',
'namespace',
'class',
'interface',
'region',
'field',
'property',
'method'
]
self.patterns = {
'using': r'^\s*using\s+([^;]+);',
'namespace': r'^\s*namespace\s+([^\s{]+)',
'class': r'^\s*(?:public|private|internal|protected)?\s*(?:partial\s+)?(?:abstract\s+)?class\s+(\w+)',
'interface': r'^\s*(?:public|private|internal|protected)?\s*interface\s+(\w+)',
'method': r'^\s*(?:public|private|internal|protected)?\s*(?:virtual|override|static|async)?\s*[\w<>]+\s+(\w+)\s*\(',
'property': r'^\s*(?:\[.+\]\s*)*(?:public|private|internal|protected)?\s*[\w<>]+\s+(\w+)\s*(?:{\s*get;|=>)',
'field': r'^\s*(?:public|private|internal|protected)?\s*(?:readonly|static|const)?\s*[\w<>]+\s+(\w+)\s*(?:=|;)',
'attribute': r'^\s*\[([^\]]+)\]',
'comment': r'^\s*(?://.*|/\*.*?\*/)',
'region': r'^\s*#region\s+(.+)$'
}
self.placeholder_pattern = r'//\s*\.\.\.\s*resto del código\s*\.\.\.'
def get_section_order_index(self, section_type: str) -> int:
try:
return self.section_order.index(section_type)
except ValueError:
return len(self.section_order)
def get_indent(self, line: str) -> str:
match = re.match(r'^(\s*)', line)
return match.group(1) if match else ""
def parse_file(self, content: str) -> CodeSection:
lines = content.split('\n')
root = CodeSection('root', '', '', 0, len(lines))
current_context = [root]
current_attributes = []
i = 0
while i < len(lines):
line = lines[i]
stripped = line.strip()
original_indent = self.get_indent(line)
# Skip empty lines
if not stripped:
i += 1
continue
# Process each section type according to its defined order
matched = False
for section_type in self.section_order:
if section_type not in self.patterns:
continue
pattern = self.patterns[section_type]
match = re.match(pattern, line)
if match:
name = match.group(1)
if section_type in ['namespace', 'class', 'interface', 'region']:
# Process block sections
section_lines, j = self._process_block(lines, i)
section = CodeSection(
section_type,
name,
'\n'.join(section_lines),
i,
j,
parent=current_context[-1],
attributes=current_attributes.copy(),
original_indent=original_indent
)
current_context[-1].children.append(section)
if section_type in ['namespace', 'class', 'interface']:
current_context.append(section)
i = j + 1
else:
# Process simple, single-line sections
section = CodeSection(
section_type,
name,
line,
i,
i,
parent=current_context[-1],
attributes=current_attributes.copy(),
original_indent=original_indent
)
current_context[-1].children.append(section)
i += 1
current_attributes = []
matched = True
break
if not matched:
i += 1
if stripped == '}' and len(current_context) > 1:
current_context.pop()
return root
def _process_block(self, lines: List[str], start_index: int) -> Tuple[List[str], int]:
brace_count = 0
section_lines = []
j = start_index
while j < len(lines):
current_line = lines[j]
section_lines.append(current_line)
brace_count += current_line.count('{') - current_line.count('}')
if brace_count == 0 and len(section_lines) > 1:
break
j += 1
return section_lines, j
class CSharpCodeMerger:
def __init__(self, original_code: str, llm_code: str):
self.parser = CSharpParser()
self.original_tree = self.parser.parse_file(original_code)
self.llm_tree = self.parser.parse_file(llm_code)
self.original_code = original_code
self.llm_code = llm_code
def _sort_sections(self, sections: List[CodeSection]) -> List[CodeSection]:
return sorted(sections, key=lambda x: (
self.parser.get_section_order_index(x.type),
x.start_line
))
def _merge_sections(self, original: CodeSection, llm: CodeSection) -> CodeSection:
merged = CodeSection(
original.type,
original.name,
original.content,
original.start_line,
original.end_line,
original.parent,
original_indent=original.original_indent
)
# Build child maps keyed by (type, name)
original_children = {(c.type, c.name): c for c in original.children}
llm_children = {(c.type, c.name): c for c in llm.children}
merged_children = []
# Process the original children
for key, orig_child in original_children.items():
if key in llm_children:
llm_child = llm_children[key]
if orig_child.type in ['namespace', 'class', 'interface', 'region']:
merged_children.append(self._merge_sections(orig_child, llm_child))
else:
merged_children.append(llm_child if orig_child.content != llm_child.content else orig_child)
else:
merged_children.append(orig_child)
# Add new children coming from the LLM
for key, llm_child in llm_children.items():
if key not in original_children:
merged_children.append(llm_child)
# Sort the children according to the defined order
merged.children = self._sort_sections(merged_children)
return merged
def _generate_code(self, section: CodeSection, indent_level: int = 0) -> str:
lines = []
base_indent = section.original_indent or " " * indent_level
# Add attributes
for attr in section.attributes:
lines.append(base_indent + attr.lstrip())
if section.type != 'root':
content_lines = section.content.split('\n')
lines.append(base_indent + content_lines[0].lstrip())
if len(content_lines) > 1:
for line in content_lines[1:]:
if line.strip():
current_indent = re.match(r'^(\s*)', line).group(1)
content = line.lstrip()
lines.append(base_indent + current_indent + content)
if section.children:
sorted_children = self._sort_sections(section.children)
for child in sorted_children:
child_code = self._generate_code(child, indent_level + 1 if section.type != 'root' else 0)
if child_code:
lines.append(child_code)
return '\n'.join(lines)
def merge_code(self) -> str:
merged_tree = self._merge_sections(self.original_tree, self.llm_tree)
return self._generate_code(merged_tree)
def generate_diff(self) -> str:
merged = self.merge_code()
diff = difflib.unified_diff(
self.original_code.splitlines(keepends=True),
merged.splitlines(keepends=True),
fromfile='original',
tofile='merged'
)
return ''.join(diff)
def main():
configs = json.loads(os.environ.get("SCRIPT_CONFIGS", "{}"))
working_directory = configs.get("working_directory", ".")
work_config = configs.get("level3", {})
input_file = work_config.get("input_file", "original.cs")
llm_file = work_config.get("llm_file", "llm_generated.cs")
output_directory = work_config.get("output_directory", ".")
input_path = os.path.join(working_directory, input_file)
llm_path = os.path.join(working_directory, llm_file)
output_merged = os.path.join(output_directory, "merged.cs")
output_diff = os.path.join(output_directory, "changes.diff")
for path in [input_path, llm_path]:
if not os.path.exists(path):
print(f"Error: File {path} does not exist")
return
os.makedirs(output_directory, exist_ok=True)
try:
with open(input_path, "r", encoding="utf-8") as f:
original_code = f.read()
with open(llm_path, "r", encoding="utf-8") as f:
llm_code = f.read()
merger = CSharpCodeMerger(original_code, llm_code)
merged_code = merger.merge_code()
with open(output_merged, "w", encoding="utf-8") as f:
f.write(merged_code)
with open(output_diff, "w", encoding="utf-8") as f:
f.write(merger.generate_diff())
print("Successfully processed files:")
print(f"- Merged code saved to: {output_merged}")
print(f"- Diff file saved to: {output_diff}")
except Exception as e:
print(f"Error processing files: {str(e)}")
return
if __name__ == "__main__":
main()
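A quick way to exercise the merger without the SCRIPT_CONFIGS plumbing is to feed CSharpCodeMerger two strings directly. A minimal smoke test, assuming the script is importable as a module named llm_merge (the file name is not visible in this view, so the module name is hypothetical):

# Smoke test on top-level using directives; "llm_merge" is a hypothetical module name.
from llm_merge import CSharpCodeMerger

original = "using System;\n"
llm_code = "using System;\nusing System.Text;\n"

merger = CSharpCodeMerger(original, llm_code)
print(merger.merge_code())     # union of the sections, in the defined section order
print(merger.generate_diff())  # unified diff from 'original' to 'merged'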

View File

@@ -2,5 +2,5 @@
"name": "Desempaquetado de Emails EML",
"description": "This script processes email files (.eml) into a chronological narrative in Markdown format, optimized for processing with Large Language Models (LLMs). It extracts essential information from emails while removing unnecessary metadata, creating a clean, temporal narrative that can be easily analyzed. ",
"version": "1.0",
"author": "Unknown"
"author": "Miguel"
}

View File

@@ -82,35 +82,101 @@ def _html_a_markdown(html):
if not rows:
continue
markdown_table = []
max_widths = []
# Matrix to store the processed table
table_matrix = []
max_cols = 0
# Compute the maximum widths
for row in rows:
# First pass: build the matrix and resolve rowspans/colspans
row_idx = 0
while row_idx < len(rows):
row = rows[row_idx]
cells = row.find_all(['th', 'td'])
while len(max_widths) < len(cells):
max_widths.append(0)
for i, cell in enumerate(cells):
if not cells:
row_idx += 1
continue
# Grow the matrix as needed
while len(table_matrix) <= row_idx:
table_matrix.append([])
col_idx = 0
for cell in cells:
# Find the next available column
while col_idx < len(table_matrix[row_idx]) and table_matrix[row_idx][col_idx] is not None:
col_idx += 1
# Read rowspan and colspan
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
# Process the cell text, replacing line breaks with <br>
cell_text = cell.get_text().strip()
max_widths[i] = max(max_widths[i], len(cell_text))
cell_text = cell_text.replace('\n', '<br>')
cell_text = re.sub(r'\s*<br>\s*<br>\s*', '<br>', cell_text) # Collapse repeated <br>
cell_text = cell_text.strip()
# Fill the matrix with the text, using empty strings for the merged cells
for r in range(rowspan):
current_row = row_idx + r
# Grow the matrix as needed
while len(table_matrix) <= current_row:
table_matrix.append([])
# Grow the row as needed
while len(table_matrix[current_row]) <= col_idx + colspan - 1:
table_matrix[current_row].append(None)
for c in range(colspan):
if r == 0 and c == 0:
table_matrix[current_row][col_idx + c] = cell_text
else:
table_matrix[current_row][col_idx + c] = ''
col_idx += colspan
max_cols = max(max_cols, col_idx)
row_idx += 1
# Build the Markdown table
if max_widths: # Only if we have valid cells
header_row = rows[0].find_all(['th', 'td'])
header = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i])
for i, cell in enumerate(header_row)) + ' |'
separator = '|' + '|'.join('-' * (width + 2) for width in max_widths) + '|'
# Make sure every row has the same number of columns
for row in table_matrix:
while len(row) < max_cols:
row.append('')
# Compute the maximum width of each column
col_widths = [0] * max_cols
for row in table_matrix:
for col_idx, cell in enumerate(row):
if cell is not None:
col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
# Generate the Markdown table
markdown_table = []
# Header
if table_matrix:
header = '|'
for col_idx, width in enumerate(col_widths):
cell = str(table_matrix[0][col_idx] or '')
header += f' {cell.ljust(width)} |'
markdown_table.append(header)
# Separator
separator = '|'
for width in col_widths:
separator += '-' * (width + 2) + '|'
markdown_table.append(separator)
for row in rows[1:]:
cells = row.find_all(['td', 'th'])
row_text = '| ' + ' | '.join(cell.get_text().strip().ljust(max_widths[i])
for i, cell in enumerate(cells)) + ' |'
# Body rows
for row_idx in range(1, len(table_matrix)):
row_text = '|'
for col_idx, width in enumerate(col_widths):
cell = str(table_matrix[row_idx][col_idx] or '')
row_text += f' {cell.ljust(width)} |'
markdown_table.append(row_text)
table.replace_with(soup.new_string('\n' + '\n'.join(markdown_table)))
# Replace the HTML table with the Markdown version
if markdown_table:
table.replace_with(soup.new_string('\n' + '\n'.join(markdown_table) + '\n'))
except Exception as e:
print(f"Error procesando tabla: {str(e)}")
continue
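The key move above is the first pass: the HTML table is expanded into a rectangular matrix in which a cell with rowspan/colspan claims every position it covers, so column widths can later be measured position by position. A standalone sketch of that expansion, using the same BeautifulSoup calls as the script:

from bs4 import BeautifulSoup

# Expand rowspan/colspan into a rectangular matrix; merged cells become "".
html = '<table><tr><th rowspan="2">A</th><th>B</th></tr><tr><td>C</td></tr></table>'
rows = BeautifulSoup(html, "html.parser").find_all("tr")
matrix = []
for r, row in enumerate(rows):
    while len(matrix) <= r:
        matrix.append([])
    col = 0
    for cell in row.find_all(["th", "td"]):
        while col < len(matrix[r]) and matrix[r][col] is not None:
            col += 1  # skip positions already claimed by an earlier rowspan
        rowspan = int(cell.get("rowspan", 1))
        colspan = int(cell.get("colspan", 1))
        for dr in range(rowspan):
            while len(matrix) <= r + dr:
                matrix.append([])
            while len(matrix[r + dr]) <= col + colspan - 1:
                matrix[r + dr].append(None)
            for dc in range(colspan):
                text = cell.get_text().strip() if dr == 0 and dc == 0 else ""
                matrix[r + dr][col + dc] = text
        col += colspan

print(matrix)  # [['A', 'B'], ['', 'C']]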

View File

@@ -1,3 +1,11 @@
{
"path": "C:/Trabajo/VM/40 - 93040 - HENKEL - NEXT2 Problem/Reporte/Emails"
"path": "C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\EmailTody",
"history": [
"C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\EmailTody",
"C:\\Trabajo\\VM\\30 - 9.3941- Kosme - Portogallo (Modifica + Linea)\\Reporte\\Emails",
"C:\\Users\\migue\\OneDrive\\Miguel\\Obsidean\\Trabajo\\VM\\30 - 9.3941- Kosme - Portogallo (Modifica + Linea)\\Emails",
"C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\Emails\\Trial",
"C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\Emails",
"C:\\Trabajo\\VM\\40 - 93040 - HENKEL - NEXT2 Problem\\Reporte\\Emails\\Error de tablas"
]
}

View File

@@ -1,3 +1,6 @@
{
"path": "C:/Estudio"
"path": "C:\\Estudio",
"history": [
"C:\\Estudio"
]
}

View File

@@ -0,0 +1,3 @@
{
}

View File

@@ -0,0 +1,6 @@
{
"name": "RAGEX",
"description": "This script processes text files into a chronological narrative in Markdown format, optimized for processing with Large Language Models (LLMs). It extracts essential information from text files while removing unnecessary metadata, creating a clean, temporal narrative that can be easily analyzed.",
"version": "1.0",
"author": "Miguel"
}

View File

@@ -0,0 +1,3 @@
{
}

View File

@@ -0,0 +1,21 @@
{
"type": "object",
"properties": {
"in_dir": {
"type": "string",
"format": "directory",
"title": "Subdirectorio desde donde hacer la ingesta de los datos",
"description": "Subdirectorio desde donde hacer la ingesta de los datos"
},
"model": {
"type": "string",
"title": "Model",
"description": "OpenAI Model"
},
"query": {
"type": "string",
"title": "Consulta",
"description": "Consulta"
}
}
}
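An instance of this schema, as it would be stored in the group's level 3 data file, might look like the following (all three values are illustrative, not taken from this commit):

{
"in_dir": "TEST",
"model": "gpt-3.5-turbo",
"query": "What do the ingested documents say about the OPC-UA interface?"
}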

View File

@@ -0,0 +1,3 @@
# Set your OpenAI API key
def openai_api_key():
return 'sk-HIY5DSK03Lr'

View File

@@ -0,0 +1,6 @@
{
"path": "D:\\Proyectos\\Scripts\\RAG\\TEST",
"history": [
"D:\\Proyectos\\Scripts\\RAG\\TEST"
]
}

View File

@@ -0,0 +1,112 @@
"""
This script ingests the data stored in the ingestion subdirectory.
"""
import os
import sys
from pathlib import Path
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings # Switched to OpenAI embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
import glob
def load_documents(directory):
documents = []
# Load markdown files
for md_file in glob.glob(os.path.join(directory, "**/*.md"), recursive=True):
with open(md_file, "r", encoding="utf-8") as f:
content = f.read()
documents.append(
{
"content": content,
"metadata": {"source": md_file, "type": "markdown"},
}
)
# Load plain-text files
for txt_file in glob.glob(os.path.join(directory, "**/*.txt"), recursive=True):
with open(txt_file, "r", encoding="utf-8") as f:
content = f.read()
documents.append(
{"content": content, "metadata": {"source": txt_file, "type": "text"}}
)
return documents
def process_documents(documents, db_directory):
# Use OpenAI embeddings instead of HuggingFace
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
separators=["\n## ", "\n### ", "\n#### ", "\n", " ", ""],
keep_separator=True,
)
docs = []
for doc in documents:
chunks = text_splitter.split_text(doc["content"])
for i, chunk in enumerate(chunks):
docs.append(
Document(
page_content=chunk,
metadata={
**doc["metadata"],
"chunk_id": i,
"chunk": chunk[:100] + "...", # Excerpt for reference
},
)
)
# Configure Chroma to avoid the ONNX dependency
from chromadb.config import Settings
# Create or update the vector database with explicit settings
db = Chroma.from_documents(
docs,
embeddings,
persist_directory=db_directory,
client_settings=Settings(anonymized_telemetry=False, is_persistent=True),
)
db.persist()
print(f"Procesados {len(docs)} fragmentos de {len(documents)} documentos")
return db
def main():
# Load configuration from the environment
configs = json.loads(os.environ.get("SCRIPT_CONFIGS", "{}"))
# Get the working directory
working_directory = configs.get("working_directory", ".")
# Get level 2 (group) and level 3 settings
group_config = configs.get("level2", {})
work_config = configs.get("level3", {})
in_dir = work_config.get("in_dir", ".")
docs_directory = os.path.join(working_directory, in_dir)
db_directory = os.path.join(working_directory, "chroma_db")
print("Cargando documentos...")
documents = load_documents(docs_directory)
print(f"Se encontraron {len(documents)} documentos.")
print("Procesando e indexando documentos...")
db = process_documents(documents, db_directory)
print("¡Ingesta completada con éxito!")
if __name__ == "__main__":
main()
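The script takes all of its parameters from the SCRIPT_CONFIGS environment variable that the frontend injects. A hedged sketch of driving it by hand, assuming the ingestion script is saved as x1.py (the query script's comment refers to it by that name) and reusing the test directory from the work_dir.json above; the "docs" subdirectory is hypothetical:

import json, os, subprocess

os.environ["SCRIPT_CONFIGS"] = json.dumps({
    "working_directory": r"D:\Proyectos\Scripts\RAG\TEST",  # illustrative path
    "level3": {"in_dir": "docs"},                           # hypothetical subdirectory
})
# OpenAIEmbeddings also expects OPENAI_API_KEY to be present in the environment.
subprocess.run(["python", "x1.py"], check=True)  # writes the index to ./chroma_db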

View File

@@ -0,0 +1,126 @@
"""
This script runs a RAGEX query against the document database.
"""
import os
import sys
from pathlib import Path
import json
from langchain_community.vectorstores import Chroma
from langchain_openai import (
OpenAIEmbeddings,
) # Switched from HuggingFaceEmbeddings to OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from rich.console import Console
from rich.markdown import Markdown
import argparse
from openai_api_key import openai_api_key
console = Console()
class CitationTracker:
def __init__(self):
self.citations = []
def add_citation(self, text, metadata):
self.citations.append({"text": text, "metadata": metadata})
def get_formatted_citations(self):
result = "\n## Fuentes\n\n"
for i, citation in enumerate(self.citations, 1):
source = citation["metadata"]["source"]
result += f"{i}. [{os.path.basename(source)}]({source}) - Fragmento {citation['metadata']['chunk_id']}\n"
return result
def search_with_citation(query, db_directory, model="gpt-3.5-turbo"):
# Load the embeddings and the vector database
embeddings = OpenAIEmbeddings(
model="text-embedding-3-small"
) # Use OpenAI embeddings, the same as in x1.py
db = Chroma(persist_directory=db_directory, embedding_function=embeddings)
api_key = openai_api_key()
os.environ["OPENAI_API_KEY"] = api_key
# Configure the OpenAI LLM
llm = ChatOpenAI(model_name=model)
# Citation tracker
citation_tracker = CitationTracker()
# Retrieve relevant documents
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
# Prompt template
template = """
Responde a la siguiente pregunta basándote exclusivamente en la información proporcionada.
Incluye referencias a las fuentes originales para cada afirmación importante usando [Fuente N].
Si la información no es suficiente, indícalo claramente.
Contexto:
{context}
Pregunta: {question}
Respuesta (incluye [Fuente N] para citar):
"""
prompt = ChatPromptTemplate.from_template(template)
# Helper to format the retrieved context
def format_docs(docs):
formatted_context = ""
for i, doc in enumerate(docs, 1):
citation_tracker.add_citation(doc.page_content, doc.metadata)
formatted_context += f"[Fuente {i}]: {doc.page_content}\n\n"
return formatted_context
# RAG chain
rag_chain = (
{"context": retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
# Run the search
response = rag_chain.invoke(query)
# Append the citations at the end
full_response = response + "\n\n" + citation_tracker.get_formatted_citations()
return full_response
def main():
# Load configuration from the environment
configs = json.loads(os.environ.get("SCRIPT_CONFIGS", "{}"))
# Get the working directory
working_directory = configs.get("working_directory", ".")
# Get level 2 (group) and level 3 settings
group_config = configs.get("level2", {})
work_config = configs.get("level3", {})
in_dir = work_config.get("in_dir", ".")
docs_directory = os.path.join(working_directory, in_dir)
model = work_config.get("model", "gpt-3.5-turbo")
query = work_config.get("query", "")
db_directory = os.path.join(working_directory, "chroma_db")
result = search_with_citation(query, db_directory, model)
console.print(Markdown(result))
if __name__ == "__main__":
main()
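search_with_citation can also be called directly once an index exists under chroma_db. A hedged sketch; "x2" is a hypothetical module name for this query script, and the db path reuses the test directory from this commit:

# "x2" is a hypothetical module name for the query script above.
from x2 import search_with_citation

result = search_with_citation(
    query="What do the documents say about batch data?",   # illustrative query
    db_directory=r"D:\Proyectos\Scripts\RAG\TEST\chroma_db",
    model="gpt-3.5-turbo",
)
print(result)  # answer with [Fuente N] markers plus the "## Fuentes" list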

View File

@@ -454,6 +454,9 @@ class ConfigurationManager:
with open(work_dir_path, "r") as f:
data = json.load(f)
path = data.get("path", "")
# Normalize path separators
if path:
path = os.path.normpath(path)
# Update the instance variable if there is a valid path
if path and os.path.exists(path):
self.working_directory = path
@@ -462,16 +465,45 @@ class ConfigurationManager:
return ""
def set_work_dir(self, group: str, path: str) -> Dict[str, str]:
"""Set working directory path for a script group."""
"""Set working directory path for a script group and update history."""
# Normalize the incoming path
path = os.path.normpath(path)
if not os.path.exists(path):
return {"status": "error", "message": "Directory does not exist"}
work_dir_path = os.path.join(self.script_groups_path, group, "work_dir.json")
try:
# Save the path to work_dir.json
# Load existing data or create a new structure
try:
with open(work_dir_path, "r") as f:
data = json.load(f)
# Normalize the paths already stored in the history
if "history" in data:
data["history"] = [os.path.normpath(p) for p in data["history"]]
except (FileNotFoundError, json.JSONDecodeError):
data = {"path": "", "history": []}
# Update the current path
data["path"] = path
# Update the history
if "history" not in data:
data["history"] = []
# Remove the path from the history if it is already there (compare normalized paths)
data["history"] = [p for p in data["history"] if os.path.normpath(p) != path]
# Insert the path at the front of the history
data["history"].insert(0, path)
# Keep only the 10 most recent directories
data["history"] = data["history"][:10]
# Save the updated data
with open(work_dir_path, "w") as f:
json.dump({"path": path}, f, indent=2)
json.dump(data, f, indent=2)
# Update the instance variable
self.working_directory = path
@@ -485,3 +517,16 @@ class ConfigurationManager:
return {"status": "success", "path": path}
except Exception as e:
return {"status": "error", "message": str(e)}
def get_directory_history(self, group: str) -> List[str]:
"""Get the directory history for a script group."""
work_dir_path = os.path.join(self.script_groups_path, group, "work_dir.json")
try:
with open(work_dir_path, "r") as f:
data = json.load(f)
# Normalize every path in the history
history = [os.path.normpath(p) for p in data.get("history", [])]
# Keep only directories that still exist
return [p for p in history if os.path.exists(p)]
except (FileNotFoundError, json.JSONDecodeError):
return []
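Taken together, set_work_dir and get_directory_history maintain a small most-recently-used list: normalize the path, drop any previous occurrence, insert it at the front, and cap the list at 10 entries. The same policy in isolation, as a minimal sketch:

import os

def push_recent(history, path, limit=10):
    # MRU update: normalize, deduplicate, prepend, truncate.
    path = os.path.normpath(path)
    history = [p for p in history if os.path.normpath(p) != path]
    return [path] + history[:limit - 1]

dirs = []
for d in [r"C:\Estudio", r"C:\Trabajo", r"C:\Estudio"]:
    dirs = push_recent(dirs, d)
print(dirs)  # ['C:\\Estudio', 'C:\\Trabajo'] -- re-adding a path moves it to the front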

View File

@@ -1,116 +1,8 @@
[09:40:16] Iniciando ejecución de x1.py
[09:40:18] Working directory: C:/Trabajo/VM/40 - 93040 - HENKEL - NEXT2 Problem/Reporte/Emails
[09:40:18] Input directory: C:/Trabajo/VM/40 - 93040 - HENKEL - NEXT2 Problem/Reporte/Emails
[09:40:18] Output directory: C:/Users/migue/OneDrive/Miguel/Obsidean/Trabajo/VM/04-InLavoro/HENKEL/93040 - HENKEL - BowlingGreen/Description/HENKEL - ALPLA - AUTEFA - Batch Data
[09:40:18] Cronologia file: C:/Users/migue/OneDrive/Miguel/Obsidean/Trabajo/VM/04-InLavoro/HENKEL/93040 - HENKEL - BowlingGreen/Description/HENKEL - ALPLA - AUTEFA - Batch Data\cronologia.md
[09:40:18] Attachments directory: C:/Trabajo/VM/40 - 93040 - HENKEL - NEXT2 Problem/Reporte/Emails\adjuntos
[09:40:18] Beautify rules file: D:\Proyectos\Scripts\ParamManagerScripts\backend\script_groups\EmailCrono\config\beautify_rules.json
[09:40:18] Found 13 .eml files
[09:40:18] Loaded 0 existing messages
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\I_ 9.3040-3074 ALPLA BG open points.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\I_ 9.3061-TLO26-L42 automatic change over test.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\I_ Alpla BOW2 - Line emptying button 6168.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\I_ IDH_BTL.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\I_ OPC-UA interface Vetro - Bowling Green 2 9.3040-3074.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\I_ R_ I_ [EXT] R_ Vetro Conveyor 9.3674.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\NEXT2 - Data - ALPLA information verification.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\RE_ Automatic changeover trial.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\RE_ OPC-UA interface Vetro - Bowling Green 2 9.3040-3074.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\RE_ [EXT] RE_ Vetro_ALPLA information verification.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\Re_ _EXT_ Next + 1 - HENKEL - ALPLA - AUTEFA.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\RV_ RE_ [EXT] RE_ Vetro_ALPLA information verification.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Processing C:\Trabajo\VM\40 - 93040 - HENKEL - NEXT2 Problem\Reporte\Emails\R_ [EXT] RE_ Vetro_ALPLA information verification 9.3060-3067.eml
[09:40:18] Aplicando reglas de prioridad 1
[09:40:18] Aplicando reglas de prioridad 2
[09:40:18] Aplicando reglas de prioridad 3
[09:40:18] Aplicando reglas de prioridad 4
[09:40:18] Estadísticas de procesamiento:
[09:40:18] - Total mensajes encontrados: 38
[09:40:18] - Mensajes únicos añadidos: 22
[09:40:18] - Mensajes duplicados ignorados: 16
[09:40:18] Writing 22 messages to C:/Users/migue/OneDrive/Miguel/Obsidean/Trabajo/VM/04-InLavoro/HENKEL/93040 - HENKEL - BowlingGreen/Description/HENKEL - ALPLA - AUTEFA - Batch Data\cronologia.md
[09:40:18] Ejecución completada
[20:41:53] Iniciando ejecución de x1.py
[20:41:57] Cargando documentos...
[20:41:57] Se encontraron 1 documentos.
[20:41:57] Procesando e indexando documentos...
[20:44:57] Iniciando ejecución de x1.py
[20:45:01] Cargando documentos...
[20:45:01] Se encontraron 1 documentos.
[20:45:01] Procesando e indexando documentos...

View File

@@ -0,0 +1,3 @@
# Set your OpenAI API key
def openai_api_key():
return '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'

View File

@@ -6,11 +6,11 @@ from openai import OpenAI
from typing import Dict, List
import json
from .base import LLMService
from config.api_keys import APIKeyManager
from openai_api_key import openai_api_key
class OpenAIService(LLMService):
def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0.3):
api_key = APIKeyManager.get_openai_key()
api_key = openai_api_key()
if not api_key:
raise ValueError("OpenAI API key not found. Please set up your API keys.")
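This hunk swaps the key lookup from APIKeyManager to the new top-level openai_api_key helper added in this commit. A minimal stand-in for the pattern (not the real class body, which is only partially shown here):

from openai import OpenAI
from openai_api_key import openai_api_key

api_key = openai_api_key()
if not api_key:
    raise ValueError("OpenAI API key not found. Please set up your API keys.")
client = OpenAI(api_key=api_key)  # the service presumably wraps a client like this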

View File

@@ -535,6 +535,7 @@ async function setWorkingDirectory() {
await updateWorkingDirectory(path);
}
// Modify initWorkingDirectory to also load the history
async function initWorkingDirectory() {
if (!currentGroup) return;
@@ -543,6 +544,7 @@ async function initWorkingDirectory() {
if (result.status === 'success' && result.path) {
await updateWorkingDirectory(result.path);
}
await loadDirectoryHistory();
}
async function browseDirectory() {
@@ -565,28 +567,64 @@ async function browseDirectory() {
async function updateWorkingDirectory(path) {
console.log('Updating working directory:', { path, group: currentGroup }); // Debug line
const response = await fetch('/api/working-directory', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
path: path,
group: currentGroup
})
});
const result = await response.json();
console.log('Update result:', result); // Debug line
if (result.status === 'success') {
// Update the input
document.getElementById('working-directory').value = path;
try {
const response = await fetch('/api/working-directory', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
path: path,
group: currentGroup
})
});
// Reload the level 3 configuration
const configResponse = await fetch(`/api/config/3?group=${currentGroup}`);
const data = await configResponse.json();
await renderForm('level3-form', data);
} else {
alert('Error: ' + (result.message || 'No se pudo actualizar el directorio de trabajo'));
const result = await response.json();
console.log('Update result:', result); // Debug line
if (result.status === 'success') {
// Update the input and the directory list
document.getElementById('working-directory').value = path;
await loadDirectoryHistory();
// Reload the level 3 configuration
const configResponse = await fetch(`/api/config/3?group=${currentGroup}`);
const data = await configResponse.json();
await renderForm('level3-form', data);
} else {
alert('Error: ' + (result.message || 'No se pudo actualizar el directorio de trabajo'));
}
} catch (error) {
console.error('Error updating working directory:', error);
alert('Error actualizando el directorio de trabajo: ' + error.message);
}
}
async function loadDirectoryHistory() {
try {
const response = await fetch(`/api/directory-history/${currentGroup}`);
const history = await response.json();
const select = document.getElementById('directory-history');
select.innerHTML = '<option value="">-- Directorios recientes --</option>';
history.forEach(dir => {
const option = document.createElement('option');
option.value = dir;
option.textContent = dir;
// Mark it as selected if it is the current directory
if (dir === document.getElementById('working-directory').value) {
option.selected = true;
}
select.appendChild(option);
});
} catch (error) {
console.error('Error loading directory history:', error);
}
}
function loadHistoryDirectory(path) {
if (path) {
document.getElementById('working-directory').value = path;
updateWorkingDirectory(path); // Changed from setWorkingDirectory to updateWorkingDirectory
}
}
@@ -657,6 +695,16 @@ async function initializeApp() {
await initWorkingDirectory();
await loadConfigs();
// Show level3-content by default
const level3Content = document.getElementById('level3-content');
if (level3Content) {
level3Content.classList.remove('hidden');
const button = document.querySelector(`[onclick="toggleConfig('level3-content')"]`);
if (button) {
button.innerText = 'Ocultar Configuración';
}
}
} catch (error) {
console.error('Error during initialization:', error);
}

View File

@@ -109,6 +109,12 @@
Confirmar
</button>
</div>
<!-- Add directory history dropdown -->
<div class="mt-2">
<select id="directory-history" class="w-full p-2 border rounded text-gray-600" onchange="loadHistoryDirectory(this.value)">
<option value="">-- Directorios recientes --</option>
</select>
</div>
</div>
<!-- Level 3 Configuration -->
@@ -117,10 +123,10 @@
<h2 class="text-xl font-bold">Configuración del Directorio</h2>
<button class="bg-blue-500 text-white px-4 py-2 rounded"
onclick="toggleConfig('level3-content')">
Mostrar Configuración
Ocultar Configuración
</button>
</div>
<div id="level3-content" class="hidden">
<div id="level3-content">
<div id="level3-form"></div>
<div class="flex justify-end mt-4">
<button class="bg-blue-500 text-white px-4 py-2 rounded" onclick="modifySchema(3)">