"""
|
|
Tokenizador Universal con Auto-Descubrimiento de Reglas
|
|
Sistema de tokenización distribuida donde cada clase define sus propias reglas
|
|
"""
|
|
import re
|
|
from typing import List, Dict, Any, Callable, Tuple, Optional
|
|
|
|
# Importar desde el registro de tipos
|
|
try:
|
|
from type_registry import get_registered_base_context
|
|
TYPE_REGISTRY_AVAILABLE = True
|
|
except ImportError:
|
|
TYPE_REGISTRY_AVAILABLE = False
|
|
|
|
|
|
class UniversalTokenizer:
    """
    Tokenizer that auto-discovers tokenization rules from the registered classes.
    """

    def __init__(self):
        self.debug = False
        self.tokenization_rules: List[Dict[str, Any]] = []
        self._discover_tokenization_rules()

    def _discover_tokenization_rules(self):
        """Auto-discovers tokenization rules from every registered class."""
        if not TYPE_REGISTRY_AVAILABLE:
            if self.debug:
                print("⚠️ Type system not available")
            return

        try:
            # Fetch every registered class
            registered_classes = get_registered_base_context()

            self.tokenization_rules.clear()

            for class_name, class_obj in registered_classes.items():
                if hasattr(class_obj, 'get_tokenization_patterns'):
                    try:
                        patterns = class_obj.get_tokenization_patterns()
                        for pattern_info in patterns:
                            self.tokenization_rules.append({
                                'class_name': class_name,
                                'class_obj': class_obj,
                                **pattern_info
                            })

                            if self.debug:
                                print(f"📋 Tokenization rule: {class_name} - {pattern_info.get('description', 'No description')}")

                    except Exception as e:
                        if self.debug:
                            print(f"⚠️ Error getting patterns from {class_name}: {e}")

            # Sort by priority (lower number = higher priority)
            self.tokenization_rules.sort(key=lambda x: x.get('priority', 100))

            if self.debug:
                print(f"🔧 {len(self.tokenization_rules)} tokenization rules loaded")
                for rule in self.tokenization_rules:
                    print(f"   {rule.get('priority', 100):2d}: {rule['class_name']} - {rule.get('description', '')}")

        except Exception as e:
            if self.debug:
                print(f"⚠️ Error during auto-discovery: {e}")
            self.tokenization_rules.clear()

    def reload_tokenization_rules(self):
        """Reloads the tokenization rules from the registry."""
        if self.debug:
            print("🔄 Reloading tokenization rules...")
        self._discover_tokenization_rules()

    def preprocess_tokens(self, expression: str) -> str:
        """Applies every discovered tokenization rule to the expression."""
        if not self.tokenization_rules:
            return expression

        result = expression
        original = expression
        conversion_stats = {}

        for rule in self.tokenization_rules:
            try:
                pattern = rule['pattern']
                replacement = rule['replacement']
                rule_name = rule['class_name']

                # Count matches before applying the rule
                matches_before = len(re.findall(pattern, result))

                # Apply the transformation (re.sub accepts both string
                # replacements and callables)
                result = re.sub(pattern, replacement, result)

                # Estimate conversions from the matches that remain
                matches_after = len(re.findall(pattern, result))
                conversions = matches_before - matches_after

                if conversions > 0:
                    conversion_stats[rule_name] = conversion_stats.get(rule_name, 0) + conversions

            except Exception as e:
                if self.debug:
                    print(f"⚠️ Error applying rule {rule['class_name']}: {e}")
                continue

        if self.debug and result != original:
            print(f"🔧 Tokenization: '{original}' → '{result}'")
            if conversion_stats:
                for rule_name, count in conversion_stats.items():
                    print(f"   {rule_name}: {count} conversions")

        return result

    def get_tokenization_info(self) -> List[Dict[str, Any]]:
        """Returns information about the active tokenization rules."""
        return [
            {
                'class': rule['class_name'],
                'priority': rule.get('priority', 100),
                'description': rule.get('description', 'No description'),
                'pattern': rule['pattern']
            }
            for rule in self.tokenization_rules
        ]

    def test_tokenization(self, test_expressions: List[str]) -> Dict[str, str]:
        """Runs the tokenizer over a list of expressions."""
        results = {}
        for expr in test_expressions:
            results[expr] = self.preprocess_tokens(expr)
        return results


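# A minimal sketch (an assumption, not part of the real registry) of what a
# participating class is expected to expose: _discover_tokenization_rules()
# only requires a `get_tokenization_patterns()` attribute returning dicts with
# 'pattern' and 'replacement', plus optional 'priority' and 'description'.
# The percentage rule below is purely illustrative and hypothetical.
class _ExamplePercentToken:
    """Hypothetical rule provider used only to illustrate the protocol."""

    @classmethod
    def get_tokenization_patterns(cls):
        return [{
            'pattern': r'(\d+)%',
            'replacement': r'Percent(\1)',   # may also be a callable for re.sub
            'priority': 10,                  # lower number = applied earlier
            'description': 'Illustrative: 50% -> Percent(50)',
        }]


def _example_manual_rule() -> str:
    """Feeds the hypothetical rule to a tokenizer by hand (sketch only)."""
    tokenizer = UniversalTokenizer()
    tokenizer.tokenization_rules.append({
        'class_name': '_ExamplePercentToken',
        'class_obj': _ExamplePercentToken,
        **_ExamplePercentToken.get_tokenization_patterns()[0],
    })
    tokenizer.tokenization_rules.sort(key=lambda r: r.get('priority', 100))
    return tokenizer.preprocess_tokens("50% + 25%")  # -> "Percent(50) + Percent(25)"

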
class TokenizationParser:
    """
    Parser that combines universal tokenization with BracketParser compatibility.
    """

    def __init__(self, enable_tokenization=True, debug=False):
        self.enable_tokenization = enable_tokenization
        self.debug = debug
        self.tokenizer = UniversalTokenizer()
        self.tokenizer.debug = debug

        # Statistics
        self.conversion_stats = {}
        self.total_expressions_processed = 0

    def process_expression(self, expression: str) -> str:
        """Processes an expression, applying tokenization when it is enabled."""
        self.total_expressions_processed += 1

        if not self.enable_tokenization:
            return expression

        # Apply universal tokenization
        tokenized = self.tokenizer.preprocess_tokens(expression)

        if self.debug and tokenized != expression:
            print(f"🔧 Parser: '{expression}' → '{tokenized}'")

        return tokenized

    def reload_tokenization_rules(self):
        """Reloads the tokenization rules."""
        self.tokenizer.reload_tokenization_rules()

    def get_tokenization_info(self) -> Dict[str, Any]:
        """Full information about the tokenization system."""
        return {
            'rules': self.tokenizer.get_tokenization_info(),
            'statistics': {
                'total_processed': self.total_expressions_processed,
                'tokenization_enabled': self.enable_tokenization,
                'rules_count': len(self.tokenizer.tokenization_rules)
            }
        }

    def test_patterns(self, test_cases: List[str]) -> Dict[str, Any]:
        """Exercises the tokenization patterns against a list of test cases."""
        results = self.tokenizer.test_tokenization(test_cases)

        return {
            'input_expressions': test_cases,
            'tokenized_results': results,
            'successful_conversions': [expr for expr, result in results.items() if expr != result],
            'failed_conversions': [expr for expr, result in results.items() if expr == result]
        }


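# A small usage sketch (assumption: no particular rules are registered here;
# with an empty registry the expression simply passes through unchanged).
# It illustrates the shape of the dict returned by get_tokenization_info()
# above: {'rules': [...], 'statistics': {'total_processed',
# 'tokenization_enabled', 'rules_count'}}.
def _example_parser_usage() -> Dict[str, Any]:
    parser = TokenizationParser(enable_tokenization=True, debug=False)
    parser.process_expression("2 + 2")        # counted in the statistics
    return parser.get_tokenization_info()     # e.g. {'rules': [], 'statistics': {...}}

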
# Keep BracketParser as a backwards-compatibility alias
class BracketParser(TokenizationParser):
    """
    Compatibility alias for existing code.
    """

    def __init__(self, enable_tokenization=True, debug=False):
        super().__init__(enable_tokenization, debug)
        if debug:
            print("🔄 BracketParser started with universal tokenization")

    def parse_expression(self, expression: str) -> str:
        """Compatibility method that delegates to the new system."""
        return self.process_expression(expression)


# Utility functions for testing and preview
def preview_tokenization(expression: str, debug: bool = True) -> Dict[str, Any]:
    """Previews how an expression would be tokenized."""
    tokenizer = UniversalTokenizer()
    tokenizer.debug = debug

    original = expression
    tokenized = tokenizer.preprocess_tokens(expression)

    return {
        'original': original,
        'tokenized': tokenized,
        'changed': original != tokenized,
        'rules_applied': tokenizer.get_tokenization_info()
    }


def test_tokenization_patterns(test_cases: List[str]) -> None:
    """Testing helper that validates the tokenization patterns."""
    parser = TokenizationParser(debug=True)

    print("🧪 Testing tokenization patterns...")
    print("=" * 50)

    results = parser.test_patterns(test_cases)

    print(f"\nResults for {len(test_cases)} test cases:")
    print(f"✅ Converted: {len(results['successful_conversions'])}")
    print(f"❌ Unchanged: {len(results['failed_conversions'])}")

    if results['successful_conversions']:
        print("\n🎉 Successful conversions:")
        for expr in results['successful_conversions']:
            result = results['tokenized_results'][expr]
            print(f"   '{expr}' → '{result}'")

    if results['failed_conversions']:
        print("\n⚠️ No conversion:")
        for expr in results['failed_conversions']:
            print(f"   '{expr}' (unchanged)")

    print("\n📋 Active rules:")
    info = parser.get_tokenization_info()
    for rule in info['rules']:
        print(f"   {rule['priority']:2d}: {rule['class']} - {rule['description']}")