2024-07-30 09:19:19 -03:00
import pandas as pd
2024-07-30 09:58:19 -03:00
from openai import OpenAI
2024-07-30 09:19:19 -03:00
import os
2024-07-31 07:02:49 -03:00
from openai_api_key import openai_api_key
from google_api_key import google_api_key
2024-07-30 13:03:39 -03:00
import ollama
2024-07-30 16:48:02 -03:00
import json
2024-07-31 07:02:49 -03:00
from google . cloud import translate_v2 as translate
from google . oauth2 import service_account
import html
2024-07-31 11:21:24 -03:00
from tqdm import tqdm
2024-09-27 11:08:13 -03:00
import PyLibrary . funciones_comunes as fc
2024-08-01 12:57:04 -03:00
import time
2024-10-12 09:06:22 -03:00
from translation_config import TranslationConfig
2024-11-18 07:31:36 -03:00
from openpyxl . styles import PatternFill , Alignment
import sys
2024-08-01 12:57:04 -03:00
2024-07-31 09:17:01 -03:00
GOOGLE_APPLICATION_CREDENTIALS = " translate-431108-020c17463fbb.json "
2024-10-12 09:06:22 -03:00
batch_size = 20
# Definir el logger a nivel de módulo
logger = None
2024-07-30 09:58:19 -03:00
2024-12-18 12:31:30 -03:00
# Crear el cliente OpenAI
openai_client = OpenAI ( api_key = openai_api_key ( ) )
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
def init_google_translate_client ( ) :
if os . path . exists ( GOOGLE_APPLICATION_CREDENTIALS ) :
# Usar credenciales de cuenta de servicio
credentials = service_account . Credentials . from_service_account_file (
GOOGLE_APPLICATION_CREDENTIALS
)
return translate . Client ( credentials = credentials )
else :
2024-07-31 09:17:01 -03:00
raise ValueError (
" No se han proporcionado credenciales válidas para Google Translate "
)
2024-10-12 09:06:22 -03:00
2024-07-31 07:02:49 -03:00
google_translate_client = init_google_translate_client ( )
2024-07-30 09:58:19 -03:00
2024-10-12 09:06:22 -03:00
2024-07-31 07:02:49 -03:00
def google_translate ( text , target_language ) :
result = google_translate_client . translate ( text , target_language = target_language )
2024-07-31 09:17:01 -03:00
translated_text = result [ " translatedText " ]
2024-07-31 07:02:49 -03:00
return html . unescape ( translated_text )
2024-07-31 09:17:01 -03:00
2024-07-30 12:16:58 -03:00
def read_system_prompt ( ) :
try :
2024-07-31 07:02:49 -03:00
with open ( " . \\ data \\ system_prompt.txt " , " r " , encoding = " utf-8 " ) as file :
2024-07-30 12:16:58 -03:00
return file . read ( ) . strip ( )
except FileNotFoundError :
2024-07-31 09:17:01 -03:00
logger . warning (
" Archivo system_prompt.txt no encontrado. Usando prompt por defecto. "
)
2024-07-30 12:16:58 -03:00
return " You are a translator. "
2024-07-30 16:48:02 -03:00
2024-07-31 09:17:01 -03:00
def translate_batch_ollama ( texts , source_lang , target_lang ) :
joined_text = " \n " . join ( texts )
system_prompt = read_system_prompt ( )
logger . info (
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { joined_text } "
)
response = ollama . generate (
model = " llama3.1 " ,
prompt = f " Translate the following texts from { source_lang } to { target_lang } while preserving special fields like <> and <#>. { system_prompt } : \n \n { joined_text } " ,
)
translations = response [ " response " ] . strip ( ) . split ( " \n " )
logger . info ( f " Respuestas recibidas: \n { translations } " )
return translations
2024-07-31 07:02:49 -03:00
def translate_batch_openai ( texts_dict , source_lang , target_lang ) :
2024-07-30 12:16:58 -03:00
system_prompt = read_system_prompt ( )
2024-07-31 07:02:49 -03:00
texts_list = list ( texts_dict . values ( ) )
2024-07-31 09:17:01 -03:00
request_payload = json . dumps (
{ " texts " : texts_list , " source_lang " : source_lang , " target_lang " : target_lang }
)
2024-07-30 09:58:19 -03:00
logger . info (
2024-07-31 07:02:49 -03:00
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { request_payload } "
2024-07-30 09:58:19 -03:00
)
2024-07-31 07:02:49 -03:00
2024-07-31 09:17:01 -03:00
response = openai_client . chat . completions . create (
2024-07-31 07:02:49 -03:00
model = " gpt-4o-mini " ,
2024-07-30 09:58:19 -03:00
messages = [
2024-07-30 16:48:02 -03:00
{ " role " : " system " , " content " : f " You are a translator. { system_prompt } . " } ,
2024-07-31 09:17:01 -03:00
{ " role " : " user " , " content " : request_payload } ,
2024-07-30 09:58:19 -03:00
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
2024-07-30 09:19:19 -03:00
)
2024-07-30 16:48:02 -03:00
response_payload = json . loads ( response . choices [ 0 ] . message . content . strip ( ) )
translations = response_payload . get ( " texts " , [ ] )
2024-07-30 09:58:19 -03:00
logger . info ( f " Respuestas recibidas: \n { translations } " )
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
if len ( translations ) != len ( texts_list ) :
2024-07-31 09:17:01 -03:00
raise ValueError (
" La cantidad de traducciones recibidas no coincide con la cantidad de textos enviados. "
)
2024-07-31 07:02:49 -03:00
return dict ( zip ( texts_dict . keys ( ) , translations ) )
2024-07-30 09:19:19 -03:00
2024-07-30 13:03:39 -03:00
2024-10-12 09:06:22 -03:00
def main ( config : TranslationConfig ) :
df = fc . read_dataframe_with_cleanup_retries ( config . get_translate_path ( ) )
2024-10-08 11:58:04 -03:00
2024-10-12 09:06:22 -03:00
source_col = config . codigo_columna_maestra
2024-11-18 07:31:36 -03:00
source_translated_col = f " { config . codigo_idioma_seleccionado } _Propuesto "
2024-10-12 09:06:22 -03:00
target_col = f " { config . codigo_idioma_seleccionado } Translated "
check_translate_col = f " { config . codigo_idioma_seleccionado } CheckTranslate "
affinity_col = f " { config . codigo_idioma_seleccionado } Affinity "
2024-07-30 09:19:19 -03:00
2024-07-30 11:17:13 -03:00
# Asegurarse de que la columna de destino existe
2024-10-12 09:06:22 -03:00
for col in [ target_col , check_translate_col , affinity_col ] :
if col not in df . columns :
df [ col ] = None
2024-07-30 09:19:19 -03:00
2024-07-31 07:02:49 -03:00
texts_to_translate = { }
2024-07-30 09:58:19 -03:00
2024-10-14 10:47:49 -03:00
# Inicializar ProgressBar para la fase de preparación
2024-11-18 07:31:36 -03:00
prep_progress = fc . ProgressBar (
len ( df ) , prefix = " Preparando textos: " , suffix = " Completado "
)
2024-09-20 09:30:33 -03:00
for index , row in df . iterrows ( ) :
2024-08-01 08:53:38 -03:00
celda_clave = str ( row [ source_col ] )
2024-11-18 07:31:36 -03:00
source_translated_text = (
str ( row [ source_translated_col ] )
if source_translated_col in df . columns
else " "
)
celda_clave_compactada = fc . compactar_celda_traducida (
config . codigo_tipo_PLC , celda_clave
)
2024-07-31 09:17:01 -03:00
2024-10-12 09:06:22 -03:00
if config . traducir_todo :
2024-11-18 07:31:36 -03:00
if fc . texto_requiere_traduccion (
config . codigo_tipo_PLC , celda_clave_compactada , logger
) :
2024-10-12 09:06:22 -03:00
df . at [ index , source_translated_col ] = " "
2024-08-01 08:53:38 -03:00
texts_to_translate [ celda_clave ] = celda_clave_compactada
2024-07-30 11:17:13 -03:00
else :
2024-11-18 07:31:36 -03:00
if (
pd . isna ( row [ source_translated_col ] )
or source_translated_text . strip ( ) == " "
) :
if fc . texto_requiere_traduccion (
config . codigo_tipo_PLC , celda_clave_compactada , logger
) or fc . texto_con_campos_especiales (
config . codigo_tipo_PLC , celda_clave_compactada
) :
2024-08-01 08:53:38 -03:00
texts_to_translate [ celda_clave ] = celda_clave_compactada
2024-11-18 07:31:36 -03:00
2024-10-14 10:47:49 -03:00
prep_progress . update ( index + 1 )
2024-07-30 09:19:19 -03:00
2024-10-14 10:47:49 -03:00
prep_progress . finish ( )
2024-08-01 08:53:38 -03:00
2024-10-14 10:47:49 -03:00
num_texts = len ( texts_to_translate )
2024-07-30 09:58:19 -03:00
logger . info ( f " Número total de textos a traducir: { num_texts } " )
2024-10-14 10:47:49 -03:00
print ( f " \n Número total de textos a traducir: { num_texts } " )
# Inicializar ProgressBar para la fase de traducción
2024-11-18 07:31:36 -03:00
trans_progress = fc . ProgressBar (
num_texts , prefix = " Traduciendo: " , suffix = " Completado "
)
2024-07-31 09:17:01 -03:00
2024-10-14 10:47:49 -03:00
# Traducciones
2024-07-31 07:02:49 -03:00
translations = { }
2024-07-30 16:48:02 -03:00
for start_idx in range ( 0 , num_texts , batch_size ) :
2024-07-30 09:58:19 -03:00
end_idx = min ( start_idx + batch_size , num_texts )
2024-07-31 07:02:49 -03:00
batch_texts = dict ( list ( texts_to_translate . items ( ) ) [ start_idx : end_idx ] )
2024-11-18 07:31:36 -03:00
logger . info ( f " Traduciendo: celdas desde { start_idx } a { end_idx } . " )
2024-07-31 09:17:01 -03:00
2024-10-12 09:51:41 -03:00
retries = 4
2024-07-31 07:02:49 -03:00
for attempt in range ( retries ) :
try :
2024-07-31 09:17:01 -03:00
batch_translations = translate_batch_openai (
2024-10-12 09:06:22 -03:00
batch_texts ,
2024-10-12 09:51:41 -03:00
fc . idiomas_idiomafromcode ( config . codigo_columna_maestra ) ,
2024-11-18 07:31:36 -03:00
fc . idiomas_idiomafromcode ( config . codigo_idioma_seleccionado ) ,
2024-07-31 09:17:01 -03:00
)
2024-07-31 07:02:49 -03:00
translations . update ( batch_translations )
2024-10-12 09:51:41 -03:00
break
2024-07-31 07:02:49 -03:00
except Exception as e :
2024-10-12 09:51:41 -03:00
if attempt < retries - 1 :
2024-11-18 07:31:36 -03:00
logger . warning (
f " Error en el intento { attempt + 1 } de traducción de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... "
)
print (
f " Error en el intento { attempt + 1 } de traducción de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... "
)
2024-08-01 12:57:04 -03:00
time . sleep ( 3 )
2024-10-12 09:51:41 -03:00
else :
2024-11-18 07:31:36 -03:00
logger . error (
f " Error en todos los intentos de traducción de celdas desde { start_idx } a { end_idx } : { e } "
)
print (
f " Error en todos los intentos de traducción de celdas desde { start_idx } a { end_idx } : { e } "
)
2024-10-14 10:47:49 -03:00
trans_progress . update ( end_idx )
2024-07-30 09:19:19 -03:00
2024-10-14 10:47:49 -03:00
trans_progress . finish ( )
2024-07-30 09:58:19 -03:00
logger . info ( f " Número total de traducciones recibidas: { len ( translations ) } " )
2024-10-14 10:47:49 -03:00
# Inicializar ProgressBar para la fase de actualización del DataFrame
2024-11-18 07:31:36 -03:00
update_progress = fc . ProgressBar (
len ( df ) , prefix = " Actualizando DataFrame: " , suffix = " Completado "
)
2024-10-14 10:47:49 -03:00
2024-07-31 09:17:01 -03:00
# Actualizar el DataFrame con las traducciones y hacemos la Traduccion inversa
2024-10-14 10:47:49 -03:00
for index , row in df . iterrows ( ) :
2024-08-01 08:53:38 -03:00
celda_clave = str ( row [ source_col ] )
if celda_clave in translations :
df . at [ index , target_col ] = translations [ celda_clave ]
2024-07-31 07:02:49 -03:00
try :
2024-10-12 09:06:22 -03:00
google_translation = google_translate (
translations [ celda_clave ] ,
2024-11-18 07:31:36 -03:00
fc . idiomas_shortcodefromcode ( config . codigo_columna_maestra ) ,
2024-10-12 09:06:22 -03:00
)
2024-07-31 07:02:49 -03:00
df . at [ index , check_translate_col ] = google_translation
except Exception as e :
2024-11-18 07:31:36 -03:00
logger . error (
f " Error en la traducción de Google para el texto ' { celda_clave } ' : { e } "
)
2024-07-31 07:02:49 -03:00
df . at [ index , check_translate_col ] = " Error en la traducción "
2024-07-31 09:17:01 -03:00
df . at [ index , affinity_col ] = 0.0
2024-10-14 10:47:49 -03:00
update_progress . increment ( )
update_progress . finish ( )
2024-11-18 07:31:36 -03:00
2024-12-18 12:31:30 -03:00
# Configurar el modelo a usar
modelo_llm = fc . LLM_MODELS [ " OpenAI " ] # o el que se prefiera
api_key = openai_api_key ( ) # solo necesario para OpenAI y Grok
2024-07-31 09:17:01 -03:00
# Afinidades
2024-11-18 08:11:36 -03:00
# Los textos ya vienen del proceso de traducción
texts_to_check = { }
for key , translated_text in translations . items ( ) :
if pd . notna ( translated_text ) and str ( translated_text ) . strip ( ) != " " :
texts_to_check [ key ] = translated_text
# Calcular afinidades usando LLM
2024-12-18 12:31:30 -03:00
affinities_dict = fc . calcular_afinidad_batch (
texts_to_check , config . codigo_tipo_PLC , modelo_llm , logger , api_key
2024-11-18 08:11:36 -03:00
)
2024-07-31 09:17:01 -03:00
2024-11-18 08:11:36 -03:00
# Asignar resultados al DataFrame
2024-07-31 09:17:01 -03:00
for index , row in df . iterrows ( ) :
2024-11-18 08:11:36 -03:00
key = str ( row [ source_col ] )
if key in affinities_dict :
df . at [ index , affinity_col ] = affinities_dict [ key ]
2024-12-18 12:31:30 -03:00
2024-10-12 09:06:22 -03:00
output_path = config . get_auto_translate_path ( )
2024-11-18 07:31:36 -03:00
with pd . ExcelWriter ( output_path , engine = " openpyxl " ) as writer :
df . to_excel ( writer , index = False , sheet_name = " Sheet1 " )
workbook = writer . book
worksheet = writer . sheets [ " Sheet1 " ]
# Inmovilizar paneles en A2
worksheet . freeze_panes = " A2 "
# Configurar ancho de columnas basado en contenido
from openpyxl . utils import get_column_letter
for col in worksheet . columns :
max_length = 0
column = col [ 0 ] . column_letter
for cell in col :
try :
if cell . value :
text_length = len ( str ( cell . value ) )
# Si el texto es más largo que 50, aplicamos wrap_text
if text_length > 50 :
cell . alignment = Alignment ( wrap_text = True , vertical = " top " )
text_length = min (
50 , max ( len ( word ) for word in str ( cell . value ) . split ( ) )
)
max_length = max ( max_length , text_length )
except :
pass
# Ajustar el ancho con un pequeño padding
adjusted_width = min ( 50 , max_length + 2 )
worksheet . column_dimensions [ column ] . width = (
adjusted_width if adjusted_width > 8 else 8
)
# Colores para el formato condicional
light_blue = PatternFill (
start_color = " ADD8E6 " , end_color = " ADD8E6 " , fill_type = " solid "
)
yellow = PatternFill (
start_color = " FFFF00 " , end_color = " FFFF00 " , fill_type = " solid "
)
# Aplicar formatos
for row in worksheet . iter_rows ( min_row = 2 ) :
translated_cell = row [ df . columns . get_loc ( target_col ) ]
if translated_cell . value :
affinity_cell = row [ df . columns . get_loc ( affinity_col ) ]
try :
affinity_value = float (
affinity_cell . value if affinity_cell . value else 0
)
if affinity_value == 1 :
translated_cell . fill = light_blue
elif affinity_value < 1 :
translated_cell . fill = yellow
except ( ValueError , TypeError ) :
pass
2024-07-30 09:58:19 -03:00
logger . info ( f " Archivo traducido guardado en: { output_path } " )
2024-10-14 10:47:49 -03:00
print ( f " \n Archivo traducido guardado en: { output_path } " )
2024-07-30 09:19:19 -03:00
2024-11-18 07:31:36 -03:00
2024-10-12 09:06:22 -03:00
def run ( config : TranslationConfig ) :
global logger
logger = fc . configurar_logger ( config . work_dir )
2024-10-14 10:47:49 -03:00
script_name = os . path . basename ( __file__ )
print ( f " \r Iniciando: { script_name } \r " )
2024-10-12 09:06:22 -03:00
main ( config )
2024-11-18 07:31:36 -03:00
2024-08-01 12:57:04 -03:00
if __name__ == " __main__ " :
2024-10-12 09:06:22 -03:00
import menu_pasos_traduccion
2024-11-18 07:31:36 -03:00
menu_pasos_traduccion . main ( )