2024-07-30 09:19:19 -03:00
import pandas as pd
2024-07-30 09:58:19 -03:00
from openai import OpenAI
2024-07-30 09:19:19 -03:00
import os
2024-07-31 07:02:49 -03:00
from openai_api_key import openai_api_key
from google_api_key import google_api_key
2024-07-30 13:03:39 -03:00
import ollama
2024-07-30 16:48:02 -03:00
import json
2024-07-31 07:02:49 -03:00
from google . cloud import translate_v2 as translate
from google . oauth2 import service_account
import html
2024-07-31 11:21:24 -03:00
from tqdm import tqdm
2024-09-27 11:08:13 -03:00
import PyLibrary . funciones_comunes as fc
2024-08-01 12:57:04 -03:00
import time
2024-10-12 09:06:22 -03:00
import PyLibrary . funciones_comunes as fc
from translation_config import TranslationConfig
from openai import OpenAI
from tqdm import tqdm
2024-08-01 12:57:04 -03:00
2024-07-31 09:17:01 -03:00
openai_client = OpenAI ( api_key = openai_api_key ( ) )
GOOGLE_APPLICATION_CREDENTIALS = " translate-431108-020c17463fbb.json "
2024-10-12 09:06:22 -03:00
batch_size = 20
# Definir el logger a nivel de módulo
logger = None
2024-07-30 09:58:19 -03:00
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
def init_google_translate_client ( ) :
if os . path . exists ( GOOGLE_APPLICATION_CREDENTIALS ) :
# Usar credenciales de cuenta de servicio
credentials = service_account . Credentials . from_service_account_file (
GOOGLE_APPLICATION_CREDENTIALS
)
return translate . Client ( credentials = credentials )
else :
2024-07-31 09:17:01 -03:00
raise ValueError (
" No se han proporcionado credenciales válidas para Google Translate "
)
2024-10-12 09:06:22 -03:00
2024-07-31 07:02:49 -03:00
google_translate_client = init_google_translate_client ( )
2024-07-30 09:58:19 -03:00
2024-10-12 09:06:22 -03:00
2024-07-31 07:02:49 -03:00
def google_translate ( text , target_language ) :
result = google_translate_client . translate ( text , target_language = target_language )
2024-07-31 09:17:01 -03:00
translated_text = result [ " translatedText " ]
2024-07-31 07:02:49 -03:00
return html . unescape ( translated_text )
2024-07-31 09:17:01 -03:00
2024-07-30 12:16:58 -03:00
def read_system_prompt ( ) :
try :
2024-07-31 07:02:49 -03:00
with open ( " . \\ data \\ system_prompt.txt " , " r " , encoding = " utf-8 " ) as file :
2024-07-30 12:16:58 -03:00
return file . read ( ) . strip ( )
except FileNotFoundError :
2024-07-31 09:17:01 -03:00
logger . warning (
" Archivo system_prompt.txt no encontrado. Usando prompt por defecto. "
)
2024-07-30 12:16:58 -03:00
return " You are a translator. "
2024-07-30 16:48:02 -03:00
2024-07-31 09:17:01 -03:00
def translate_batch_ollama ( texts , source_lang , target_lang ) :
joined_text = " \n " . join ( texts )
system_prompt = read_system_prompt ( )
logger . info (
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { joined_text } "
)
response = ollama . generate (
model = " llama3.1 " ,
prompt = f " Translate the following texts from { source_lang } to { target_lang } while preserving special fields like <> and <#>. { system_prompt } : \n \n { joined_text } " ,
)
translations = response [ " response " ] . strip ( ) . split ( " \n " )
logger . info ( f " Respuestas recibidas: \n { translations } " )
return translations
2024-07-31 07:02:49 -03:00
def translate_batch_openai ( texts_dict , source_lang , target_lang ) :
2024-07-30 12:16:58 -03:00
system_prompt = read_system_prompt ( )
2024-07-31 07:02:49 -03:00
texts_list = list ( texts_dict . values ( ) )
2024-07-31 09:17:01 -03:00
request_payload = json . dumps (
{ " texts " : texts_list , " source_lang " : source_lang , " target_lang " : target_lang }
)
2024-07-30 09:58:19 -03:00
logger . info (
2024-07-31 07:02:49 -03:00
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { request_payload } "
2024-07-30 09:58:19 -03:00
)
2024-07-31 07:02:49 -03:00
2024-07-31 09:17:01 -03:00
response = openai_client . chat . completions . create (
2024-07-31 07:02:49 -03:00
model = " gpt-4o-mini " ,
2024-07-30 09:58:19 -03:00
messages = [
2024-07-30 16:48:02 -03:00
{ " role " : " system " , " content " : f " You are a translator. { system_prompt } . " } ,
2024-07-31 09:17:01 -03:00
{ " role " : " user " , " content " : request_payload } ,
2024-07-30 09:58:19 -03:00
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
2024-07-30 09:19:19 -03:00
)
2024-07-30 16:48:02 -03:00
response_payload = json . loads ( response . choices [ 0 ] . message . content . strip ( ) )
translations = response_payload . get ( " texts " , [ ] )
2024-07-30 09:58:19 -03:00
logger . info ( f " Respuestas recibidas: \n { translations } " )
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
if len ( translations ) != len ( texts_list ) :
2024-07-31 09:17:01 -03:00
raise ValueError (
" La cantidad de traducciones recibidas no coincide con la cantidad de textos enviados. "
)
2024-07-31 07:02:49 -03:00
return dict ( zip ( texts_dict . keys ( ) , translations ) )
2024-07-30 09:19:19 -03:00
2024-07-30 13:03:39 -03:00
2024-10-12 09:51:41 -03:00
def affinity_batch_openai ( codigo_tipo_PLC , texts_dict ) :
2024-07-31 11:21:24 -03:00
system_prompt = (
" Evaluate the semantic similarity between the following table of pairs of texts in json format on a scale from 0 to 1. "
" Return the similarity scores for every row in JSON format as a list of numbers, without any additional text or formatting. "
)
2024-08-01 08:53:38 -03:00
original_list = [
2024-10-12 09:51:41 -03:00
fc . compactar_celda_traducida ( codigo_tipo_PLC , key ) for key in texts_dict . keys ( )
2024-08-01 08:53:38 -03:00
]
2024-07-31 09:17:01 -03:00
re_translated_list = list ( texts_dict . values ( ) )
2024-07-30 09:58:19 -03:00
2024-07-31 09:17:01 -03:00
request_payload = json . dumps (
{ " original " : original_list , " compared " : re_translated_list }
2024-07-30 09:58:19 -03:00
)
2024-07-31 09:17:01 -03:00
logger . info ( f " Solicitando Afinidad para el lote de textos: \n { request_payload } " )
response = openai_client . chat . completions . create (
model = " gpt-4o-mini " ,
messages = [
{
" role " : " system " ,
2024-07-31 11:21:24 -03:00
" content " : system_prompt ,
2024-07-31 09:17:01 -03:00
} ,
{ " role " : " user " , " content " : request_payload } ,
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
2024-07-30 09:58:19 -03:00
)
2024-07-31 11:21:24 -03:00
response_content = response . choices [ 0 ] . message . content
# Limpiar y convertir el contenido de la respuesta
cleaned_response_content = response_content . strip ( ) . strip ( " ' ```json " ) . strip ( " ``` " )
# Intentar convertir el contenido a JSON
try :
response_payload = json . loads ( cleaned_response_content )
except json . JSONDecodeError :
raise ValueError ( " La respuesta no se pudo decodificar como JSON. " )
# Manejar diferentes formatos de respuesta
2024-08-01 08:53:38 -03:00
if isinstance ( response_payload , dict ) and " similarity_scores " in response_payload :
scores = response_payload [ " similarity_scores " ]
2024-07-31 11:21:24 -03:00
elif isinstance ( response_payload , list ) :
scores = response_payload
else :
raise ValueError ( " Formato de respuesta inesperado. " )
2024-07-31 09:17:01 -03:00
logger . info ( f " Respuestas recibidas: \n { scores } " )
if len ( scores ) != len ( original_list ) :
raise ValueError (
" La cantidad de afinidades recibidas no coincide con la cantidad de textos enviados. "
)
return dict ( zip ( texts_dict . keys ( ) , scores ) )
2024-07-30 09:58:19 -03:00
2024-07-31 11:21:24 -03:00
2024-08-01 12:57:04 -03:00
# Función que calcula la afinidad entre dos textos
2024-09-20 04:24:08 -03:00
def calcular_afinidad ( tipo_PLC , texto1 , texto2 ) :
2024-08-01 12:57:04 -03:00
system_prompt = (
" Evaluate the semantic similarity between the following pair of texts on a scale from 0 to 1. "
" Return the similarity score as a single number. "
)
2024-09-20 04:24:08 -03:00
original_text = fc . compactar_celda_traducida ( tipo_PLC , texto1 )
2024-08-01 12:57:04 -03:00
compared_text = texto2
request_payload = json . dumps ( { " original " : original_text , " compared " : compared_text } )
logger . info ( f " Solicitando afinidad para el par de textos: \n { request_payload } " )
response = openai_client . chat . completions . create (
model = " gpt-4o-mini " ,
messages = [
{
" role " : " system " ,
" content " : system_prompt ,
} ,
{ " role " : " user " , " content " : request_payload } ,
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
)
response_content = response . choices [ 0 ] . message . content
# Limpiar y convertir el contenido de la respuesta
cleaned_response_content = response_content . strip ( ) . strip ( " ' ```json " ) . strip ( " ``` " )
# Intentar convertir el contenido a JSON
try :
score = float ( cleaned_response_content )
except ValueError :
2024-10-12 09:06:22 -03:00
raise ValueError (
f " La respuesta no se pudo decodificar como un número: { cleaned_response_content } "
)
2024-08-01 12:57:04 -03:00
return score
2024-10-08 11:58:04 -03:00
2024-10-12 09:06:22 -03:00
def main ( config : TranslationConfig ) :
2024-10-12 09:51:41 -03:00
global logger
2024-10-12 09:06:22 -03:00
df = fc . read_dataframe_with_cleanup_retries ( config . get_translate_path ( ) )
2024-10-08 11:58:04 -03:00
2024-10-12 09:06:22 -03:00
source_col = config . codigo_columna_maestra
source_translated_col = config . codigo_idioma_seleccionado
target_col = f " { config . codigo_idioma_seleccionado } Translated "
check_translate_col = f " { config . codigo_idioma_seleccionado } CheckTranslate "
affinity_col = f " { config . codigo_idioma_seleccionado } Affinity "
2024-07-30 09:19:19 -03:00
2024-07-30 11:17:13 -03:00
# Asegurarse de que la columna de destino existe
2024-10-12 09:06:22 -03:00
for col in [ target_col , check_translate_col , affinity_col ] :
if col not in df . columns :
df [ col ] = None
2024-07-30 09:19:19 -03:00
2024-07-31 07:02:49 -03:00
texts_to_translate = { }
2024-07-30 09:58:19 -03:00
2024-10-14 10:47:49 -03:00
# Inicializar ProgressBar para la fase de preparación
prep_progress = fc . ProgressBar ( len ( df ) , prefix = ' Preparando textos: ' , suffix = ' Completado ' )
2024-09-20 09:30:33 -03:00
for index , row in df . iterrows ( ) :
2024-08-01 08:53:38 -03:00
celda_clave = str ( row [ source_col ] )
2024-10-14 10:47:49 -03:00
source_translated_text = str ( row [ source_translated_col ] ) if source_translated_col in df . columns else " "
celda_clave_compactada = fc . compactar_celda_traducida ( config . codigo_tipo_PLC , celda_clave )
2024-07-31 09:17:01 -03:00
2024-10-12 09:06:22 -03:00
if config . traducir_todo :
2024-10-14 10:47:49 -03:00
if fc . texto_requiere_traduccion ( config . codigo_tipo_PLC , celda_clave_compactada , logger ) :
2024-10-12 09:06:22 -03:00
df . at [ index , source_translated_col ] = " "
2024-08-01 08:53:38 -03:00
texts_to_translate [ celda_clave ] = celda_clave_compactada
2024-07-30 11:17:13 -03:00
else :
2024-10-14 10:47:49 -03:00
if pd . isna ( row [ source_translated_col ] ) or source_translated_text . strip ( ) == " " :
if fc . texto_requiere_traduccion ( config . codigo_tipo_PLC , celda_clave_compactada , logger ) or fc . texto_con_campos_especiales ( config . codigo_tipo_PLC , celda_clave_compactada ) :
2024-08-01 08:53:38 -03:00
texts_to_translate [ celda_clave ] = celda_clave_compactada
2024-10-14 10:47:49 -03:00
prep_progress . update ( index + 1 )
2024-07-30 09:19:19 -03:00
2024-10-14 10:47:49 -03:00
prep_progress . finish ( )
2024-08-01 08:53:38 -03:00
2024-10-14 10:47:49 -03:00
num_texts = len ( texts_to_translate )
2024-07-30 09:58:19 -03:00
logger . info ( f " Número total de textos a traducir: { num_texts } " )
2024-10-14 10:47:49 -03:00
print ( f " \n Número total de textos a traducir: { num_texts } " )
# Inicializar ProgressBar para la fase de traducción
trans_progress = fc . ProgressBar ( num_texts , prefix = ' Traduciendo: ' , suffix = ' Completado ' )
2024-07-31 09:17:01 -03:00
2024-10-14 10:47:49 -03:00
# Traducciones
2024-07-31 07:02:49 -03:00
translations = { }
2024-07-30 16:48:02 -03:00
for start_idx in range ( 0 , num_texts , batch_size ) :
2024-07-30 09:58:19 -03:00
end_idx = min ( start_idx + batch_size , num_texts )
2024-07-31 07:02:49 -03:00
batch_texts = dict ( list ( texts_to_translate . items ( ) ) [ start_idx : end_idx ] )
2024-10-14 10:47:49 -03:00
logger . info ( f " Traduciendo: celdas desde { start_idx } a { end_idx } . " )
2024-07-31 09:17:01 -03:00
2024-10-12 09:51:41 -03:00
retries = 4
2024-07-31 07:02:49 -03:00
for attempt in range ( retries ) :
try :
2024-07-31 09:17:01 -03:00
batch_translations = translate_batch_openai (
2024-10-12 09:06:22 -03:00
batch_texts ,
2024-10-12 09:51:41 -03:00
fc . idiomas_idiomafromcode ( config . codigo_columna_maestra ) ,
fc . idiomas_idiomafromcode ( config . codigo_idioma_seleccionado )
2024-07-31 09:17:01 -03:00
)
2024-07-31 07:02:49 -03:00
translations . update ( batch_translations )
2024-10-12 09:51:41 -03:00
break
2024-07-31 07:02:49 -03:00
except Exception as e :
2024-10-12 09:51:41 -03:00
if attempt < retries - 1 :
logger . warning ( f " Error en el intento { attempt + 1 } de traducción de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... " )
print ( f " Error en el intento { attempt + 1 } de traducción de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... " )
2024-08-01 12:57:04 -03:00
time . sleep ( 3 )
2024-10-12 09:51:41 -03:00
else :
logger . error ( f " Error en todos los intentos de traducción de celdas desde { start_idx } a { end_idx } : { e } " )
print ( f " Error en todos los intentos de traducción de celdas desde { start_idx } a { end_idx } : { e } " )
2024-10-14 10:47:49 -03:00
trans_progress . update ( end_idx )
2024-07-30 09:19:19 -03:00
2024-10-14 10:47:49 -03:00
trans_progress . finish ( )
2024-07-30 09:58:19 -03:00
logger . info ( f " Número total de traducciones recibidas: { len ( translations ) } " )
2024-10-14 10:47:49 -03:00
# Inicializar ProgressBar para la fase de actualización del DataFrame
update_progress = fc . ProgressBar ( len ( df ) , prefix = ' Actualizando DataFrame: ' , suffix = ' Completado ' )
2024-07-31 09:17:01 -03:00
# Actualizar el DataFrame con las traducciones y hacemos la Traduccion inversa
2024-10-14 10:47:49 -03:00
for index , row in df . iterrows ( ) :
2024-08-01 08:53:38 -03:00
celda_clave = str ( row [ source_col ] )
if celda_clave in translations :
df . at [ index , target_col ] = translations [ celda_clave ]
2024-07-31 07:02:49 -03:00
try :
2024-10-12 09:06:22 -03:00
google_translation = google_translate (
translations [ celda_clave ] ,
2024-10-12 09:51:41 -03:00
fc . idiomas_shortcodefromcode ( config . codigo_columna_maestra )
2024-10-12 09:06:22 -03:00
)
2024-07-31 07:02:49 -03:00
df . at [ index , check_translate_col ] = google_translation
except Exception as e :
2024-10-12 09:51:41 -03:00
logger . error ( f " Error en la traducción de Google para el texto ' { celda_clave } ' : { e } " )
2024-07-31 07:02:49 -03:00
df . at [ index , check_translate_col ] = " Error en la traducción "
2024-07-31 09:17:01 -03:00
df . at [ index , affinity_col ] = 0.0
2024-10-14 10:47:49 -03:00
update_progress . increment ( )
update_progress . finish ( )
# Inicializar ProgressBar para la fase de cálculo de afinidad
affinity_progress = fc . ProgressBar ( num_texts , prefix = ' Calculando afinidad: ' , suffix = ' Completado ' )
2024-07-30 09:19:19 -03:00
2024-07-31 09:17:01 -03:00
# Afinidades
affinities = { }
2024-10-12 09:51:41 -03:00
for start_idx in range ( 0 , num_texts , batch_size ) :
2024-07-31 09:17:01 -03:00
end_idx = min ( start_idx + batch_size , num_texts )
batch_texts = dict ( list ( texts_to_translate . items ( ) ) [ start_idx : end_idx ] )
2024-10-14 10:47:49 -03:00
logger . info ( f " Afinidad: celdas desde { start_idx } a { end_idx } . " )
2024-07-31 09:17:01 -03:00
2024-10-12 09:51:41 -03:00
retries = 2
2024-07-31 09:17:01 -03:00
for attempt in range ( retries ) :
try :
2024-10-12 09:51:41 -03:00
batch_affinities = affinity_batch_openai ( config . codigo_tipo_PLC , batch_texts )
2024-07-31 09:17:01 -03:00
affinities . update ( batch_affinities )
2024-10-12 09:51:41 -03:00
break
2024-07-31 09:17:01 -03:00
except Exception as e :
2024-10-12 09:51:41 -03:00
if attempt < retries - 1 :
logger . warning ( f " Error en el intento { attempt + 1 } de Afinidad de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... " )
print ( f " Error en el intento { attempt + 1 } de Afinidad de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... " )
2024-08-01 12:57:04 -03:00
time . sleep ( 3 )
2024-10-12 09:51:41 -03:00
else :
logger . error ( f " Error en todos los intentos de Afinidad de celdas desde { start_idx } a { end_idx } : { e } " )
print ( f " Error en todos los intentos de Afinidad de celdas desde { start_idx } a { end_idx } : { e } " )
2024-08-01 12:57:04 -03:00
for key , value in batch_texts . items ( ) :
try :
2024-10-12 09:51:41 -03:00
score = calcular_afinidad ( config . codigo_tipo_PLC , key , value )
2024-08-01 12:57:04 -03:00
affinities [ key ] = score
except Exception as ind_e :
affinities [ key ] = " 0 "
2024-10-12 09:51:41 -03:00
logger . error ( f " Error en el cálculo individual de Afinidad para el texto ' { key } ' : { ind_e } " )
print ( f " Error en el cálculo individual de Afinidad para el texto ' { key } ' : { ind_e } " )
2024-10-14 10:47:49 -03:00
affinity_progress . increment ( )
2024-08-01 12:57:04 -03:00
2024-10-14 10:47:49 -03:00
affinity_progress . finish ( )
2024-07-31 09:17:01 -03:00
# Actualizar el DataFrame con las Afinidades
for index , row in df . iterrows ( ) :
2024-08-01 08:53:38 -03:00
celda_clave = str ( row [ source_col ] )
if celda_clave in affinities :
df . at [ index , affinity_col ] = affinities [ celda_clave ]
2024-07-31 09:17:01 -03:00
2024-10-12 09:06:22 -03:00
output_path = config . get_auto_translate_path ( )
2024-08-01 08:53:38 -03:00
fc . save_dataframe_with_retries ( df , output_path = output_path )
2024-07-30 09:58:19 -03:00
logger . info ( f " Archivo traducido guardado en: { output_path } " )
2024-10-14 10:47:49 -03:00
print ( f " \n Archivo traducido guardado en: { output_path } " )
2024-07-30 09:19:19 -03:00
2024-10-12 09:06:22 -03:00
def run ( config : TranslationConfig ) :
global logger
logger = fc . configurar_logger ( config . work_dir )
2024-10-14 10:47:49 -03:00
script_name = os . path . basename ( __file__ )
print ( f " \r Iniciando: { script_name } \r " )
2024-10-12 09:06:22 -03:00
main ( config )
2024-08-01 12:57:04 -03:00
if __name__ == " __main__ " :
2024-10-12 09:06:22 -03:00
import menu_pasos_traduccion
2024-10-14 10:47:49 -03:00
menu_pasos_traduccion . main ( )