2024-07-30 09:19:19 -03:00
import pandas as pd
2024-07-30 09:58:19 -03:00
from openai import OpenAI
2024-07-30 09:19:19 -03:00
import os
import re
2024-07-30 09:58:19 -03:00
import logging
2024-07-31 07:02:49 -03:00
from openai_api_key import openai_api_key
from google_api_key import google_api_key
2024-07-30 13:03:39 -03:00
import ollama
2024-07-30 16:48:02 -03:00
import json
2024-07-31 07:02:49 -03:00
from google . cloud import translate_v2 as translate
from google . oauth2 import service_account
import html
2024-07-31 11:21:24 -03:00
from tqdm import tqdm
import time
2024-07-31 11:54:44 -03:00
import funciones_comunes
2024-07-30 09:19:19 -03:00
2024-07-31 09:17:01 -03:00
openai_client = OpenAI ( api_key = openai_api_key ( ) )
GOOGLE_APPLICATION_CREDENTIALS = " translate-431108-020c17463fbb.json "
2024-07-30 09:19:19 -03:00
2024-07-31 09:17:01 -03:00
2024-07-30 09:58:19 -03:00
def configurar_logger ( ) :
logger = logging . getLogger ( " translate_logger " )
2024-07-31 07:02:49 -03:00
logger . setLevel ( logging . DEBUG )
2024-07-30 13:03:39 -03:00
os . makedirs ( " . \\ data " , exist_ok = True )
fh = logging . FileHandler ( " . \\ data \\ translate_log.log " , encoding = " utf-8 " )
2024-07-30 09:58:19 -03:00
fh . setLevel ( logging . DEBUG )
formatter = logging . Formatter ( " %(asctime)s - %(levelname)s - %(message)s " )
fh . setFormatter ( formatter )
logger . addHandler ( fh )
return logger
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
def init_google_translate_client ( ) :
if os . path . exists ( GOOGLE_APPLICATION_CREDENTIALS ) :
# Usar credenciales de cuenta de servicio
credentials = service_account . Credentials . from_service_account_file (
GOOGLE_APPLICATION_CREDENTIALS
)
return translate . Client ( credentials = credentials )
else :
2024-07-31 09:17:01 -03:00
raise ValueError (
" No se han proporcionado credenciales válidas para Google Translate "
)
2024-07-30 09:58:19 -03:00
2024-07-31 07:02:49 -03:00
google_translate_client = init_google_translate_client ( )
2024-07-30 09:58:19 -03:00
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
def google_translate ( text , target_language ) :
result = google_translate_client . translate ( text , target_language = target_language )
2024-07-31 09:17:01 -03:00
translated_text = result [ " translatedText " ]
2024-07-31 07:02:49 -03:00
return html . unescape ( translated_text )
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
logger = configurar_logger ( )
2024-07-30 09:58:19 -03:00
2024-07-31 09:17:01 -03:00
2024-07-30 12:16:58 -03:00
def read_system_prompt ( ) :
try :
2024-07-31 07:02:49 -03:00
with open ( " . \\ data \\ system_prompt.txt " , " r " , encoding = " utf-8 " ) as file :
2024-07-30 12:16:58 -03:00
return file . read ( ) . strip ( )
except FileNotFoundError :
2024-07-31 09:17:01 -03:00
logger . warning (
" Archivo system_prompt.txt no encontrado. Usando prompt por defecto. "
)
2024-07-30 12:16:58 -03:00
return " You are a translator. "
2024-07-30 16:48:02 -03:00
2024-07-31 09:17:01 -03:00
def translate_batch_ollama ( texts , source_lang , target_lang ) :
joined_text = " \n " . join ( texts )
system_prompt = read_system_prompt ( )
logger . info (
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { joined_text } "
)
response = ollama . generate (
model = " llama3.1 " ,
prompt = f " Translate the following texts from { source_lang } to { target_lang } while preserving special fields like <> and <#>. { system_prompt } : \n \n { joined_text } " ,
)
translations = response [ " response " ] . strip ( ) . split ( " \n " )
logger . info ( f " Respuestas recibidas: \n { translations } " )
return translations
def texto_requiere_traduccion ( texto ) :
palabras = re . findall ( r " \ b \ w { 4,} \ b " , texto )
campos_especiales = re . findall ( r " <.*?> " , texto )
requiere_traduccion = len ( palabras ) > 0 or len ( campos_especiales ) != len (
re . findall ( r " <#> " , texto )
)
logger . debug (
f " Decisión de traducción para texto ' { texto } ' : { ' Sí ' if requiere_traduccion else ' No ' } (palabras > 3 letras: { len ( palabras ) > 0 } , solo campos especiales: { len ( campos_especiales ) == len ( re . findall ( r ' <#> ' , texto ) ) } ) "
)
return requiere_traduccion
2024-07-31 07:02:49 -03:00
def translate_batch_openai ( texts_dict , source_lang , target_lang ) :
2024-07-30 12:16:58 -03:00
system_prompt = read_system_prompt ( )
2024-07-31 07:02:49 -03:00
texts_list = list ( texts_dict . values ( ) )
2024-07-31 09:17:01 -03:00
request_payload = json . dumps (
{ " texts " : texts_list , " source_lang " : source_lang , " target_lang " : target_lang }
)
2024-07-30 09:58:19 -03:00
logger . info (
2024-07-31 07:02:49 -03:00
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { request_payload } "
2024-07-30 09:58:19 -03:00
)
2024-07-31 07:02:49 -03:00
2024-07-31 09:17:01 -03:00
response = openai_client . chat . completions . create (
2024-07-31 07:02:49 -03:00
model = " gpt-4o-mini " ,
2024-07-30 09:58:19 -03:00
messages = [
2024-07-30 16:48:02 -03:00
{ " role " : " system " , " content " : f " You are a translator. { system_prompt } . " } ,
2024-07-31 09:17:01 -03:00
{ " role " : " user " , " content " : request_payload } ,
2024-07-30 09:58:19 -03:00
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
2024-07-30 09:19:19 -03:00
)
2024-07-30 16:48:02 -03:00
response_payload = json . loads ( response . choices [ 0 ] . message . content . strip ( ) )
translations = response_payload . get ( " texts " , [ ] )
2024-07-30 09:58:19 -03:00
logger . info ( f " Respuestas recibidas: \n { translations } " )
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
if len ( translations ) != len ( texts_list ) :
2024-07-31 09:17:01 -03:00
raise ValueError (
" La cantidad de traducciones recibidas no coincide con la cantidad de textos enviados. "
)
2024-07-31 07:02:49 -03:00
return dict ( zip ( texts_dict . keys ( ) , translations ) )
2024-07-30 09:19:19 -03:00
2024-07-30 13:03:39 -03:00
2024-07-31 09:17:01 -03:00
def affinity_batch_openai ( texts_dict ) :
2024-07-31 11:21:24 -03:00
system_prompt = (
" Evaluate the semantic similarity between the following table of pairs of texts in json format on a scale from 0 to 1. "
" Return the similarity scores for every row in JSON format as a list of numbers, without any additional text or formatting. "
)
2024-07-31 11:54:44 -03:00
original_list = [ funciones_comunes . transformar_texto ( key ) for key in texts_dict . keys ( ) ]
2024-07-31 09:17:01 -03:00
re_translated_list = list ( texts_dict . values ( ) )
2024-07-30 09:58:19 -03:00
2024-07-31 09:17:01 -03:00
request_payload = json . dumps (
{ " original " : original_list , " compared " : re_translated_list }
2024-07-30 09:58:19 -03:00
)
2024-07-31 09:17:01 -03:00
logger . info ( f " Solicitando Afinidad para el lote de textos: \n { request_payload } " )
response = openai_client . chat . completions . create (
model = " gpt-4o-mini " ,
messages = [
{
" role " : " system " ,
2024-07-31 11:21:24 -03:00
" content " : system_prompt ,
2024-07-31 09:17:01 -03:00
} ,
{ " role " : " user " , " content " : request_payload } ,
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
2024-07-30 09:58:19 -03:00
)
2024-07-31 11:21:24 -03:00
response_content = response . choices [ 0 ] . message . content
# Limpiar y convertir el contenido de la respuesta
cleaned_response_content = response_content . strip ( ) . strip ( " ' ```json " ) . strip ( " ``` " )
# Intentar convertir el contenido a JSON
try :
response_payload = json . loads ( cleaned_response_content )
except json . JSONDecodeError :
raise ValueError ( " La respuesta no se pudo decodificar como JSON. " )
# Manejar diferentes formatos de respuesta
if isinstance ( response_payload , dict ) and ' similarity_scores ' in response_payload :
scores = response_payload [ ' similarity_scores ' ]
elif isinstance ( response_payload , list ) :
scores = response_payload
else :
raise ValueError ( " Formato de respuesta inesperado. " )
2024-07-31 09:17:01 -03:00
logger . info ( f " Respuestas recibidas: \n { scores } " )
if len ( scores ) != len ( original_list ) :
raise ValueError (
" La cantidad de afinidades recibidas no coincide con la cantidad de textos enviados. "
)
return dict ( zip ( texts_dict . keys ( ) , scores ) )
2024-07-30 09:58:19 -03:00
2024-07-31 11:21:24 -03:00
2024-07-31 07:02:49 -03:00
def main ( file_path , target_lang_code , target_lang , traducir_todo , batch_size = 10 ) :
2024-07-30 09:19:19 -03:00
df = pd . read_excel ( file_path )
source_col = " it-IT "
2024-07-30 11:17:13 -03:00
source_translated_col = target_lang_code
2024-07-30 09:19:19 -03:00
target_col = f " { target_lang_code } Translated "
2024-07-31 07:02:49 -03:00
check_translate_col = f " { target_lang_code } CheckTranslate "
2024-07-31 09:17:01 -03:00
affinity_col = f " { target_lang_code } Affinity "
2024-07-30 09:19:19 -03:00
2024-07-30 11:17:13 -03:00
# Asegurarse de que la columna de destino existe
if target_col not in df . columns :
2024-07-30 09:19:19 -03:00
df [ target_col ] = None
2024-07-31 07:02:49 -03:00
if check_translate_col not in df . columns :
df [ check_translate_col ] = None
2024-07-31 09:17:01 -03:00
if affinity_col not in df . columns :
df [ affinity_col ] = None
2024-07-30 09:19:19 -03:00
2024-07-31 07:02:49 -03:00
texts_to_translate = { }
2024-07-30 09:58:19 -03:00
2024-07-31 07:02:49 -03:00
for _ , row in df . iterrows ( ) :
2024-07-30 11:17:13 -03:00
source_text = str ( row [ source_col ] )
2024-07-31 09:17:01 -03:00
source_translated_text = (
str ( row [ source_translated_col ] )
if source_translated_col in df . columns
else " "
)
2024-07-31 11:54:44 -03:00
processed_text = funciones_comunes . transformar_texto ( source_text )
2024-07-31 09:17:01 -03:00
2024-07-30 11:17:13 -03:00
if traducir_todo :
2024-07-30 09:58:19 -03:00
if texto_requiere_traduccion ( processed_text ) :
2024-07-31 07:02:49 -03:00
texts_to_translate [ source_text ] = processed_text
2024-07-30 11:17:13 -03:00
else :
2024-07-31 09:17:01 -03:00
if (
pd . isna ( row [ source_translated_col ] )
or source_translated_text . strip ( ) == " "
) :
2024-07-30 11:17:13 -03:00
if texto_requiere_traduccion ( processed_text ) :
2024-07-31 07:02:49 -03:00
texts_to_translate [ source_text ] = processed_text
2024-07-30 09:19:19 -03:00
2024-07-30 09:58:19 -03:00
num_texts = len ( texts_to_translate )
logger . info ( f " Número total de textos a traducir: { num_texts } " )
2024-07-30 12:16:58 -03:00
print ( f " Número total de textos a traducir: { num_texts } " )
2024-07-31 09:17:01 -03:00
# Traducciones
# Hacer las traducciones via LLM en batch
2024-07-31 07:02:49 -03:00
translations = { }
2024-07-30 16:48:02 -03:00
for start_idx in range ( 0 , num_texts , batch_size ) :
2024-07-30 09:58:19 -03:00
end_idx = min ( start_idx + batch_size , num_texts )
2024-07-31 07:02:49 -03:00
batch_texts = dict ( list ( texts_to_translate . items ( ) ) [ start_idx : end_idx ] )
2024-07-30 16:48:02 -03:00
logger . info ( f " Traduciendo: celdas desde { start_idx } a { end_idx } . " )
2024-07-30 12:16:58 -03:00
print ( f " Traduciendo : celdas desde: { start_idx } a : { end_idx } . " )
2024-07-31 09:17:01 -03:00
2024-07-31 07:02:49 -03:00
retries = 2 # Número de intentos totales (1 inicial + 1 reintento)
for attempt in range ( retries ) :
try :
2024-07-31 09:17:01 -03:00
batch_translations = translate_batch_openai (
batch_texts , " Italian " , target_lang
)
2024-07-31 07:02:49 -03:00
translations . update ( batch_translations )
break # Si la traducción es exitosa, salimos del bucle de reintentos
except Exception as e :
if attempt < retries - 1 : # Si no es el último intento
2024-07-31 09:17:01 -03:00
logger . warning (
f " Error en el intento { attempt + 1 } de traducción de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... "
)
print (
f " Error en el intento { attempt + 1 } de traducción de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... "
)
2024-07-31 07:02:49 -03:00
else : # Si es el último intento
2024-07-31 09:17:01 -03:00
logger . error (
f " Error en todos los intentos de traducción de celdas desde { start_idx } a { end_idx } : { e } "
)
print (
f " Error en todos los intentos de traducción de celdas desde { start_idx } a { end_idx } : { e } "
)
2024-07-30 09:19:19 -03:00
2024-07-30 09:58:19 -03:00
logger . info ( f " Número total de traducciones recibidas: { len ( translations ) } " )
2024-07-31 09:17:01 -03:00
# Traduccion inversa
# Actualizar el DataFrame con las traducciones y hacemos la Traduccion inversa
2024-07-31 11:21:24 -03:00
for index , row in tqdm ( df . iterrows ( ) , total = df . shape [ 0 ] , desc = " Procesando traducciones " ) :
2024-07-31 07:02:49 -03:00
source_text = str ( row [ source_col ] )
if source_text in translations :
df . at [ index , target_col ] = translations [ source_text ]
# Realizar la traducción de verificación con Google Translate
try :
2024-07-31 09:17:01 -03:00
google_translation = google_translate ( translations [ source_text ] , " it " )
2024-07-31 07:02:49 -03:00
df . at [ index , check_translate_col ] = google_translation
except Exception as e :
2024-07-31 09:17:01 -03:00
logger . error (
f " Error en la traducción de Google para el texto ' { source_text } ' : { e } "
)
2024-07-31 07:02:49 -03:00
df . at [ index , check_translate_col ] = " Error en la traducción "
2024-07-31 09:17:01 -03:00
df . at [ index , affinity_col ] = 0.0
2024-07-30 09:19:19 -03:00
2024-07-31 09:17:01 -03:00
# Afinidades
# Se calculan las Afinidades
affinities = { }
2024-07-31 11:21:24 -03:00
batch_size = 10
2024-07-31 09:17:01 -03:00
for start_idx in range ( 0 , num_texts , batch_size ) :
end_idx = min ( start_idx + batch_size , num_texts )
batch_texts = dict ( list ( texts_to_translate . items ( ) ) [ start_idx : end_idx ] )
logger . info ( f " Afinidad: celdas desde { start_idx } a { end_idx } . " )
print ( f " Afinidad: celdas desde: { start_idx } a : { end_idx } . " )
retries = 2 # Número de intentos totales (1 inicial + 1 reintento)
for attempt in range ( retries ) :
try :
batch_affinities = affinity_batch_openai ( batch_texts )
affinities . update ( batch_affinities )
break # Si la llamada es exitosa, salimos del bucle de reintentos
except Exception as e :
if attempt < retries - 1 : # Si no es el último intento
logger . warning (
f " Error en el intento { attempt + 1 } de Afinidad de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... "
)
print (
f " Error en el intento { attempt + 1 } de Afinidad de celdas desde { start_idx } a { end_idx } : { e } . Reintentando... "
)
else : # Si es el último intento
logger . error (
f " Error en todos los intentos de Afinidad de celdas desde { start_idx } a { end_idx } : { e } "
)
print (
f " Error en todos los intentos de Afinidad de celdas desde { start_idx } a { end_idx } : { e } "
)
# Actualizar el DataFrame con las Afinidades
for index , row in df . iterrows ( ) :
source_text = str ( row [ source_col ] )
2024-07-31 11:21:24 -03:00
if source_text in affinities :
2024-07-31 09:17:01 -03:00
df . at [ index , affinity_col ] = affinities [ source_text ]
output_path = os . path . join (
os . path . dirname ( file_path ) , " 3_master_export2translate_translated.xlsx "
)
2024-07-31 11:54:44 -03:00
funciones_comunes . save_dataframe_with_retries ( df , output_path = output_path )
2024-07-30 09:58:19 -03:00
logger . info ( f " Archivo traducido guardado en: { output_path } " )
2024-07-30 09:19:19 -03:00
print ( f " Archivo traducido guardado en: { output_path } " )
2024-07-31 09:17:01 -03:00
2024-07-30 09:19:19 -03:00
if __name__ == " __main__ " :
2024-07-30 16:48:02 -03:00
batch_size = 20
2024-07-30 13:03:39 -03:00
translate_file = " . \\ data \\ 2_master_export2translate.xlsx "
2024-07-30 09:19:19 -03:00
2024-07-31 11:54:44 -03:00
funciones_comunes . mostrar_idiomas ( )
2024-07-30 09:19:19 -03:00
seleccion_idioma = int ( input ( " Introduce el número del idioma de destino: " ) )
2024-07-31 11:54:44 -03:00
if seleccion_idioma not in funciones_comunes . IDIOMAS :
2024-07-30 09:19:19 -03:00
print ( " Selección inválida. " )
else :
2024-07-31 11:54:44 -03:00
target_lang , target_lang_code = funciones_comunes . IDIOMAS [ seleccion_idioma ]
2024-07-30 09:58:19 -03:00
traducir_todo = (
input ( " ¿Desea traducir todas las celdas (s/n)? " ) . strip ( ) . lower ( ) == " s "
)
2024-07-31 09:17:01 -03:00
main ( translate_file , target_lang_code , target_lang , traducir_todo , batch_size )