2024-07-30 09:19:19 -03:00
import pandas as pd
2024-07-30 09:58:19 -03:00
from openai import OpenAI
2024-07-30 09:19:19 -03:00
import os
import re
2024-07-30 09:58:19 -03:00
import logging
2024-07-30 09:19:19 -03:00
from openai_api_key import api_key
2024-07-30 13:03:39 -03:00
from x2_master_export2translate import transformar_texto
import ollama
2024-07-30 16:48:02 -03:00
import json
2024-07-30 09:19:19 -03:00
2024-07-30 09:58:19 -03:00
# Module-level OpenAI client, created once at import time with the key returned by api_key().
client = OpenAI ( api_key = api_key ( ) )
2024-07-30 09:19:19 -03:00
# Language menu: option number -> (language name, locale code).
# Option numbers are assigned sequentially starting at 1, in the order listed.
IDIOMAS = dict(
    enumerate(
        [
            (" English ", " en-GB "),
            (" Portuguese ", " pt-PT "),
            (" Spanish ", " es-ES "),
            (" Russian ", " ru-RU "),
            (" French ", " fr-FR "),
            (" German ", " de-DE "),
        ],
        start=1,
    )
)
2024-07-30 09:58:19 -03:00
def configurar_logger():
    """Return the translation logger, attaching its file handler only once.

    The logger level is set to DEBUG on every call. On the first call the
    data directory is created if needed and a DEBUG-level file handler with
    a "time - level - message" format is attached. Later calls find the
    existing file handler and return the same logger without adding another
    one, so log lines are not duplicated when the function runs again.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger(" translate_logger ")
    # DEBUG level so that per-text decisions are recorded as well.
    logger.setLevel(logging.DEBUG)

    # Loggers are process-wide singletons: adding a handler on every call
    # would write each record once per call made so far.
    if any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
        return logger

    os.makedirs(" . \\ data ", exist_ok=True)
    fh = logging.FileHandler(" . \\ data \\ translate_log.log ", encoding=" utf-8 ")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(" %(asctime)s - %(levelname)s - %(message)s ")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


logger = configurar_logger()
2024-07-30 09:19:19 -03:00
def mostrar_idiomas():
    """Print the selection prompt followed by one line per IDIOMAS entry.

    Each line shows the option number and the language name; the locale
    code stored alongside the name is not displayed.
    """
    print(" Selecciona el idioma de destino: ")
    for numero in IDIOMAS:
        nombre, _ = IDIOMAS[numero]
        print(f" {numero} : {nombre} ")
2024-07-30 09:58:19 -03:00
2024-07-30 09:19:19 -03:00
def translate_text(text, source_lang, target_lang):
    """Translate a single text through the chat-completions client.

    Logs the request, sends a system message plus a user message that asks
    for a translation preserving the special fields, logs the stripped reply
    and returns it.

    Args:
        text: text to translate.
        source_lang: source language, interpolated into the prompt.
        target_lang: target language, interpolated into the prompt.

    Returns:
        The content of the first choice with surrounding whitespace removed.
    """
    # NOTE(review): string literals are reproduced exactly as they appear in
    # the source, including the padding spaces inside the quotes — confirm
    # whether that padding is an extraction artifact.
    logger.info(
        f" Solicitando traducción de {source_lang} a {target_lang} para el texto: {text} "
    )

    user_prompt = f" Translate the following text from {source_lang} to {target_lang} while preserving special fields like <> and <#>. This texts are for an HMI industrial machine: {text} "
    conversation = [
        {" role ": " system ", " content ": " You are a translator. "},
        {" role ": " user ", " content ": user_prompt},
    ]
    completion = client.chat.completions.create(
        model=" gpt-3.5-turbo ",
        messages=conversation,
        max_tokens=150,
        temperature=0.3,
    )

    reply = completion.choices[0].message.content
    translated_text = reply.strip()
    logger.info(f" Respuesta recibida: {translated_text} ")
    return translated_text
2024-07-30 12:16:58 -03:00
def read_system_prompt():
    """Return the system prompt stored on disk, or a default prompt.

    Returns:
        The stripped contents of the prompt file, or the default translator
        prompt (after logging a warning) when the file does not exist.
    """
    try:
        # The mode must be exactly "r": open() rejects a padded " r " with
        # ValueError, which bypassed both the read and the fallback below.
        with open(" /data/system_prompt.txt ", "r", encoding=" utf-8 ") as file:
            return file.read().strip()
    except FileNotFoundError:
        logger.warning(" Archivo system_prompt.txt no encontrado. Usando prompt por defecto. ")
        return " You are a translator. "
2024-07-30 16:48:02 -03:00
def translate_batch_openai(texts, source_lang, target_lang):
    """Translate a list of texts with one chat-completions request.

    This name used to be defined twice in a row; the earlier per-text version
    was silently shadowed by the later one and never ran. Only the effective
    (JSON batch) implementation is kept.

    The texts and both languages are serialised to JSON and sent as the user
    message; the reply is parsed as JSON and its texts entry is returned.

    Args:
        texts: list of texts to translate.
        source_lang: source language, forwarded in the JSON payload.
        target_lang: target language, forwarded in the JSON payload.

    Returns:
        The list of translations taken from the reply.

    Raises:
        ValueError: if the number of translations differs from the number of
            texts sent. json.loads raises json.JSONDecodeError (a ValueError
            subclass) when the reply is not valid JSON.
    """
    # NOTE(review): string literals (message keys, model name, JSON keys) are
    # reproduced exactly as they appear in the source, including the padding
    # spaces inside the quotes — confirm whether that is an extraction artifact.
    joined_text = " \n ".join(texts)
    system_prompt = read_system_prompt()
    logger.info(
        f" Solicitando traducción de {source_lang} a {target_lang} para el lote de textos: \n {joined_text} "
    )

    request_payload = json.dumps(
        {" texts ": texts, " source_lang ": source_lang, " target_lang ": target_lang}
    )
    response = client.chat.completions.create(
        model=" gpt-4o-mini ",
        messages=[
            {" role ": " system ", " content ": f" You are a translator. {system_prompt} . "},
            {" role ": " user ", " content ": request_payload},
        ],
        max_tokens=1500,
        temperature=0.3,
    )

    response_payload = json.loads(response.choices[0].message.content.strip())
    translations = response_payload.get(" texts ", [])
    logger.info(f" Respuestas recibidas: \n {translations} ")

    # Callers map translations back to rows by position, so a count mismatch
    # must not pass silently.
    if len(translations) != len(texts):
        raise ValueError(" La cantidad de traducciones recibidas no coincide con la cantidad de textos enviados. ")
    return translations
2024-07-30 13:03:39 -03:00
def translate_batch(texts, source_lang, target_lang):
    """Translate a list of texts with a single ollama.generate call.

    The texts are joined with the separator string into one prompt together
    with the system prompt read from disk; the stripped reply is split on the
    same separator and returned as a list. The number of returned items is
    not checked against the number of input texts.

    Args:
        texts: list of texts to translate.
        source_lang: source language, interpolated into the prompt.
        target_lang: target language, interpolated into the prompt.

    Returns:
        list of str: the pieces of the model reply.
    """
    separator = " \n "
    joined_text = separator.join(texts)
    system_prompt = read_system_prompt()
    logger.info(
        f" Solicitando traducción de {source_lang} a {target_lang} para el lote de textos: \n {joined_text} "
    )

    prompt = f" Translate the following texts from {source_lang} to {target_lang} while preserving special fields like <> and <#>. {system_prompt} : \n \n {joined_text} "
    response = ollama.generate(model=' llama3.1 ', prompt=prompt)

    raw_reply = response[' response ']
    translations = raw_reply.strip().split(separator)
    logger.info(f" Respuestas recibidas: \n {translations} ")
    return translations
2024-07-30 09:58:19 -03:00
2024-07-30 09:19:19 -03:00
def texto_requiere_traduccion(texto):
    """Decide whether a text contains something worth translating.

    A text requires translation when it has at least one word of four or
    more word characters, or when the number of <...> fields differs from
    the number of <#> fields (i.e. not every field is a <#> field).

    The patterns previously contained stray spaces and escaped spaces, so
    the word pattern could not match ordinary words; they are now the plain
    patterns that the debug message below describes.

    Args:
        texto: the text to inspect.

    Returns:
        bool: True when the text should be sent for translation.
    """
    palabras = re.findall(r"\b\w{4,}\b", texto)
    campos_especiales = re.findall(r"<.*?>", texto)
    # Computed once and reused for both the decision and the debug message.
    campos_hash = re.findall(r"<#>", texto)

    tiene_palabras = len(palabras) > 0
    solo_campos_hash = len(campos_especiales) == len(campos_hash)
    requiere_traduccion = tiene_palabras or not solo_campos_hash

    logger.debug(
        f" Decisión de traducción para texto ' {texto} ' : {' Sí ' if requiere_traduccion else ' No '} (palabras > 3 letras: {tiene_palabras} , solo campos especiales: {solo_campos_hash} ) "
    )
    return requiere_traduccion
2024-07-30 09:19:19 -03:00
2024-07-30 12:16:58 -03:00
def main(file_path, target_lang_code, target_lang, traducir_todo, batch_size=10):
    """Translate the source column of an Excel file and save the result.

    Reads the workbook, selects the rows to translate, sends them in batches
    to translate_batch_openai, writes each translation into the
    "<code> Translated" column of the row it came from, and saves a new
    workbook next to the input file.

    Fixes over the previous version:
      * A failed batch no longer shifts later translations onto the wrong
        rows: translations are stored per row index, not by list position.
      * A missing existing-translation column is treated as "all cells
        empty" and no longer raises KeyError when traducir_todo is False.
      * The empty-cell check compares the stripped text against the empty
        string; the old comparison against a single space could never match.

    Args:
        file_path: path of the workbook, passed to pandas.read_excel.
        target_lang_code: name of the column with existing translations; also
            used to build the output column name and passed as the target
            language to translate_batch_openai.
        target_lang: accepted for interface compatibility; not used here.
        traducir_todo: when true, every row whose text requires translation
            is sent; otherwise only rows whose existing translation is empty.
        batch_size: number of texts per request.
    """
    # NOTE(review): column names, the language name and the output file name
    # are reproduced exactly as they appear in the source, including padding
    # spaces inside the quotes — confirm whether that is an extraction artifact.
    df = pd.read_excel(file_path)
    source_col = " it-IT "
    source_translated_col = target_lang_code
    target_col = f" {target_lang_code} Translated "

    # Make sure the destination column exists.
    if target_col not in df.columns:
        df[target_col] = None

    has_existing_col = source_translated_col in df.columns

    texts_to_translate = []
    indices_to_translate = []
    for index, row in df.iterrows():
        processed_text = transformar_texto(str(row[source_col]))

        if not traducir_todo and has_existing_col:
            # Only fill cells that are still empty in the existing column.
            existing = row[source_translated_col]
            already_translated = not pd.isna(existing) and str(existing).strip() != ""
            if already_translated:
                continue

        if texto_requiere_traduccion(processed_text):
            texts_to_translate.append(processed_text)
            indices_to_translate.append(index)

    num_texts = len(texts_to_translate)
    logger.info(f" Número total de textos a traducir: {num_texts} ")
    print(f" Número total de textos a traducir: {num_texts} ")

    # Row index -> translation. Keyed by index so that a skipped batch leaves
    # a gap for its own rows and cannot misalign the rows that follow.
    translated_by_index = {}
    for start_idx in range(0, num_texts, batch_size):
        end_idx = min(start_idx + batch_size, num_texts)
        batch_texts = texts_to_translate[start_idx:end_idx]
        batch_indices = indices_to_translate[start_idx:end_idx]
        logger.info(f" Traduciendo: celdas desde {start_idx} a {end_idx} . ")
        print(f" Traduciendo : celdas desde: {start_idx} a : {end_idx} . ")
        try:
            batch_translations = translate_batch_openai(batch_texts, ' Italian ', target_lang_code)
        except Exception as e:
            # Best effort: report the failed batch and keep going.
            logger.error(f" Error en la traducción de celdas desde {start_idx} a {end_idx} : {e} ")
            print(f" Error en la traducción de celdas desde {start_idx} a {end_idx} : {e} ")
            continue
        translated_by_index.update(zip(batch_indices, batch_translations))

    logger.info(f" Número total de traducciones recibidas: {len(translated_by_index)} ")
    if len(translated_by_index) != len(indices_to_translate):
        logger.warning(
            f" Desajuste entre el número de traducciones ( {len(translated_by_index)} ) y el número de índices ( {len(indices_to_translate)} ) "
        )

    for index in indices_to_translate:
        if index in translated_by_index:
            df.at[index, target_col] = translated_by_index[index]
        else:
            logger.error(f" No hay traducción disponible para el índice {index} ")

    output_path = os.path.join(os.path.dirname(file_path), ' 3_master_export2translate_translated.xlsx ')
    df.to_excel(output_path, index=False)
    logger.info(f" Archivo traducido guardado en: {output_path} ")
    print(f" Archivo traducido guardado en: {output_path} ")
2024-07-30 09:58:19 -03:00
2024-07-30 09:19:19 -03:00
# Script entry point: ask for the target language and translation mode, then run main().
# The guard compares against the exact module name "__main__"; with padding
# spaces inside the literal the comparison was never true and the script did nothing.
if __name__ == "__main__":
    batch_size = 20
    translate_file = " . \\ data \\ 2_master_export2translate.xlsx "

    mostrar_idiomas()
    try:
        seleccion_idioma = int(input(" Introduce el número del idioma de destino: "))
    except ValueError:
        # Non-numeric input is handled like any other invalid selection
        # and no longer ends in a traceback.
        seleccion_idioma = None

    if seleccion_idioma not in IDIOMAS:
        print(" Selección inválida. ")
    else:
        target_lang, target_lang_code = IDIOMAS[seleccion_idioma]
        # The answer is stripped and lower-cased first, so it has to be
        # compared with the bare letter; a padded literal could never match.
        respuesta = input(" ¿Desea traducir todas las celdas (s/n)? ")
        traducir_todo = respuesta.strip().lower() == "s"
        main(translate_file, target_lang_code, target_lang, traducir_todo, batch_size)