2024-07-30 09:19:19 -03:00
import pandas as pd
2024-07-30 09:58:19 -03:00
from openai import OpenAI
2024-07-30 09:19:19 -03:00
import os
import re
2024-07-30 09:58:19 -03:00
import logging
2024-07-30 09:19:19 -03:00
from openai_api_key import api_key
2024-07-30 12:16:58 -03:00
from 2 _master_export2translate import transformar_texto
2024-07-30 09:19:19 -03:00
2024-07-30 09:58:19 -03:00
client = OpenAI ( api_key = api_key ( ) )
2024-07-30 09:19:19 -03:00
# Diccionario de idiomas
IDIOMAS = {
2024-07-30 11:17:13 -03:00
1 : ( " English " , " en-GB " ) ,
2 : ( " Portuguese " , " pt-PT " ) ,
3 : ( " Spanish " , " es-ES " ) ,
4 : ( " Russian " , " ru-RU " ) ,
5 : ( " French " , " fr-FR " ) ,
6 : ( " German " , " de-DE " ) ,
2024-07-30 09:19:19 -03:00
}
2024-07-30 09:58:19 -03:00
def configurar_logger ( ) :
logger = logging . getLogger ( " translate_logger " )
logger . setLevel ( logging . DEBUG ) # Cambiado a DEBUG para más información
2024-07-30 12:16:58 -03:00
fh = logging . FileHandler ( " /data/translate_log.log " , encoding = " utf-8 " )
2024-07-30 09:58:19 -03:00
fh . setLevel ( logging . DEBUG )
formatter = logging . Formatter ( " %(asctime)s - %(levelname)s - %(message)s " )
fh . setFormatter ( formatter )
logger . addHandler ( fh )
return logger
logger = configurar_logger ( )
2024-07-30 09:19:19 -03:00
def mostrar_idiomas ( ) :
print ( " Selecciona el idioma de destino: " )
for numero , ( nombre , _ ) in IDIOMAS . items ( ) :
print ( f " { numero } : { nombre } " )
2024-07-30 09:58:19 -03:00
2024-07-30 09:19:19 -03:00
def translate_text ( text , source_lang , target_lang ) :
2024-07-30 09:58:19 -03:00
logger . info (
f " Solicitando traducción de { source_lang } a { target_lang } para el texto: { text } "
2024-07-30 09:19:19 -03:00
)
2024-07-30 09:58:19 -03:00
response = client . chat . completions . create (
model = " gpt-3.5-turbo " ,
messages = [
{ " role " : " system " , " content " : f " You are a translator. " } ,
{
" role " : " user " ,
" content " : f " Translate the following text from { source_lang } to { target_lang } while preserving special fields like <> and <#>. This texts are for an HMI industrial machine: { text } " ,
} ,
] ,
max_tokens = 150 ,
temperature = 0.3 ,
)
translated_text = response . choices [ 0 ] . message . content . strip ( )
logger . info ( f " Respuesta recibida: { translated_text } " )
return translated_text
2024-07-30 12:16:58 -03:00
def read_system_prompt ( ) :
try :
with open ( " /data/system_prompt.txt " , " r " , encoding = " utf-8 " ) as file :
return file . read ( ) . strip ( )
except FileNotFoundError :
logger . warning ( " Archivo system_prompt.txt no encontrado. Usando prompt por defecto. " )
return " You are a translator. "
2024-07-30 09:19:19 -03:00
def translate_batch ( texts , source_lang , target_lang ) :
joined_text = " \n " . join ( texts )
2024-07-30 12:16:58 -03:00
system_prompt = read_system_prompt ( )
2024-07-30 09:58:19 -03:00
logger . info (
f " Solicitando traducción de { source_lang } a { target_lang } para el lote de textos: \n { joined_text } "
)
response = client . chat . completions . create (
model = " gpt-3.5-turbo " ,
messages = [
2024-07-30 12:16:58 -03:00
{ " role " : " system " , " content " : system_prompt } ,
2024-07-30 09:58:19 -03:00
{
" role " : " user " ,
2024-07-30 12:16:58 -03:00
" content " : f " Translate the following texts from { source_lang } to { target_lang } while preserving special fields like <> and <#>. This texts are for an HMI industrial machine: \n \n { joined_text } " ,
2024-07-30 09:58:19 -03:00
} ,
] ,
max_tokens = 1500 ,
temperature = 0.3 ,
2024-07-30 09:19:19 -03:00
)
2024-07-30 09:58:19 -03:00
translations = response . choices [ 0 ] . message . content . strip ( ) . split ( " \n " )
logger . info ( f " Respuestas recibidas: \n { translations } " )
2024-07-30 09:19:19 -03:00
return translations
2024-07-30 09:58:19 -03:00
2024-07-30 09:19:19 -03:00
def texto_requiere_traduccion ( texto ) :
2024-07-30 09:58:19 -03:00
palabras = re . findall ( r " \ b \ w { 4,} \ b " , texto )
campos_especiales = re . findall ( r " <.*?> " , texto )
requiere_traduccion = len ( palabras ) > 0 or len ( campos_especiales ) != len (
re . findall ( r " <#> " , texto )
)
logger . debug (
f " Decisión de traducción para texto ' { texto } ' : { ' Sí ' if requiere_traduccion else ' No ' } (palabras > 3 letras: { len ( palabras ) > 0 } , solo campos especiales: { len ( campos_especiales ) == len ( re . findall ( r ' <#> ' , texto ) ) } ) "
)
return requiere_traduccion
2024-07-30 09:19:19 -03:00
2024-07-30 12:16:58 -03:00
def main ( file_path , target_lang_code , target_lang , traducir_todo , batch_size = 10 ) :
2024-07-30 09:19:19 -03:00
df = pd . read_excel ( file_path )
source_col = " it-IT "
2024-07-30 11:17:13 -03:00
source_translated_col = target_lang_code
2024-07-30 09:19:19 -03:00
target_col = f " { target_lang_code } Translated "
2024-07-30 11:17:13 -03:00
# Asegurarse de que la columna de destino existe
if target_col not in df . columns :
2024-07-30 09:19:19 -03:00
df [ target_col ] = None
2024-07-30 09:58:19 -03:00
texts_to_translate = [ ]
indices_to_translate = [ ]
2024-07-30 11:17:13 -03:00
for index , row in df . iterrows ( ) :
source_text = str ( row [ source_col ] )
source_translated_text = str ( row [ source_translated_col ] ) if source_translated_col in df . columns else " "
processed_text = transformar_texto ( source_text )
if traducir_todo :
# Traducir todas las celdas del idioma de destino
2024-07-30 09:58:19 -03:00
if texto_requiere_traduccion ( processed_text ) :
2024-07-30 11:17:13 -03:00
texts_to_translate . append ( processed_text )
2024-07-30 09:58:19 -03:00
indices_to_translate . append ( index )
2024-07-30 11:17:13 -03:00
else :
# Traducir solo las celdas vacías en el idioma de destino original
if pd . isna ( row [ source_translated_col ] ) or source_translated_text . strip ( ) == " " :
if texto_requiere_traduccion ( processed_text ) :
texts_to_translate . append ( processed_text )
indices_to_translate . append ( index )
2024-07-30 09:19:19 -03:00
2024-07-30 09:58:19 -03:00
num_texts = len ( texts_to_translate )
logger . info ( f " Número total de textos a traducir: { num_texts } " )
2024-07-30 12:16:58 -03:00
print ( f " Número total de textos a traducir: { num_texts } " )
2024-07-30 09:58:19 -03:00
translations = [ ]
for start_idx in range ( 0 , num_texts , batch_size ) :
end_idx = min ( start_idx + batch_size , num_texts )
2024-07-30 09:19:19 -03:00
batch_texts = texts_to_translate [ start_idx : end_idx ]
2024-07-30 12:16:58 -03:00
print ( f " Traduciendo : celdas desde: { start_idx } a : { end_idx } . " )
batch_translations = translate_batch ( batch_texts , ' Italian ' , target_lang )
2024-07-30 09:19:19 -03:00
translations . extend ( batch_translations )
2024-07-30 09:58:19 -03:00
logger . info ( f " Número total de traducciones recibidas: { len ( translations ) } " )
if len ( translations ) != len ( indices_to_translate ) :
2024-07-30 11:17:13 -03:00
logger . warning ( f " Desajuste entre el número de traducciones ( { len ( translations ) } ) y el número de índices ( { len ( indices_to_translate ) } ) " )
2024-07-30 09:58:19 -03:00
for i , index in enumerate ( indices_to_translate ) :
if i < len ( translations ) :
df . at [ index , target_col ] = translations [ i ]
else :
logger . error ( f " No hay traducción disponible para el índice { index } " )
2024-07-30 09:19:19 -03:00
2024-07-30 12:16:58 -03:00
output_path = os . path . join ( os . path . dirname ( file_path ) , ' /data/3_master_export2translate_translated.xlsx ' )
2024-07-30 09:19:19 -03:00
df . to_excel ( output_path , index = False )
2024-07-30 09:58:19 -03:00
logger . info ( f " Archivo traducido guardado en: { output_path } " )
2024-07-30 09:19:19 -03:00
print ( f " Archivo traducido guardado en: { output_path } " )
2024-07-30 09:58:19 -03:00
2024-07-30 09:19:19 -03:00
if __name__ == " __main__ " :
2024-07-30 12:16:58 -03:00
batch_size = 20
translate_file = " /data/2_master_export2translate.xlsx "
2024-07-30 09:19:19 -03:00
mostrar_idiomas ( )
seleccion_idioma = int ( input ( " Introduce el número del idioma de destino: " ) )
if seleccion_idioma not in IDIOMAS :
print ( " Selección inválida. " )
else :
2024-07-30 12:16:58 -03:00
target_lang , target_lang_code = IDIOMAS [ seleccion_idioma ]
2024-07-30 09:58:19 -03:00
traducir_todo = (
input ( " ¿Desea traducir todas las celdas (s/n)? " ) . strip ( ) . lower ( ) == " s "
)
2024-07-30 12:16:58 -03:00
main ( translate_file , target_lang_code , target_lang , traducir_todo , batch_size )