// CtrEditor/Services/LLMService.cs

using GTPCorrgir;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Newtonsoft.Json;

namespace CtrEditor.Services
{
    /// <summary>
    /// Wraps a single <see cref="gtpask"/> processor and exposes OCR-correction
    /// helpers for one text or for a batch of source/target text pairs.
    /// </summary>
    /// <remarks>
    /// Every call writes its input and system prompt onto the one shared
    /// <c>_llmProcessor</c> instance and reads the result back from it, so
    /// overlapping calls on the same <see cref="LLMService"/> would overwrite
    /// each other's state. NOTE(review): confirm callers await one call at a time.
    /// </remarks>
    public class LLMService : IDisposable
    {
        // System prompt for single-text OCR correction (ProcessText).
        private const string OcrSystemPrompt =
            "You are an OCR correction specialist. Analyze and correct any obvious OCR errors." +
            "\nPay special attention to:" +
            "\n- Incorrectly joined words (missing spaces)" +
            "\n- Wrong character recognition (0 vs O, 1 vs I, etc.)" +
            "\n- Extra or missing characters" +
            "\nReturn only the corrected text without explanations.";

        private readonly gtpask _llmProcessor;

        /// <summary>Creates the service together with its own <see cref="gtpask"/> processor.</summary>
        public LLMService()
        {
            _llmProcessor = new gtpask();
        }

        /// <summary>
        /// Runs the OCR-correction prompt over <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Text assigned to the processor's <c>TextoACorregir</c>.</param>
        /// <param name="useMarkdown">
        /// Not read by this method; kept so existing call sites keep compiling.
        /// </param>
        /// <returns>The processor's <c>TextoCorregido</c> after <c>CorregirTexto</c> completes.</returns>
        /// <exception cref="Exception">
        /// Any failure is rethrown wrapped, with the original as <see cref="Exception.InnerException"/>.
        /// </exception>
        public async Task<string> ProcessText(string text, bool useMarkdown = false)
        {
            try
            {
                // The processor is configured through its properties, not through a new instance.
                _llmProcessor.TextoACorregir = text;
                _llmProcessor.TextodeSistema = OcrSystemPrompt;

                await _llmProcessor.CorregirTexto();
                return _llmProcessor.TextoCorregido;
            }
            catch (Exception ex)
            {
                throw new Exception($"Error processing text with LLM: {ex.Message}", ex);
            }
        }

        /// <summary>
        /// Sends all pairs in one request, serialized as a JSON array of
        /// <c>[source, target]</c> arrays, and parses the corrected pairs from the reply.
        /// </summary>
        /// <param name="textPairs">Pairs to correct. An empty list returns an empty list without a request.</param>
        /// <param name="sourceLanguage">Language name interpolated into the prompt for the first element of each pair.</param>
        /// <param name="targetLanguage">Language name interpolated into the prompt for the second element of each pair.</param>
        /// <returns>
        /// The pairs found in the reply, each with trailing <c>'\n'</c> characters removed.
        /// The list is whatever the model returned; its length is not checked against the input.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="textPairs"/> is null.</exception>
        /// <exception cref="Exception">
        /// The request failed, no JSON could be extracted from the reply, or the JSON
        /// is not a list of two-element string arrays. The cause is in <see cref="Exception.InnerException"/>.
        /// </exception>
        public async Task<List<(string Source, string Target)>> ProcessTextBatch(
            List<(string Source, string Target)> textPairs,
            string sourceLanguage = "English",
            string targetLanguage = "English")
        {
            if (textPairs == null)
            {
                throw new ArgumentNullException(nameof(textPairs));
            }

            // Nothing to correct: skip the request entirely.
            if (textPairs.Count == 0)
            {
                return new List<(string Source, string Target)>();
            }

            try
            {
                var textPairsJson = JsonConvert.SerializeObject(
                    textPairs.Select(p => new[] { p.Source, p.Target }).ToList()
                );

                _llmProcessor.TextoACorregir = textPairsJson;
                _llmProcessor.TextodeSistema = $@"You are an OCR correction specialist working with {sourceLanguage} and {targetLanguage} texts.
For each pair, the first text is in {sourceLanguage} and the second text is in {targetLanguage}.

Pay special attention to:
- Language-specific characters and accents for both {sourceLanguage} and {targetLanguage}
- Incorrectly joined words (missing spaces)
- Wrong character recognition (0 vs O, 1 vs I, etc.)
- Extra or missing characters

Return the corrected versions in JSON format as a list of pairs.
Input: [[""source text"", ""target text""]]
Expected output format: ```json[[""corrected source"", ""corrected target""]]```";

                await _llmProcessor.CorregirTexto();

                // The prompt asks for a ```json fenced block; pull the JSON payload out of it.
                string jsonContent = ExtractJsonFromMarkdown(_llmProcessor.TextoCorregido);
                if (string.IsNullOrEmpty(jsonContent))
                {
                    throw new InvalidOperationException("Could not extract JSON content from LLM response");
                }

                var parsedPairs = JsonConvert.DeserializeObject<List<string[]>>(jsonContent);
                if (parsedPairs == null)
                {
                    throw new InvalidOperationException("LLM response did not contain a list of text pairs");
                }

                var correctedPairs = new List<(string Source, string Target)>(parsedPairs.Count);
                foreach (var pair in parsedPairs)
                {
                    // Reject malformed entries explicitly instead of failing with an
                    // index-out-of-range or null-reference error further down.
                    if (pair == null || pair.Length < 2 || pair[0] == null || pair[1] == null)
                    {
                        throw new InvalidOperationException(
                            "LLM response contained an entry that is not a [source, target] pair of strings");
                    }

                    correctedPairs.Add((pair[0].TrimEnd('\n'), pair[1].TrimEnd('\n')));
                }

                return correctedPairs;
            }
            catch (Exception ex)
            {
                throw new Exception($"Error processing text batch with LLM: {ex.Message}", ex);
            }
        }

        /// <summary>
        /// Extracts the JSON payload from a model reply.
        /// </summary>
        /// <param name="markdownText">Raw reply text; may be null.</param>
        /// <returns>
        /// The trimmed content between a <c>```json</c> fence (matched case-insensitively)
        /// and the next <c>```</c>. When no such fence exists, the span from the first
        /// <c>[</c> to the last <c>]</c>, which covers bare JSON and unlabeled fences.
        /// Null when neither form is found or the opening fence is never closed.
        /// </returns>
        private static string ExtractJsonFromMarkdown(string markdownText)
        {
            const string jsonStart = "```json";
            const string codeBlockEnd = "```";

            if (string.IsNullOrWhiteSpace(markdownText))
            {
                return null;
            }

            // Ordinal comparisons: the fence markers are syntax, not linguistic text.
            var startIndex = markdownText.IndexOf(jsonStart, StringComparison.OrdinalIgnoreCase);
            if (startIndex >= 0)
            {
                startIndex += jsonStart.Length;
                var endIndex = markdownText.IndexOf(codeBlockEnd, startIndex, StringComparison.Ordinal);
                if (endIndex == -1)
                {
                    return null;
                }

                return markdownText.Substring(startIndex, endIndex - startIndex).Trim();
            }

            // No ```json fence: fall back to the outermost JSON array in the reply.
            // The caller still validates the result by deserializing it.
            var arrayStart = markdownText.IndexOf('[');
            var arrayEnd = markdownText.LastIndexOf(']');
            if (arrayStart == -1 || arrayEnd <= arrayStart)
            {
                return null;
            }

            return markdownText.Substring(arrayStart, arrayEnd - arrayStart + 1);
        }

        /// <summary>Disposes the owned <see cref="gtpask"/> processor.</summary>
        public void Dispose()
        {
            _llmProcessor?.Dispose();
        }
    }
}