CtrEditor/Services/LLMService.cs

106 lines
4.0 KiB
C#

using GTPCorrgir;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Newtonsoft.Json;
namespace CtrEditor.Services
{
/// <summary>
/// Wraps a single <c>gtpask</c> processor and uses it to run OCR-correction prompts,
/// either on one text or on a batch of source/target text pairs.
/// </summary>
/// <remarks>
/// Every call configures the shared <c>gtpask</c> instance through its properties
/// (<c>TextoACorregir</c>, <c>TextodeSistema</c>) and reads the answer back from
/// <c>TextoCorregido</c>. Overlapping calls on the same service instance would
/// therefore overwrite each other's input.
/// NOTE(review): callers presumably do not invoke this concurrently - confirm.
/// </remarks>
public class LLMService : IDisposable
{
    // Markers of the fenced code block the batch prompt asks the model to answer with.
    private const string JsonFenceStart = "```json";
    private const string FenceEnd = "```";

    private readonly gtpask _llmProcessor;

    /// <summary>Creates the service together with the <c>gtpask</c> processor it owns.</summary>
    public LLMService()
    {
        _llmProcessor = new gtpask();
    }

    /// <summary>
    /// Runs the single-text OCR-correction prompt on <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text assigned to the processor as the input to correct.</param>
    /// <param name="useMarkdown">
    /// Currently not used by this method; kept so existing callers keep compiling.
    /// </param>
    /// <returns>The processor's <c>TextoCorregido</c> value after the request completes.</returns>
    /// <exception cref="Exception">
    /// Any failure raised while processing, wrapped with the original as <c>InnerException</c>.
    /// </exception>
    public async Task<string> ProcessText(string text, bool useMarkdown = false)
    {
        try
        {
            _llmProcessor.TextoACorregir = text;

            // The system prompt is set on the shared processor right before the request.
            _llmProcessor.TextodeSistema = "You are an OCR correction specialist. Analyze and correct any obvious OCR errors." +
                "\nPay special attention to:" +
                "\n- Incorrectly joined words (missing spaces)" +
                "\n- Wrong character recognition (0 vs O, 1 vs I, etc.)" +
                "\n- Extra or missing characters" +
                "\nReturn only the corrected text without explanations.";

            await _llmProcessor.CorregirTexto();
            return _llmProcessor.TextoCorregido;
        }
        catch (Exception ex)
        {
            // Wrapper type and message are kept as-is for callers that already handle them.
            throw new Exception($"Error processing text with LLM: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Runs the batch OCR-correction prompt on a list of source/target text pairs.
    /// The pairs are sent as a JSON list of two-element arrays and the answer is
    /// parsed back from the fenced <c>```json</c> block in the response.
    /// </summary>
    /// <param name="textPairs">Pairs to correct. An empty list returns an empty list without a request.</param>
    /// <param name="sourceLanguage">Language name inserted into the prompt for the first text of each pair.</param>
    /// <param name="targetLanguage">Language name inserted into the prompt for the second text of each pair.</param>
    /// <returns>
    /// The pairs parsed from the response, with trailing <c>'\n'</c> characters removed from each text.
    /// The number of returned pairs is whatever the response contained; it is not checked against the input.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="textPairs"/> is null.</exception>
    /// <exception cref="Exception">
    /// Any failure while processing or parsing the response, wrapped with the original as <c>InnerException</c>.
    /// </exception>
    public async Task<List<(string Source, string Target)>> ProcessTextBatch(
        List<(string Source, string Target)> textPairs,
        string sourceLanguage = "English",
        string targetLanguage = "English")
    {
        if (textPairs == null)
        {
            throw new ArgumentNullException(nameof(textPairs));
        }

        // Nothing to correct: skip the request instead of sending an empty JSON list.
        if (textPairs.Count == 0)
        {
            return new List<(string Source, string Target)>();
        }

        try
        {
            var textPairsJson = JsonConvert.SerializeObject(
                textPairs.Select(p => new[] { p.Source, p.Target }).ToList()
            );
            _llmProcessor.TextoACorregir = textPairsJson;

            // The continuation lines of this verbatim string are part of the prompt text,
            // so they must stay exactly as written (no added indentation).
            _llmProcessor.TextodeSistema = $@"You are an OCR correction specialist working with {sourceLanguage} and {targetLanguage} texts.
For each pair, the first text is in {sourceLanguage} and the second text is in {targetLanguage}.
Pay special attention to:
- Language-specific characters and accents for both {sourceLanguage} and {targetLanguage}
- Incorrectly joined words (missing spaces)
- Wrong character recognition (0 vs O, 1 vs I, etc.)
- Extra or missing characters
Return the corrected versions in JSON format as a list of pairs.
Input: [[""source text"", ""target text""]]
Expected output format: ```json[[""corrected source"", ""corrected target""]]```";

            await _llmProcessor.CorregirTexto();

            // The prompt asks for a fenced ```json block; pull out just its content.
            string jsonContent = ExtractJsonFromMarkdown(_llmProcessor.TextoCorregido);
            if (string.IsNullOrEmpty(jsonContent))
            {
                throw new InvalidOperationException("Could not extract JSON content from LLM response");
            }

            var result = JsonConvert.DeserializeObject<List<string[]>>(jsonContent);
            if (result == null)
            {
                // e.g. the fenced block contained the JSON literal null.
                throw new InvalidOperationException("LLM response JSON did not contain a list of pairs");
            }

            return result.Select(pair => ToCorrectedPair(pair)).ToList();
        }
        catch (Exception ex)
        {
            // Wrapper type and message are kept as-is for callers that already handle them.
            throw new Exception($"Error processing text batch with LLM: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Converts one parsed JSON array into a (source, target) tuple, stripping trailing
    /// <c>'\n'</c> characters. A null text is passed through as null.
    /// </summary>
    /// <exception cref="InvalidOperationException">The entry is null or has fewer than two elements.</exception>
    private static (string Source, string Target) ToCorrectedPair(string[] pair)
    {
        if (pair == null || pair.Length < 2)
        {
            throw new InvalidOperationException("LLM response contained an entry that is not a [source, target] pair");
        }

        return (pair[0]?.TrimEnd('\n'), pair[1]?.TrimEnd('\n'));
    }

    /// <summary>
    /// Returns the trimmed text between the first <c>```json</c> marker and the next
    /// <c>```</c> marker, or null when the input is null/empty or either marker is missing.
    /// </summary>
    private static string ExtractJsonFromMarkdown(string markdownText)
    {
        if (string.IsNullOrEmpty(markdownText))
        {
            return null;
        }

        // Ordinal search: the fence markers are literal syntax, not linguistic text.
        var startIndex = markdownText.IndexOf(JsonFenceStart, StringComparison.Ordinal);
        if (startIndex == -1)
        {
            return null;
        }

        startIndex += JsonFenceStart.Length;
        var endIndex = markdownText.IndexOf(FenceEnd, startIndex, StringComparison.Ordinal);
        if (endIndex == -1)
        {
            return null;
        }

        return markdownText.Substring(startIndex, endIndex - startIndex).Trim();
    }

    /// <summary>Disposes the owned <c>gtpask</c> processor.</summary>
    public void Dispose()
    {
        _llmProcessor?.Dispose();
    }
}
}