106 lines
4.0 KiB
C#
106 lines
4.0 KiB
C#
using GTPCorrgir;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json;
|
|
|
|
namespace CtrEditor.Services
|
|
{
|
|
/// <summary>
/// Sends OCR-correction requests to the LLM through a single <c>gtpask</c> instance.
/// </summary>
/// <remarks>
/// Each request writes its input text and system prompt onto the shared
/// <c>gtpask</c> instance before awaiting the correction, so overlapping calls on
/// the same <see cref="LLMService"/> would overwrite each other's state.
/// NOTE(review): callers are presumably expected to await one call at a time — confirm.
/// </remarks>
public class LLMService : IDisposable
{
    // Fence markers used to locate the JSON payload in the LLM's markdown answer.
    private const string JsonFenceStart = "```json";
    private const string FenceEnd = "```";

    // System prompt for single-text OCR correction.
    private const string OcrCorrectionPrompt =
        "You are an OCR correction specialist. Analyze and correct any obvious OCR errors." +
        "\nPay special attention to:" +
        "\n- Incorrectly joined words (missing spaces)" +
        "\n- Wrong character recognition (0 vs O, 1 vs I, etc.)" +
        "\n- Extra or missing characters" +
        "\nReturn only the corrected text without explanations.";

    private readonly gtpask _llmProcessor;

    /// <summary>
    /// Creates the service together with the <c>gtpask</c> instance it owns.
    /// </summary>
    public LLMService()
    {
        _llmProcessor = new gtpask();
    }

    /// <summary>
    /// Asks the LLM to correct OCR errors in a single piece of text.
    /// </summary>
    /// <param name="text">Text to correct; handed to the processor unchanged.</param>
    /// <param name="useMarkdown">Currently not read by this method; kept so existing callers compile.</param>
    /// <returns>The processor's <c>TextoCorregido</c> value after the correction call completes.</returns>
    /// <exception cref="Exception">Wraps any failure raised while talking to the processor.</exception>
    public async Task<string> ProcessText(string text, bool useMarkdown = false)
    {
        try
        {
            // The processor is configured through its properties and then invoked.
            _llmProcessor.TextoACorregir = text;
            _llmProcessor.TextodeSistema = OcrCorrectionPrompt;

            await _llmProcessor.CorregirTexto();
            return _llmProcessor.TextoCorregido;
        }
        catch (Exception ex)
        {
            throw new Exception($"Error processing text with LLM: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Asks the LLM to correct OCR errors in a batch of source/target text pairs.
    /// </summary>
    /// <param name="textPairs">Pairs to correct; serialized to a JSON list of two-element arrays.</param>
    /// <param name="sourceLanguage">Language name inserted into the prompt for the first text of each pair.</param>
    /// <param name="targetLanguage">Language name inserted into the prompt for the second text of each pair.</param>
    /// <returns>
    /// The pairs parsed from the LLM's JSON answer, with trailing newlines removed from
    /// each text. An empty input list yields an empty list without contacting the LLM.
    /// </returns>
    /// <exception cref="Exception">
    /// Wraps any failure: a null <paramref name="textPairs"/>, a processor error, a response
    /// without a <c>```json</c> block, or JSON that is not a list of two-string pairs.
    /// </exception>
    public async Task<List<(string Source, string Target)>> ProcessTextBatch(
        List<(string Source, string Target)> textPairs,
        string sourceLanguage = "English",
        string targetLanguage = "English")
    {
        try
        {
            if (textPairs == null)
            {
                throw new ArgumentNullException(nameof(textPairs));
            }

            // Nothing to correct, so skip the LLM round trip entirely.
            if (textPairs.Count == 0)
            {
                return new List<(string Source, string Target)>();
            }

            var textPairsJson = JsonConvert.SerializeObject(
                textPairs.Select(p => new[] { p.Source, p.Target }).ToList()
            );

            _llmProcessor.TextoACorregir = textPairsJson;
            _llmProcessor.TextodeSistema = $@"You are an OCR correction specialist working with {sourceLanguage} and {targetLanguage} texts.
For each pair, the first text is in {sourceLanguage} and the second text is in {targetLanguage}.

Pay special attention to:
- Language-specific characters and accents for both {sourceLanguage} and {targetLanguage}
- Incorrectly joined words (missing spaces)
- Wrong character recognition (0 vs O, 1 vs I, etc.)
- Extra or missing characters

Return the corrected versions in JSON format as a list of pairs.
Input: [[""source text"", ""target text""]]
Expected output format: ```json[[""corrected source"", ""corrected target""]]```";

            await _llmProcessor.CorregirTexto();

            // The prompt asks for the answer inside a ```json fence; pull out just the JSON.
            string jsonContent = ExtractJsonFromMarkdown(_llmProcessor.TextoCorregido);
            if (string.IsNullOrEmpty(jsonContent))
            {
                throw new InvalidOperationException("Could not extract JSON content from LLM response");
            }

            var parsedPairs = JsonConvert.DeserializeObject<List<string[]>>(jsonContent);
            if (parsedPairs == null)
            {
                // A literal JSON null deserializes to a null list.
                throw new InvalidOperationException("LLM response did not contain a JSON list of pairs");
            }

            var corrected = new List<(string Source, string Target)>(parsedPairs.Count);
            foreach (var pair in parsedPairs)
            {
                // Reject malformed entries explicitly instead of failing with an
                // index or null-reference error that hides the real cause.
                if (pair == null || pair.Length < 2 || pair[0] == null || pair[1] == null)
                {
                    throw new InvalidOperationException(
                        "LLM response contained an entry that is not a [source, target] pair of strings");
                }

                corrected.Add((pair[0].TrimEnd('\n'), pair[1].TrimEnd('\n')));
            }

            return corrected;
        }
        catch (Exception ex)
        {
            throw new Exception($"Error processing text batch with LLM: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Returns the trimmed text between the first <c>```json</c> marker and the next
    /// <c>```</c> marker, or <c>null</c> when the input is null/empty or either marker is missing.
    /// </summary>
    /// <param name="markdownText">Raw LLM response that may contain a fenced JSON block.</param>
    private static string ExtractJsonFromMarkdown(string markdownText)
    {
        if (string.IsNullOrEmpty(markdownText))
        {
            return null;
        }

        // Ordinal search: the fence markers are exact ASCII tokens, not linguistic text.
        var startIndex = markdownText.IndexOf(JsonFenceStart, StringComparison.Ordinal);
        if (startIndex == -1)
        {
            return null;
        }

        startIndex += JsonFenceStart.Length;
        var endIndex = markdownText.IndexOf(FenceEnd, startIndex, StringComparison.Ordinal);
        if (endIndex == -1)
        {
            return null;
        }

        return markdownText.Substring(startIndex, endIndex - startIndex).Trim();
    }

    /// <summary>
    /// Disposes the owned <c>gtpask</c> instance.
    /// </summary>
    public void Dispose()
    {
        _llmProcessor?.Dispose();
    }
}
|
|
}
|