52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
|
# services/language/langid_service.py
|
||
|
"""
|
||
|
Language detection service using langid
|
||
|
"""
|
||
|
import logging
from typing import List, Tuple, Optional, Set

import langid

from .base import LanguageDetectionService
|
||
|
|
||
|
class LangIdService(LanguageDetectionService):
    """
    Language detection service backed by the ``langid`` library.

    Each instance owns a private ``LanguageIdentifier`` so that restricting
    the candidate languages of one service does not affect other instances
    or other code using the module-level ``langid`` functions.
    """

    # Inputs shorter than this (after stripping whitespace) are reported as
    # "unknown" without being classified.
    MIN_TEXT_LENGTH = 3

    _log = logging.getLogger(__name__)

    def __init__(self, allowed_languages: Optional[Set[str]] = None):
        """
        Initialize langid service

        Args:
            allowed_languages: Set of allowed language codes (e.g., {'en', 'es', 'fr'})
                If None (or empty), all languages supported by langid will be allowed

        Raises:
            ValueError: Propagated from langid if a language code is not known
                to its model.
        """
        # Build a private identifier instead of calling langid.set_languages():
        # that function reconfigures a process-wide identifier, so a restriction
        # set here would leak into every other service instance and would never
        # be reset for a later instance created with allowed_languages=None.
        self._identifier = langid.langid.LanguageIdentifier.from_modelstring(
            langid.langid.model
        )
        if allowed_languages:
            self._identifier.set_languages(list(allowed_languages))
        self.allowed_languages = allowed_languages

    def detect_language(self, text: str) -> Tuple[str, float]:
        """
        Detect language of a text using langid

        Args:
            text: Text to analyze

        Returns:
            Tuple of (language_code, score). ("unknown", 0.0) is returned when
            the text is empty, shorter than MIN_TEXT_LENGTH after stripping,
            or when classification fails for any reason.
            NOTE(review): the score is passed through from langid unchanged;
            with langid's default (non-normalized) settings it is presumably a
            log-probability rather than a 0-1 confidence - confirm before
            thresholding on it.
        """
        try:
            stripped = text.strip() if text else ""
            if len(stripped) < self.MIN_TEXT_LENGTH:
                return ("unknown", 0.0)

            # Fall back to the module-level classifier for instances that were
            # created without running __init__ (e.g. a subclass that does not
            # call it), so they keep working as before.
            identifier = getattr(self, "_identifier", None)
            classify = identifier.classify if identifier is not None else langid.classify

            lang, score = classify(stripped)
            return (lang, float(score))
        except Exception as exc:
            # Deliberate best-effort boundary: detection must never raise to
            # the caller, but the failure is logged with its traceback.
            self._log.exception("Error in language detection: %s", exc)
            return ("unknown", 0.0)

    def detect_batch(self, texts: List[str]) -> List[Tuple[str, float]]:
        """
        Detect language of multiple texts

        Args:
            texts: List of texts to analyze

        Returns:
            List of tuples (language_code, score), one per input text and in
            the same order; see detect_language for the per-item contract.
        """
        return [self.detect_language(text) for text in texts]
|