fix: [language] crawled items, force gcld3 detection

This commit is contained in:
terrtia 2024-02-05 14:10:19 +01:00
parent 99fedf9855
commit aa56e71631
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 5 additions and 5 deletions

View file

@ -357,9 +357,9 @@ class LanguagesDetector:
languages.append(language) languages.append(language)
return languages return languages
def detect(self, content): def detect(self, content, force_gcld3=False):
# gcld3 # gcld3
if len(content) >= 200 or not self.lt: if len(content) >= 200 or not self.lt or force_gcld3:
language = self.detect_gcld3(content) language = self.detect_gcld3(content)
# libretranslate # libretranslate
else: else:

View file

@ -339,9 +339,9 @@ class Item(AbstractObject):
return {'nb': nb_line, 'max_length': max_length} return {'nb': nb_line, 'max_length': max_length}
# TODO RENAME ME # TODO RENAME ME
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7, force_gcld3=False):
ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len) ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len)
return ld.detect(self.get_content()) return ld.detect(self.get_content(), force_gcld3=force_gcld3)
def get_mimetype(self, content=None): def get_mimetype(self, content=None):
if not content: if not content:

View file

@ -30,7 +30,7 @@ class Languages(AbstractModule):
if obj.type == 'item': if obj.type == 'item':
if obj.is_crawled(): if obj.is_crawled():
domain = Domain(obj.get_domain()) domain = Domain(obj.get_domain())
for lang in obj.get_languages(min_probability=0.8): for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
print(lang) print(lang)
domain.add_language(lang) domain.add_language(lang)