fix: [cld3] enable cld3

This commit is contained in:
Terrtia 2022-06-21 16:15:18 +02:00
parent 4d39b2c813
commit a32928643b
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 28 additions and 28 deletions

View file

@@ -273,20 +273,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
all_languages = [] all_languages = []
# ## CLEAN CONTENT ## ## CLEAN CONTENT ##
# content = get_item_content_html2text(item_id, ignore_links=True) content = get_item_content_html2text(item_id, ignore_links=True)
# content = remove_all_urls_from_content(item_id, item_content=content) content = remove_all_urls_from_content(item_id, item_content=content)
#
# # REMOVE USELESS SPACE # REMOVE USELESS SPACE
# content = ' '.join(content.split()) content = ' '.join(content.split())
# #- CLEAN CONTENT -# #- CLEAN CONTENT -#
#
# #print(content) #print(content)
# #print(len(content)) #print(len(content))
# if len(content) >= min_len: if len(content) >= min_len:
# for lang in cld3.get_frequent_languages(content, num_langs=num_langs): for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
# if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable: if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
# all_languages.append(lang) all_languages.append(lang)
return all_languages return all_languages
# API # API

View file

@@ -148,20 +148,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
all_languages = [] all_languages = []
# ## CLEAN CONTENT ## ## CLEAN CONTENT ##
# content = get_item_content_html2text(item_id, ignore_links=True) content = get_item_content_html2text(item_id, ignore_links=True)
# content = remove_all_urls_from_content(item_id, item_content=content) content = remove_all_urls_from_content(item_id, item_content=content)
#
# # REMOVE USELESS SPACE # REMOVE USELESS SPACE
# content = ' '.join(content.split()) content = ' '.join(content.split())
# #- CLEAN CONTENT -# #- CLEAN CONTENT -#
#
# #print(content) #print(content)
# #print(len(content)) #print(len(content))
# if len(content) >= min_len: if len(content) >= min_len:
# for lang in cld3.get_frequent_languages(content, num_langs=num_langs): for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
# if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable: if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
# all_languages.append(lang) all_languages.append(lang)
return all_languages return all_languages
# API # API