From a32928643b1ece2f03f277b2e4de826c3fc25a11 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 21 Jun 2022 16:15:18 +0200
Subject: [PATCH] fix: [cld3] enable cld3

---
 bin/lib/objects/Items.py | 28 ++++++++++++++--------------
 bin/packages/Item.py     | 28 ++++++++++++++--------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/bin/lib/objects/Items.py b/bin/lib/objects/Items.py
index e34315d7..d31a8315 100755
--- a/bin/lib/objects/Items.py
+++ b/bin/lib/objects/Items.py
@@ -273,20 +273,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
 
 def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
     all_languages = []
-    # ## CLEAN CONTENT ##
-    # content = get_item_content_html2text(item_id, ignore_links=True)
-    # content = remove_all_urls_from_content(item_id, item_content=content)
-    #
-    # # REMOVE USELESS SPACE
-    # content = ' '.join(content.split())
-    # #- CLEAN CONTENT -#
-    #
-    # #print(content)
-    # #print(len(content))
-    # if len(content) >= min_len:
-    #     for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
-    #         if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
-    #             all_languages.append(lang)
+    ## CLEAN CONTENT ##
+    content = get_item_content_html2text(item_id, ignore_links=True)
+    content = remove_all_urls_from_content(item_id, item_content=content)
+
+    # REMOVE USELESS SPACE
+    content = ' '.join(content.split())
+    #- CLEAN CONTENT -#
+
+    #print(content)
+    #print(len(content))
+    if len(content) >= min_len:
+        for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
+            if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
+                all_languages.append(lang)
     return all_languages
 
 # API
diff --git a/bin/packages/Item.py b/bin/packages/Item.py
index 40765a39..ab634073 100755
--- a/bin/packages/Item.py
+++ b/bin/packages/Item.py
@@ -148,20 +148,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
 
 def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
     all_languages = []
-    # ## CLEAN CONTENT ##
-    # content = get_item_content_html2text(item_id, ignore_links=True)
-    # content = remove_all_urls_from_content(item_id, item_content=content)
-    #
-    # # REMOVE USELESS SPACE
-    # content = ' '.join(content.split())
-    # #- CLEAN CONTENT -#
-    #
-    # #print(content)
-    # #print(len(content))
-    # if len(content) >= min_len:
-    #     for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
-    #         if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
-    #             all_languages.append(lang)
+    ## CLEAN CONTENT ##
+    content = get_item_content_html2text(item_id, ignore_links=True)
+    content = remove_all_urls_from_content(item_id, item_content=content)
+
+    # REMOVE USELESS SPACE
+    content = ' '.join(content.split())
+    #- CLEAN CONTENT -#
+
+    #print(content)
+    #print(len(content))
+    if len(content) >= min_len:
+        for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
+            if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
+                all_languages.append(lang)
     return all_languages
 
 # API