From d4eb54740fec77b1fc906bffd9f10c60b7791059 Mon Sep 17 00:00:00 2001 From: Liam Steckler Date: Sun, 11 Feb 2024 13:45:54 -0800 Subject: [PATCH] Initial language detection --- config-sample.yml | 4 ++++ kuow_fetcher.py | 27 ++++++++++++++++++++++++++- requirements.txt | 1 + 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/config-sample.yml b/config-sample.yml index 61137b9..4fd18a4 100644 --- a/config-sample.yml +++ b/config-sample.yml @@ -3,3 +3,7 @@ mastodon: client_secret: access_token: api_base_url: +languages: + - en + - es +default_language: en \ No newline at end of file diff --git a/kuow_fetcher.py b/kuow_fetcher.py index 98590cd..12fef0a 100644 --- a/kuow_fetcher.py +++ b/kuow_fetcher.py @@ -6,6 +6,7 @@ from typing import Optional import requests import yaml from bs4 import BeautifulSoup +from lingua import IsoCode639_1, Language, LanguageDetectorBuilder from mastodon import Mastodon from sqlalchemy import create_engine, select from sqlalchemy.exc import NoResultFound @@ -21,13 +22,27 @@ class KuowStory(Base): pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True) dfp_targeting_id: Mapped[str] = mapped_column() article_link: Mapped[str] = mapped_column() + article_language: Mapped[Optional[str]] = mapped_column() last_updated_time: Mapped[datetime] = mapped_column() post_id: Mapped[Optional[str]] = mapped_column() def __repr__(self) -> str: - return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})" + return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, article_language={self.article_language!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})" +def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language: + iso_code_639_1 = 
getattr(IsoCode639_1, iso_code_639_1_str.upper()) + return Language.from_iso_code_639_1(iso_code_639_1) + + +def detect_article_language(article_description: str) -> str: + """Return the lowercase ISO 639-1 code detected for the text, or the configured default when detection yields None.""" + detector = LanguageDetectorBuilder.from_languages(*languages).build() + language = detector.detect_language_of(article_description) + if language is None: + language = default_language + return language.iso_code_639_1.name.lower() + engine = create_engine("sqlite:///kuow.db") Base.metadata.create_all(engine) @@ -44,6 +59,11 @@ mastodon = Mastodon( api_base_url=config["mastodon"]["api_base_url"], ) +languages = [ + get_language_from_iso_code_639_1_str(language) for language in config["languages"] +] +default_language = get_language_from_iso_code_639_1_str(config["default_language"]) + kuow_response = requests.get(url) soup = BeautifulSoup(kuow_response.content, "html.parser") articles = soup.find_all("span", class_="txt") @@ -131,6 +151,10 @@ with Session(engine) as session: ) if not article_record.post_id: print("Posting to Mastodon") + + article_language = detect_article_language(article_description) + article_record.article_language = article_language + mastodon_post_result = mastodon.status_post( status=article_description + "\n" @@ -138,6 +162,7 @@ with Session(engine) as session: + article_link + "\n#KUOW #News{}".format(additional_tag_string), visibility="public", + language=article_language, ) article_record.post_id = mastodon_post_result["id"] else: diff --git a/requirements.txt b/requirements.txt index 8682f18..1ac479e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ charset-normalizer==3.3.2 decorator==5.1.1 greenlet==3.0.3 idna==3.6 +lingua-language-detector==2.0.2 Mastodon.py==1.8.1 python-dateutil==2.8.2 python-magic==0.4.27