Initial language detection #16
3 changed files with 31 additions and 1 deletions
|
@ -3,3 +3,7 @@ mastodon:
|
|||
client_secret:
|
||||
access_token:
|
||||
api_base_url:
|
||||
languages:
|
||||
- en
|
||||
- es
|
||||
default_language: en
|
|
@ -6,6 +6,7 @@ from typing import Optional
|
|||
import requests
|
||||
import yaml
|
||||
from bs4 import BeautifulSoup
|
||||
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
|
||||
from mastodon import Mastodon
|
||||
from sqlalchemy import create_engine, select
|
||||
from sqlalchemy.exc import NoResultFound
|
||||
|
@ -21,13 +22,27 @@ class KuowStory(Base):
|
|||
pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
|
||||
dfp_targeting_id: Mapped[str] = mapped_column()
|
||||
article_link: Mapped[str] = mapped_column()
|
||||
article_language: Mapped[Optional[str]] = mapped_column()
|
||||
last_updated_time: Mapped[datetime] = mapped_column()
|
||||
post_id: Mapped[Optional[str]] = mapped_column()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
|
||||
return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, article_language={self.article_language!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
|
||||
|
||||
|
||||
def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:
    """Translate an ISO 639-1 code string into a lingua ``Language``.

    The string is upper-cased and used as an attribute name on
    ``IsoCode639_1``; the resulting member is then handed to
    ``Language.from_iso_code_639_1``.

    Args:
        iso_code_639_1_str: Language code as written in the config,
            in any letter case.

    Returns:
        The ``Language`` corresponding to the given code.

    Raises:
        AttributeError: If ``IsoCode639_1`` has no attribute with the
            upper-cased name.
    """
    member_name = iso_code_639_1_str.upper()
    iso_code = getattr(IsoCode639_1, member_name)
    return Language.from_iso_code_639_1(iso_code)
|
||||
|
||||
|
||||
def detect_article_language(article_description: str) -> str:
    """Detect the language of an article description.

    A detector restricted to the module-level ``languages`` list is built
    and asked for the language of ``article_description``. When the
    detector yields no language, the module-level ``default_language`` is
    used instead.

    Args:
        article_description: Text of the article description to classify.

    Returns:
        The ``name`` of the ISO 639-1 code of the detected language, or of
        ``default_language`` when nothing was detected.
    """
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    language = detector.detect_language_of(article_description)

    # Check for the "nothing detected" result explicitly instead of catching
    # AttributeError: a broad handler would also hide unrelated attribute
    # errors raised inside the detector and silently report the default.
    if language is None:
        language = default_language

    return language.iso_code_639_1.name
|
||||
|
||||
engine = create_engine("sqlite:///kuow.db")
|
||||
Base.metadata.create_all(engine)
|
||||
|
||||
|
@ -44,6 +59,11 @@ mastodon = Mastodon(
|
|||
api_base_url=config["mastodon"]["api_base_url"],
|
||||
)
|
||||
|
||||
languages = [
|
||||
get_language_from_iso_code_639_1_str(language) for language in config["languages"]
|
||||
]
|
||||
default_language = get_language_from_iso_code_639_1_str(config["default_language"])
|
||||
|
||||
kuow_response = requests.get(url)
|
||||
soup = BeautifulSoup(kuow_response.content, "html.parser")
|
||||
articles = soup.find_all("span", class_="txt")
|
||||
|
@ -131,6 +151,10 @@ with Session(engine) as session:
|
|||
)
|
||||
if not article_record.post_id:
|
||||
print("Posting to Mastodon")
|
||||
|
||||
article_language = detect_article_language(article_description)
|
||||
article_record.article_language = article_language
|
||||
|
||||
mastodon_post_result = mastodon.status_post(
|
||||
status=article_description
|
||||
+ "\n"
|
||||
|
@ -138,6 +162,7 @@ with Session(engine) as session:
|
|||
+ article_link
|
||||
+ "\n#KUOW #News{}".format(additional_tag_string),
|
||||
visibility="public",
|
||||
language=article_language,
|
||||
)
|
||||
article_record.post_id = mastodon_post_result["id"]
|
||||
else:
|
||||
|
|
|
@ -5,6 +5,7 @@ charset-normalizer==3.3.2
|
|||
decorator==5.1.1
|
||||
greenlet==3.0.3
|
||||
idna==3.6
|
||||
lingua-language-detector==2.0.2
|
||||
Mastodon.py==1.8.1
|
||||
python-dateutil==2.8.2
|
||||
python-magic==0.4.27
|
||||
|
|
Loading…
Reference in a new issue