Compare commits

...

6 commits

Author SHA1 Message Date
9a52805213 Merge remote-tracking branch 'origin/main' into update-metadata-exceptions
All checks were successful
ci/woodpecker/pr/lint Pipeline was successful
ci/woodpecker/pull_request_closed/lint Pipeline was successful
2024-02-14 07:38:18 -08:00
d9ee0a632e Update metadata exception types 2024-02-14 07:38:07 -08:00
7186957eb0 Update dependency SQLAlchemy to v2.0.27
All checks were successful
ci/woodpecker/pr/lint Pipeline was successful
ci/woodpecker/pull_request_closed/lint Pipeline was successful
ci/woodpecker/push/lint Pipeline was successful
ci/woodpecker/manual/lint Pipeline was successful
2024-02-13 16:00:14 +00:00
7061e433fd Merge pull request 'Initial language detection' (#16) from detect-post-language into main
All checks were successful
ci/woodpecker/push/lint Pipeline was successful
Reviewed-on: #16
2024-02-11 13:58:46 -08:00
9891b8af87 Change lint Docker image to one which Lingua is available for
All checks were successful
ci/woodpecker/pr/lint Pipeline was successful
ci/woodpecker/pull_request_closed/lint Pipeline was successful
2024-02-11 13:56:37 -08:00
d4eb54740f Initial language detection
Some checks failed
ci/woodpecker/pr/lint Pipeline failed
2024-02-11 13:45:54 -08:00
4 changed files with 35 additions and 5 deletions

View file

@ -2,7 +2,7 @@ when:
branch: main branch: main
steps: steps:
- name: lint - name: lint
image: python:3-alpine image: python:3-slim
commands: commands:
- python -m pip install --upgrade pip - python -m pip install --upgrade pip
- python -m pip install -r requirements.txt - python -m pip install -r requirements.txt

View file

@ -3,3 +3,7 @@ mastodon:
client_secret: client_secret:
access_token: access_token:
api_base_url: api_base_url:
languages:
- en
- es
default_language: en

View file

@ -6,6 +6,7 @@ from typing import Optional
import requests import requests
import yaml import yaml
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
from mastodon import Mastodon from mastodon import Mastodon
from sqlalchemy import create_engine, select from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound from sqlalchemy.exc import NoResultFound
@ -21,13 +22,27 @@ class KuowStory(Base):
pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True) pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
dfp_targeting_id: Mapped[str] = mapped_column() dfp_targeting_id: Mapped[str] = mapped_column()
article_link: Mapped[str] = mapped_column() article_link: Mapped[str] = mapped_column()
article_language: Mapped[Optional[str]] = mapped_column()
last_updated_time: Mapped[datetime] = mapped_column() last_updated_time: Mapped[datetime] = mapped_column()
post_id: Mapped[Optional[str]] = mapped_column() post_id: Mapped[Optional[str]] = mapped_column()
def __repr__(self) -> str: def __repr__(self) -> str:
return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})" return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, article_language={self.article_language!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:
    """Convert a two-letter ISO 639-1 code string into a lingua ``Language``.

    The code is upper-cased and looked up by attribute name on
    ``IsoCode639_1``, so ``"en"`` and ``"EN"`` resolve to the same member.

    Args:
        iso_code_639_1_str: ISO 639-1 code in any letter case, e.g. ``"en"``.

    Returns:
        The ``Language`` that ``Language.from_iso_code_639_1`` yields for
        the matching ``IsoCode639_1`` member.

    Raises:
        AttributeError: If ``IsoCode639_1`` has no member with that name.
    """
    member_name = iso_code_639_1_str.upper()
    return Language.from_iso_code_639_1(getattr(IsoCode639_1, member_name))
def detect_article_language(article_description: str) -> str:
    """Detect the language of an article description.

    A detector restricted to the module-level ``languages`` list is built
    and asked for the language of the text. When the detector yields no
    language, the module-level ``default_language`` is used instead.

    Args:
        article_description: Text whose language should be detected.

    Returns:
        The ``name`` of the chosen language's ``iso_code_639_1`` member.
        NOTE(review): member names appear to be upper-case (codes are
        looked up via ``.upper()`` elsewhere in this file) -- confirm that
        consumers of this value accept that casing.
    """
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    detected = detector.detect_language_of(article_description)
    # Test for "nothing detected" explicitly rather than catching the
    # AttributeError from dereferencing None; the broad except could also
    # hide unrelated attribute errors raised during detection.
    if detected is None:
        detected = default_language
    return detected.iso_code_639_1.name
engine = create_engine("sqlite:///kuow.db") engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine) Base.metadata.create_all(engine)
@ -44,6 +59,11 @@ mastodon = Mastodon(
api_base_url=config["mastodon"]["api_base_url"], api_base_url=config["mastodon"]["api_base_url"],
) )
languages = [
get_language_from_iso_code_639_1_str(language) for language in config["languages"]
]
default_language = get_language_from_iso_code_639_1_str(config["default_language"])
kuow_response = requests.get(url) kuow_response = requests.get(url)
soup = BeautifulSoup(kuow_response.content, "html.parser") soup = BeautifulSoup(kuow_response.content, "html.parser")
articles = soup.find_all("span", class_="txt") articles = soup.find_all("span", class_="txt")
@ -107,14 +127,14 @@ with Session(engine) as session:
article_record.dfp_targeting_id = article_soup.find( article_record.dfp_targeting_id = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "id"} "script", {"class": "dfp_targeting", "data-key": "id"}
)["data-value"] )["data-value"]
except NameError: except (NameError, TypeError):
print("Could not find or load IDs for this post") print("Could not find or load IDs for this post")
try: try:
tags = article_soup.find( tags = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "tags"} "script", {"class": "dfp_targeting", "data-key": "tags"}
)["data-value"].split("|") )["data-value"].split("|")
except NameError: except (NameError, TypeError):
print("Could not find or load any tags for this article") print("Could not find or load any tags for this article")
tags = [] tags = []
@ -131,6 +151,10 @@ with Session(engine) as session:
) )
if not article_record.post_id: if not article_record.post_id:
print("Posting to Mastodon") print("Posting to Mastodon")
article_language = detect_article_language(article_description)
article_record.article_language = article_language
mastodon_post_result = mastodon.status_post( mastodon_post_result = mastodon.status_post(
status=article_description status=article_description
+ "\n" + "\n"
@ -138,6 +162,7 @@ with Session(engine) as session:
+ article_link + article_link
+ "\n#KUOW #News{}".format(additional_tag_string), + "\n#KUOW #News{}".format(additional_tag_string),
visibility="public", visibility="public",
language=article_language,
) )
article_record.post_id = mastodon_post_result["id"] article_record.post_id = mastodon_post_result["id"]
else: else:

View file

@ -5,6 +5,7 @@ charset-normalizer==3.3.2
decorator==5.1.1 decorator==5.1.1
greenlet==3.0.3 greenlet==3.0.3
idna==3.6 idna==3.6
lingua-language-detector==2.0.2
Mastodon.py==1.8.1 Mastodon.py==1.8.1
python-dateutil==2.8.2 python-dateutil==2.8.2
python-magic==0.4.27 python-magic==0.4.27
@ -12,6 +13,6 @@ PyYAML==6.0.1
requests==2.31.0 requests==2.31.0
six==1.16.0 six==1.16.0
soupsieve==2.5 soupsieve==2.5
SQLAlchemy==2.0.26 SQLAlchemy==2.0.27
typing_extensions==4.9.0 typing_extensions==4.9.0
urllib3==2.2.0 urllib3==2.2.0