kuow-mastodon-bot/kuow_fetcher.py

import json
from datetime import datetime
from json.decoder import JSONDecodeError
from typing import Optional

import requests
import yaml
from bs4 import BeautifulSoup
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
from mastodon import Mastodon
from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass
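

# One row per KUOW story: tracks when the article last changed and, once
# posted, the resulting Mastodon status ID.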
class KuowStory(Base):
    __tablename__ = "kuow_stories"

    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
    dfp_targeting_id: Mapped[str] = mapped_column()
    article_link: Mapped[str] = mapped_column()
    article_language: Mapped[Optional[str]] = mapped_column()
    last_updated_time: Mapped[datetime] = mapped_column()
    post_id: Mapped[Optional[str]] = mapped_column()

    def __repr__(self) -> str:
        return (
            f"KuowStory(pageview_story_id={self.pageview_story_id!r}, "
            f"dfp_targeting_id={self.dfp_targeting_id!r}, "
            f"article_link={self.article_link!r}, "
            f"article_language={self.article_language!r}, "
            f"last_updated_time={self.last_updated_time!r}, "
            f"post_id={self.post_id!r})"
        )


def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:
    iso_code_639_1 = getattr(IsoCode639_1, iso_code_639_1_str.upper())
    return Language.from_iso_code_639_1(iso_code_639_1)


def detect_article_language(article_description: str) -> str:
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    try:
        language = detector.detect_language_of(article_description)
        return language.iso_code_639_1.name
    except AttributeError:
        # detect_language_of returns None when no language can be identified,
        # so fall back to the configured default language
        return default_language.iso_code_639_1.name
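

# Set up the SQLite database, creating the table on first run if it is absent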
engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine)

kuow_base_url = "https://www.kuow.org"
url = (
    kuow_base_url
    + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
)

with open("config.yml") as config_file:
    config = yaml.safe_load(config_file)

mastodon = Mastodon(
    client_id=config["mastodon"]["client_id"],
    client_secret=config["mastodon"]["client_secret"],
    access_token=config["mastodon"]["access_token"],
    api_base_url=config["mastodon"]["api_base_url"],
)

# Candidate languages for detection, plus the fallback, both taken from config
languages = [
    get_language_from_iso_code_639_1_str(language) for language in config["languages"]
]
default_language = get_language_from_iso_code_639_1_str(config["default_language"])
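
# Fetch the homepage story grid and collect the linked articles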
kuow_response = requests.get(url)
soup = BeautifulSoup(kuow_response.content, "html.parser")
articles = soup.find_all("span", class_="txt")
# Reverse the list so that when multiple new articles are found, they are
# posted in the order they were published
articles.reverse()

with Session(engine) as session:
    for article in articles:
        article_link = article.find("a").attrs["href"]
        print("Checking {}".format(article_link))
        lookup_statement = select(KuowStory).where(
            KuowStory.article_link == article_link
        )
        lookup_result = session.scalars(lookup_statement)
        is_new_article = False
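
        # Fetch the article page itself; the timestamps, story IDs, and tags
        # are all scraped from its markup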
        article_lookup = requests.get(kuow_base_url + article_link)
        article_soup = BeautifulSoup(article_lookup.content, "html.parser")
        try:
            schema_org_scriptblock = article_soup.find(
                "script", {"type": "application/ld+json"}
            )
            schema_org = json.loads(schema_org_scriptblock.text)
            assert schema_org["@context"] == "http://schema.org"
            last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
        except (AssertionError, AttributeError, JSONDecodeError, KeyError):
            # AttributeError: no script block was found; KeyError: the JSON
            # lacked the expected fields
            print(
                "Could not find or load schema.org data for this post, looking up the meta published time"
            )
            try:
                last_updated_time = datetime.fromisoformat(
                    article_soup.find(
                        "meta", attrs={"property": "article:published_time"}
                    )["content"]
                )
            except (TypeError, ValueError):
                # TypeError: the meta tag is missing; ValueError: the
                # timestamp could not be parsed
                print("Could not find or load the meta published time for this post")
                last_updated_time = datetime.now()

        try:
            article_record = lookup_result.one()
            # Only process existing articles if the last updated time doesn't match
            process_article = (
                article_record.last_updated_time.astimezone() != last_updated_time
            )
        except NoResultFound:
            # Is a new article, or at least one that doesn't match based on the link
            article_record = KuowStory()
            process_article = True
            is_new_article = True

        if process_article:
            print("Processing {}".format(article_link))
            try:
                article_record.pageview_story_id = article_soup.find(
                    "script", {"class": "pageview_story"}
                )["data-id"]
                article_record.dfp_targeting_id = article_soup.find(
                    "script", {"class": "dfp_targeting", "data-key": "id"}
                )["data-value"]
            except TypeError:
                # find() returns None when the script block is missing, so
                # subscripting it raises TypeError
                print("Could not find or load IDs for this post")
            tags: list[str] = []
            try:
                tags.extend(
                    article_soup.find(
                        "script", {"class": "dfp_targeting", "data-key": "tags"}
                    )["data-value"].split("|")
                )
            except TypeError:
                print("Could not find or load any tags from the 'tags' property")
            try:
                tags.extend(
                    article_soup.find(
                        "script", {"class": "dfp_targeting", "data-key": "topics"}
                    )["data-value"].split("|")
                )
            except TypeError:
                print("Could not find or load any tags from the 'topics' property")
            # Remove duplicates
            tags = list(set(tags))

            additional_tag_string = ""
            for tag in tags:
                # CamelCase each tag so it reads as a hashtag
                tag = tag.title().replace(" ", "").replace("&", "And")
                if tag.casefold() in config["exclude_tags"]:
                    print(
                        "Tag {} was found in the article, but won't be included in the post".format(
                            tag
                        )
                    )
                else:
                    additional_tag_string += " #{}".format(tag)
            try:
                article_description = (
                    (article_soup.find("meta", attrs={"property": "description"}))
                    .attrs["content"]
                    .strip()
                )
                if not article_record.post_id:
                    print("Posting to Mastodon")
                    article_language = detect_article_language(article_description)
                    article_record.article_language = article_language
                    mastodon_post_result = mastodon.status_post(
                        status=article_description
                        + "\n"
                        + kuow_base_url
                        + article_link
                        + "\n#KUOW #News{}".format(additional_tag_string),
                        visibility="public",
                        language=article_language,
                    )
                    article_record.post_id = mastodon_post_result["id"]
                else:
                    print("Article has already been posted")
            except Exception:
                print("Could not load a description/post this article")

            article_record.article_link = article_link
            article_record.last_updated_time = last_updated_time
            if is_new_article:
                session.add(article_record)
            session.commit()