kuow-mastodon-bot/kuow_fetcher.py

209 lines
8 KiB
Python
Raw Permalink Normal View History

2024-01-15 08:47:46 -08:00
import json
from datetime import datetime
2024-02-10 20:22:10 -08:00
from json.decoder import JSONDecodeError
2024-01-15 08:47:46 -08:00
from typing import Optional
2024-01-14 14:48:09 -08:00
import requests
2024-01-14 15:12:33 -08:00
import yaml
2024-01-14 14:48:09 -08:00
from bs4 import BeautifulSoup
2024-02-11 13:45:54 -08:00
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
2024-01-14 14:48:09 -08:00
from mastodon import Mastodon
2024-02-22 06:38:08 -08:00
from sqlalchemy import create_engine, select, or_
2024-01-15 08:47:46 -08:00
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this file inherit from."""
class KuowStory(Base):
    """ORM row describing one KUOW article the bot has seen.

    Rows are stored in the ``kuow_stories`` table and are used to decide
    whether an article still needs to be processed or posted.
    """

    __tablename__ = "kuow_stories"

    # Primary key: the ``data-id`` of the article page's ``pageview_story`` script tag.
    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
    # ``data-value`` of the ``dfp_targeting`` script tag whose ``data-key`` is ``id``.
    dfp_targeting_id: Mapped[str] = mapped_column()
    # ``href`` taken from the homepage story grid (appended to the site base URL when fetched).
    article_link: Mapped[str] = mapped_column()
    # ISO 639-1 code name chosen for the post; unset until the article is posted.
    article_language: Mapped[Optional[str]] = mapped_column()
    # Modification/publication time scraped from the article page on the last run.
    last_updated_time: Mapped[datetime] = mapped_column()
    # ID of the Mastodon status created for this article; unset until posted.
    post_id: Mapped[Optional[str]] = mapped_column()

    def __repr__(self) -> str:
        """Return a ``KuowStory(field=value, ...)`` string listing every column."""
        column_names = (
            "pageview_story_id",
            "dfp_targeting_id",
            "article_link",
            "article_language",
            "last_updated_time",
            "post_id",
        )
        rendered_columns = ", ".join(
            f"{column_name}={getattr(self, column_name)!r}"
            for column_name in column_names
        )
        return f"KuowStory({rendered_columns})"
2024-01-15 08:47:46 -08:00
2024-02-11 13:45:54 -08:00
def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:
    """Return the lingua ``Language`` matching an ISO 639-1 code string.

    The string is upper-cased and looked up as an attribute of
    ``IsoCode639_1``, so an unknown code raises ``AttributeError``.
    """
    member_name = iso_code_639_1_str.upper()
    return Language.from_iso_code_639_1(getattr(IsoCode639_1, member_name))
def detect_article_language(article_description: str) -> str:
    """Return the ISO 639-1 code name of the language of *article_description*.

    Detection is restricted to the module-level ``languages`` list. When the
    detector reports no language (``detect_language_of`` returns ``None``),
    the module-level ``default_language`` is used instead.
    """
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    detected_language = detector.detect_language_of(article_description)
    # Test for the "nothing detected" result explicitly rather than catching
    # AttributeError, which would also hide unrelated attribute bugs.
    if detected_language is None:
        detected_language = default_language
    return detected_language.iso_code_639_1.name
2024-02-14 08:18:08 -08:00
2024-01-15 08:47:46 -08:00
# SQLite database file (relative to the working directory) holding the stories seen so far.
engine = create_engine("sqlite:///kuow.db")
# Emit CREATE TABLE for every model registered on Base.
Base.metadata.create_all(engine)

kuow_base_url = "https://www.kuow.org"
# Homepage story-grid fragment: first page, twelve stories per page.
url = f"{kuow_base_url}/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
2024-01-14 15:12:33 -08:00
# Load the bot configuration; the context manager closes the file handle
# instead of leaving it open for the lifetime of the script.
with open("config.yml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Mastodon API client built from the credentials in the "mastodon" config section.
mastodon_settings = config["mastodon"]
mastodon = Mastodon(
    client_id=mastodon_settings["client_id"],
    client_secret=mastodon_settings["client_secret"],
    access_token=mastodon_settings["access_token"],
    api_base_url=mastodon_settings["api_base_url"],
)
2024-01-14 14:48:09 -08:00
2024-02-11 13:45:54 -08:00
# Candidate languages for detection: one lingua Language per ISO 639-1 code
# listed under "languages" in the config.
languages = list(map(get_language_from_iso_code_639_1_str, config["languages"]))
# Language used when detection does not produce a result.
default_language = get_language_from_iso_code_639_1_str(config["default_language"])
2024-01-14 15:12:33 -08:00
# Fetch the homepage story grid. requests has no default timeout, so set one
# explicitly to keep an unresponsive server from hanging the run forever.
kuow_response = requests.get(url, timeout=30)
soup = BeautifulSoup(kuow_response.content, "html.parser")
# Each story in the grid is looked up through its <span class="txt"> element.
articles = soup.find_all("span", class_="txt")
# Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
articles.reverse()
2024-01-15 08:47:46 -08:00
# Seconds to wait on kuow.org before giving up; requests has no default
# timeout, so an unresponsive server would otherwise block the run forever.
_REQUEST_TIMEOUT_SECONDS = 30


def _get_last_updated_time(article_soup: BeautifulSoup) -> datetime:
    """Return the article's last-updated time scraped from its page.

    Tries the schema.org JSON-LD ``dateModified`` value first, then the
    ``article:published_time`` meta tag, and finally falls back to the
    current time when neither can be read.
    """
    try:
        schema_org_scriptblock = article_soup.find(
            "script", {"type": "application/ld+json"}
        )
        # A missing script tag makes ``.text`` raise AttributeError (handled below).
        schema_org = json.loads(schema_org_scriptblock.text)
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if schema_org["@context"] != "http://schema.org":
            raise ValueError("unexpected schema.org @context")
        return datetime.fromisoformat(schema_org["dateModified"])
    except (AttributeError, KeyError, TypeError, ValueError):
        # JSONDecodeError is a ValueError subclass, so malformed JSON lands here too.
        print(
            "Could not find or load schema.org data for this post, looking up the meta published time"
        )
    try:
        published_meta = article_soup.find(
            "meta", attrs={"property": "article:published_time"}
        )
        # Subscripting a missing tag (None) raises TypeError; a tag without
        # "content" raises KeyError; an unparsable value raises ValueError.
        return datetime.fromisoformat(published_meta["content"])
    except (KeyError, TypeError, ValueError):
        print("Could not find or load the meta published time for this post")
        return datetime.now()


def _dfp_targeting_value(article_soup: BeautifulSoup, data_key: str) -> Optional[str]:
    """Return the ``data-value`` of the ``dfp_targeting`` script tag for *data_key*.

    Returns ``None`` when the tag or its ``data-value`` attribute is missing.
    """
    script_tag = article_soup.find(
        "script", {"class": "dfp_targeting", "data-key": data_key}
    )
    if script_tag is None:
        return None
    return script_tag.get("data-value")


def _build_hashtag_string(raw_tags: list[str], excluded_tags: list[str]) -> str:
    """Return the hashtag suffix (each tag rendered as ``" #Tag"``) for a post.

    Tags are title-cased, stripped of spaces, and have ``&`` replaced by
    ``And``. Empty and duplicate results are dropped, first-seen order is
    kept, and tags whose casefolded form is in *excluded_tags* are skipped.
    """
    hashtags: list[str] = []
    seen_tags: set[str] = set()
    for raw_tag in raw_tags:
        tag = raw_tag.title().replace(" ", "").replace("&", "And")
        # Splitting an empty attribute yields "", which would render as a bare "#".
        if not tag or tag in seen_tags:
            continue
        seen_tags.add(tag)
        if tag.casefold() in excluded_tags:
            print(
                "Tag {} was found in the article, but won't be included in the post".format(
                    tag
                )
            )
        else:
            hashtags.append(tag)
    return "".join(" #{}".format(tag) for tag in hashtags)


with Session(engine) as session:
    # Read once instead of per tag; a missing or empty setting excludes nothing.
    excluded_tags = config.get("exclude_tags") or []

    for article in articles:
        anchor = article.find("a")
        if anchor is None or "href" not in anchor.attrs:
            print("Could not find a link for this story, skipping")
            continue
        article_link = anchor.attrs["href"]
        print("Checking {}".format(article_link))

        article_lookup = requests.get(
            kuow_base_url + article_link, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        article_soup = BeautifulSoup(article_lookup.content, "html.parser")

        last_updated_time = _get_last_updated_time(article_soup)

        # The pageview story ID is the table's primary key, so an article
        # without one cannot be tracked.
        pageview_script = article_soup.find("script", {"class": "pageview_story"})
        pageview_story_id = (
            pageview_script.get("data-id") if pageview_script is not None else None
        )
        if pageview_story_id is None:
            print(
                "Could not find or load a Pageview story ID, skipping additional processing on this post"
            )
            continue

        # Match on either the link or the story ID.
        lookup_statement = select(KuowStory).where(
            or_(
                KuowStory.article_link == article_link,
                KuowStory.pageview_story_id == pageview_story_id,
            )
        )
        try:
            article_record = session.scalars(lookup_statement).one()
        except NoResultFound:
            # Is a new article
            article_record = KuowStory()
            is_new_article = True
            process_article = True
        else:
            is_new_article = False
            # Only process existing articles if the last updated time doesn't match
            # NOTE(review): .astimezone() on a naive stored value assumes the
            # system local timezone - confirm stored times round-trip as intended.
            process_article = (
                article_record.last_updated_time.astimezone() != last_updated_time
            )

        if not process_article:
            continue

        print("Processing {}".format(article_link))

        dfp_targeting_id = _dfp_targeting_value(article_soup, "id")
        if dfp_targeting_id is None:
            # NOTE(review): dfp_targeting_id is declared non-Optional on KuowStory,
            # so a new record without it may be rejected at commit - confirm.
            print("Could not find or load IDs for this post")
        else:
            article_record.dfp_targeting_id = dfp_targeting_id

        # Hashtag candidates come from both the "tags" and "topics" properties.
        tags: list[str] = []
        for data_key in ("tags", "topics"):
            data_value = _dfp_targeting_value(article_soup, data_key)
            if data_value is None:
                print(
                    "Could not find or load any tags from the '{}' property".format(
                        data_key
                    )
                )
            else:
                tags.extend(data_value.split("|"))
        additional_tag_string = _build_hashtag_string(tags, excluded_tags)

        try:
            article_description = (
                article_soup.find("meta", attrs={"property": "description"})
                .attrs["content"]
                .strip()
            )
            if not article_record.post_id:
                print("Posting to Mastodon")
                article_language = detect_article_language(article_description)
                article_record.article_language = article_language
                mastodon_post_result = mastodon.status_post(
                    status=article_description
                    + "\n"
                    + kuow_base_url
                    + article_link
                    + "\n#KUOW #News{}".format(additional_tag_string),
                    visibility="public",
                    language=article_language,
                )
                article_record.post_id = mastodon_post_result["id"]
            else:
                print("Article has already been posted")
        except Exception as error:
            # Deliberately broad: one failing article must not stop the run.
            # Include the error so the cause is not silently discarded.
            print(
                "Could not load a description/post this article: {!r}".format(error)
            )

        article_record.pageview_story_id = pageview_story_id
        article_record.article_link = article_link
        article_record.last_updated_time = last_updated_time

        if is_new_article:
            session.add(article_record)
        # Commit per article so a later failure cannot lose the record of a
        # post that has already been published.
        session.commit()