kuow-mastodon-bot/kuow_fetcher.py

153 lines
5.9 KiB
Python

import json
from datetime import datetime
from json.decoder import JSONDecodeError
from typing import Optional
import requests
import yaml
from bs4 import BeautifulSoup
from mastodon import Mastodon
from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
class Base(DeclarativeBase):
pass
class KuowStory(Base):
__tablename__ = "kuow_stories"
pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
dfp_targeting_id: Mapped[str] = mapped_column()
article_link: Mapped[str] = mapped_column()
last_updated_time: Mapped[datetime] = mapped_column()
post_id: Mapped[Optional[str]] = mapped_column()
def __repr__(self) -> str:
return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine)
kuow_base_url = "https://www.kuow.org"
url = (
kuow_base_url
+ "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
)
config = yaml.safe_load(open("config.yml"))
mastodon = Mastodon(
client_id=config["mastodon"]["client_id"],
client_secret=config["mastodon"]["client_secret"],
access_token=config["mastodon"]["access_token"],
api_base_url=config["mastodon"]["api_base_url"],
)
kuow_response = requests.get(url)
soup = BeautifulSoup(kuow_response.content, "html.parser")
articles = soup.find_all("span", class_="txt")
# Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
articles.reverse()
with Session(engine) as session:
for article in articles:
article_link = article.find("a").attrs["href"]
print("Checking {}".format(article_link))
lookup_statement = select(KuowStory).where(
KuowStory.article_link == article_link
)
lookup_result = session.scalars(lookup_statement)
is_new_article = False
article_lookup = requests.get(kuow_base_url + article_link)
article_soup = BeautifulSoup(article_lookup.content, "html.parser")
try:
schema_org_scriptblock = article_soup.find(
"script", {"type": "application/ld+json"}
)
schema_org = json.loads(schema_org_scriptblock.text)
assert schema_org["@context"] == "http://schema.org"
last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
except (AssertionError, JSONDecodeError):
print(
"Could not find or load schema.org data for this post, looking up the meta published time"
)
try:
last_updated_time = datetime.fromisoformat(
article_soup.find(
"meta", attrs={"property": "article:published_time"}
)["content"]
)
except NameError:
print("Could not find or load the meta published time for this post")
last_updated_time = datetime.now()
try:
article_record = lookup_result.one()
# Only process existing articles if the last updated time doesn't match
process_article = (
article_record.last_updated_time.astimezone() != last_updated_time
)
except NoResultFound:
# Is a new article, or at least one that doesn't match based on the link
article_record = KuowStory()
process_article = True
is_new_article = True
if process_article:
print("Processing {}".format(article_link))
try:
article_record.pageview_story_id = article_soup.find(
"script", {"class": "pageview_story"}
)["data-id"]
article_record.dfp_targeting_id = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "id"}
)["data-value"]
except (NameError, TypeError):
print("Could not find or load IDs for this post")
try:
tags = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "tags"}
)["data-value"].split("|")
except (NameError, TypeError):
print("Could not find or load any tags for this article")
tags = []
additional_tag_string = ""
for tag in tags:
# TODO: Do a check for tags in the config file that we don't want to tag posts with
additional_tag_string += " #{}".format(tag.title().replace(" ", ""))
try:
article_description = (
(article_soup.find("meta", attrs={"property": "description"}))
.attrs["content"]
.strip()
)
if not article_record.post_id:
print("Posting to Mastodon")
mastodon_post_result = mastodon.status_post(
status=article_description
+ "\n"
+ kuow_base_url
+ article_link
+ "\n#KUOW #News{}".format(additional_tag_string),
visibility="public",
)
article_record.post_id = mastodon_post_result["id"]
else:
print("Article has already been posted")
except Exception:
print("Could not load a description/post this article")
article_record.article_link = article_link
article_record.last_updated_time = last_updated_time
if is_new_article:
session.add(article_record)
session.commit()