From 208ccec0b6ef75ec9c9f692a838e900ac0501a94 Mon Sep 17 00:00:00 2001
From: Liam Steckler
Date: Mon, 15 Jan 2024 08:47:46 -0800
Subject: [PATCH] Migrate to SQLAlchemy for storage

---
 kuow_fetcher.py  | 144 ++++++++++++++++++++++++++++++++++++++---------
 requirements.txt |   3 +
 2 files changed, 120 insertions(+), 27 deletions(-)

diff --git a/kuow_fetcher.py b/kuow_fetcher.py
index 5f8e0d0..fba159c 100644
--- a/kuow_fetcher.py
+++ b/kuow_fetcher.py
@@ -1,14 +1,40 @@
+import json
+from datetime import datetime
+from typing import Optional
+
 import requests
 import yaml
 from bs4 import BeautifulSoup
 from mastodon import Mastodon
+from sqlalchemy import create_engine, select
+from sqlalchemy.exc import NoResultFound
+from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
+
+
+class Base(DeclarativeBase):
+    pass
+
+
+class KuowStory(Base):
+    __tablename__ = "kuow_stories"
+    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
+    dfp_targeting_id: Mapped[str] = mapped_column()
+    article_link: Mapped[str] = mapped_column()
+    last_updated_time: Mapped[datetime] = mapped_column()
+    post_id: Mapped[Optional[str]] = mapped_column()
+
+    def __repr__(self) -> str:
+        return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
+
+
+engine = create_engine("sqlite:///kuow.db")
+Base.metadata.create_all(engine)
 
 kuow_base_url = "https://www.kuow.org"
 url = (
     kuow_base_url
     + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
 )
-log_file = "kuow_bot_logfile.txt"
 config = yaml.safe_load(open("config.yml"))
 mastodon = Mastodon(
     client_id=config["mastodon"]["client_id"],
@@ -23,36 +49,100 @@ articles = soup.find_all("span", class_="txt")
 # Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
 articles.reverse()
 
-for article in articles:
-    article_link = article.find("a").attrs["href"]
+with Session(engine) as session:
+    for article in articles:
+        article_link = article.find("a").attrs["href"]
+        print("Checking {}".format(article_link))
 
-    is_new_article = True
-    with open(log_file, "r") as fp:
-        lines = fp.readlines()
-        for row in lines:
-            if row == article_link + "\n":
-                print("Article " + article_link + " has already been seen")
-                is_new_article = False
+        lookup_statement = select(KuowStory).where(
+            KuowStory.article_link == article_link
+        )
+        lookup_result = session.scalars(lookup_statement)
+        is_new_article = False
 
-    if is_new_article:
-        print(article_link + " has not been seen, posting")
         article_lookup = requests.get(kuow_base_url + article_link)
         article_soup = BeautifulSoup(article_lookup.content, "html.parser")
+
         try:
-            article_description = (
-                (article_soup.find("meta", attrs={"property": "description"}))
-                .attrs["content"]
-                .strip()
+            schema_org_scriptblock = article_soup.find(
+                "script", {"type": "application/ld+json"}
             )
-            mastodon.status_post(
-                status=article_description
-                + "\n"
-                + kuow_base_url
-                + article_link
-                + "\n#KUOW #News",
-                visibility="unlisted",
-            )
-            with open(log_file, "a") as fp:
-                fp.write(article_link + "\n")
+            schema_org = json.loads(schema_org_scriptblock.text)
+            assert schema_org["@context"] == "http://schema.org"
+            last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
         except:
-            print("Could not load a description/post this article")
+            print(
+                "Could not find or load schema.org data for this post, looking up the meta published time"
+            )
+            try:
+                last_updated_time = datetime.fromisoformat(
+                    article_soup.find(
+                        "meta", attrs={"property": "article:published_time"}
+                    )["content"]
+                )
+            except:
+                print("Could not find or load the meta published time for this post")
+                last_updated_time = datetime.now()
+
+        try:
+            article_record = lookup_result.one()
+            # Only process existing articles if the last updated time doesn't match
+            process_article = (
+                article_record.last_updated_time.astimezone() != last_updated_time
+            )
+        except NoResultFound:
+            # Is a new article, or at least one that doesn't match based on the link
+            article_record = KuowStory()
+            process_article = True
+            is_new_article = True
+
+        if process_article:
+            print("Processing {}".format(article_link))
+
+            try:
+                article_record.pageview_story_id = article_soup.find(
+                    "script", {"class": "pageview_story"}
+                )["data-id"]
+                article_record.dfp_targeting_id = article_soup.find(
+                    "script", {"class": "dfp_targeting", "data-key": "id"}
+                )["data-value"]
+            except:
+                print("Could not find or load IDs for this post")
+
+            try:
+                tags = article_soup.find(
+                    "script", {"class": "dfp_targeting", "data-key": "tags"}
+                )["data-value"].split("|")
+            except:
+                print("Could not find or load any tags for this article")
+                tags = []
+
+            additional_tag_string = ""
+            for tag in tags:
+                # TODO: Do a check for tags in the config file that we don't want to tag posts with
+                additional_tag_string += " #{}".format(tag.title().replace(" ", ""))
+
+            try:
+                article_description = (
+                    (article_soup.find("meta", attrs={"property": "description"}))
+                    .attrs["content"]
+                    .strip()
+                )
+                mastodon_post_result = mastodon.status_post(
+                    status=article_description
+                    + "\n"
+                    + kuow_base_url
+                    + article_link
+                    + "\n#KUOW #News{}".format(additional_tag_string),
+                    visibility="public",
+                )
+                article_record.post_id = mastodon_post_result["id"]
+            except:
+                print("Could not load a description/post this article")
+
+            article_record.article_link = article_link
+            article_record.last_updated_time = last_updated_time
+
+            if is_new_article:
+                session.add(article_record)
+            session.commit()
diff --git a/requirements.txt b/requirements.txt
index e4b3eed..f4fe2e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ blurhash==1.1.4
 certifi==2023.11.17
 charset-normalizer==3.3.2
 decorator==5.1.1
+greenlet==3.0.3
 idna==3.6
 Mastodon.py==1.8.1
 python-dateutil==2.8.2
@@ -11,4 +12,6 @@ PyYAML==6.0.1
 requests==2.31.0
 six==1.16.0
 soupsieve==2.5
+SQLAlchemy==2.0.25
+typing_extensions==4.9.0
 urllib3==2.1.0
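
For reference, a minimal sketch of the look-up-or-create flow this patch introduces, exercised against an in-memory SQLite database instead of kuow.db. The model mirrors the KuowStory class above; the story link, IDs, and timestamp are made-up demo values, not real KUOW data.

from datetime import datetime, timezone
from typing import Optional

from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class KuowStory(Base):
    __tablename__ = "kuow_stories"
    pageview_story_id: Mapped[str] = mapped_column(primary_key=True)
    dfp_targeting_id: Mapped[str] = mapped_column()
    article_link: Mapped[str] = mapped_column()
    last_updated_time: Mapped[datetime] = mapped_column()
    post_id: Mapped[Optional[str]] = mapped_column()


# In-memory database for the demo; the patch points this at sqlite:///kuow.db.
engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

article_link = "/stories/example-story"  # made-up link for illustration
last_updated_time = datetime(2024, 1, 15, 8, 47, tzinfo=timezone.utc)

with Session(engine) as session:
    lookup = select(KuowStory).where(KuowStory.article_link == article_link)
    try:
        # Existing row: only reprocess when the stored update time has changed.
        record = session.scalars(lookup).one()
        process = record.last_updated_time.astimezone() != last_updated_time
        is_new = False
    except NoResultFound:
        # No row for this link yet: build a fresh record to fill in below.
        record = KuowStory(pageview_story_id="demo-id", dfp_targeting_id="demo-dfp")
        process, is_new = True, True

    if process:
        record.article_link = article_link
        record.last_updated_time = last_updated_time
        if is_new:
            session.add(record)
        session.commit()

    # Prints ['/stories/example-story'] on the first run.
    print(session.scalars(select(KuowStory.article_link)).all())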
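
A note on the timestamp handling in the patch: datetime.fromisoformat returns a timezone-aware datetime when the input string carries a UTC offset (which schema.org dateModified and article:published_time values generally do), while the datetime.now() fallback is naive; that mismatch is presumably why the comparison above calls .astimezone() before comparing. A small illustration, with a made-up timestamp:

from datetime import datetime

# ISO-8601 string with a UTC offset, e.g. what a dateModified field might hold.
aware = datetime.fromisoformat("2024-01-15T08:47:46-08:00")
print(aware.tzinfo)        # UTC-08:00 -> timezone-aware
print(aware.astimezone())  # same instant, expressed in the local timezone

naive = datetime.now()     # the patch's last-resort fallback value
print(naive.tzinfo)        # None -> naive, so comparisons need care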
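
To spot-check what ends up in kuow.db after a run, the table can be read directly with the standard-library sqlite3 module; importing kuow_fetcher for its model would re-run the whole fetch-and-post script, since everything in it is module-level. Table and column names below come from the KuowStory model in the patch:

import sqlite3

con = sqlite3.connect("kuow.db")
# Most recently updated stories first; post_id is NULL for stories that were
# recorded but never successfully posted.
for article_link, last_updated_time, post_id in con.execute(
    "SELECT article_link, last_updated_time, post_id FROM kuow_stories "
    "ORDER BY last_updated_time DESC"
):
    print(article_link, last_updated_time, post_id)
con.close()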