Migrate to SQLAlchemy for storage

This commit is contained in:
Liam Steckler 2024-01-15 08:47:46 -08:00
parent 860a083405
commit 208ccec0b6
2 changed files with 120 additions and 27 deletions

View file

@@ -1,14 +1,40 @@
import json
from datetime import datetime
from typing import Optional
import requests
import yaml
from bs4 import BeautifulSoup
from mastodon import Mastodon
from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
class Base(DeclarativeBase):
    """SQLAlchemy 2.0-style declarative base shared by the bot's ORM models."""

    pass
class KuowStory(Base):
    """One KUOW news story tracked by the bot.

    A row is created when a story is first seen; ``post_id`` records the
    Mastodon status id once the story has been posted.
    """

    __tablename__ = "kuow_stories"

    # KUOW-side identifiers scraped from the article page's script blocks.
    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
    dfp_targeting_id: Mapped[str] = mapped_column()
    # Relative article link (joined with the site base URL by callers).
    article_link: Mapped[str] = mapped_column()
    # Timestamp used to detect when an already-seen article was updated.
    last_updated_time: Mapped[datetime] = mapped_column()
    # Mastodon status id; absent until the story has been posted.
    post_id: Mapped[Optional[str]] = mapped_column()

    def __repr__(self) -> str:
        """Debug representation listing every mapped column."""
        column_names = (
            "pageview_story_id",
            "dfp_targeting_id",
            "article_link",
            "last_updated_time",
            "post_id",
        )
        rendered = ", ".join(
            "{}={!r}".format(name, getattr(self, name)) for name in column_names
        )
        return "KuowStory({})".format(rendered)
# SQLite database file in the working directory; ensure tables exist.
engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine)

kuow_base_url = "https://www.kuow.org"
# Fragment endpoint returning the homepage story grid (12 stories per page).
url = (
    kuow_base_url
    + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
)
# Text log of seen article links — presumably superseded by the DB after this
# migration; TODO confirm it is still needed.
log_file = "kuow_bot_logfile.txt"
# Fix: the original `yaml.safe_load(open("config.yml"))` never closed the file
# handle; use a context manager so it is released deterministically.
with open("config.yml") as _config_file:
    config = yaml.safe_load(_config_file)
mastodon = Mastodon(
client_id=config["mastodon"]["client_id"],
@@ -23,36 +49,100 @@ articles = soup.find_all("span", class_="txt")
# NOTE(review): this span is a rendered diff hunk — indentation has been
# stripped and removed/added lines appear interleaved, so it is not runnable
# as shown. The annotations below describe the apparent post-migration flow;
# confirm every claim against the actual repository file.
# Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
articles.reverse()
# NOTE(review): the next two lines look like the removed (pre-change) loop
# header; the new loop lives inside the Session context manager below.
for article in articles:
article_link = article.find("a").attrs["href"]
# New flow: one ORM session wraps all per-article work.
with Session(engine) as session:
for article in articles:
article_link = article.find("a").attrs["href"]
# NOTE(review): the print/log-file bookkeeping below appears to be the
# removed side of the diff, replaced by the DB lookup that follows it.
print("Checking {}".format(article_link))
is_new_article = True
with open(log_file, "r") as fp:
lines = fp.readlines()
for row in lines:
if row == article_link + "\n":
print("Article " + article_link + " has already been seen")
is_new_article = False
# Look the article up by its (relative) link in the kuow_stories table.
lookup_statement = select(KuowStory).where(
KuowStory.article_link == article_link
)
lookup_result = session.scalars(lookup_statement)
is_new_article = False
# NOTE(review): this if-branch and the unlisted status_post/log append
# inside it look like removed-side lines; the new posting path is below.
if is_new_article:
print(article_link + " has not been seen, posting")
# Fetch and parse the full article page.
article_lookup = requests.get(kuow_base_url + article_link)
article_soup = BeautifulSoup(article_lookup.content, "html.parser")
# Prefer the schema.org JSON-LD block for the last-updated timestamp.
# NOTE(review): the bare except below swallows every error (including
# typos in this block) — narrow it to the exceptions actually expected.
# NOTE(review): assert is stripped under -O; raise for validation instead.
try:
article_description = (
(article_soup.find("meta", attrs={"property": "description"}))
.attrs["content"]
.strip()
schema_org_scriptblock = article_soup.find(
"script", {"type": "application/ld+json"}
)
mastodon.status_post(
status=article_description
+ "\n"
+ kuow_base_url
+ article_link
+ "\n#KUOW #News",
visibility="unlisted",
)
with open(log_file, "a") as fp:
fp.write(article_link + "\n")
schema_org = json.loads(schema_org_scriptblock.text)
assert schema_org["@context"] == "http://schema.org"
last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
except:
print("Could not load a description/post this article")
print(
"Could not find or load schema.org data for this post, looking up the meta published time"
)
# Fallback 1: the article:published_time meta tag.
try:
last_updated_time = datetime.fromisoformat(
article_soup.find(
"meta", attrs={"property": "article:published_time"}
)["content"]
)
except:
print("Could not find or load the meta published time for this post")
# Fallback 2: "now". NOTE(review): datetime.now() is naive, while
# fromisoformat() can return an aware datetime — mixing the two can
# make the comparison below raise; confirm what gets stored.
last_updated_time = datetime.now()
try:
article_record = lookup_result.one()
# Only process existing articles if the last updated time doesn't match
process_article = (
article_record.last_updated_time.astimezone() != last_updated_time
)
except NoResultFound:
# Is a new article, or at least one that doesn't match based on the link
article_record = KuowStory()
process_article = True
is_new_article = True
if process_article:
print("Processing {}".format(article_link))
# Scrape KUOW's internal IDs into the record; best-effort.
try:
article_record.pageview_story_id = article_soup.find(
"script", {"class": "pageview_story"}
)["data-id"]
article_record.dfp_targeting_id = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "id"}
)["data-value"]
except:
print("Could not find or load IDs for this post")
# Pipe-separated tag list from the dfp_targeting script block.
try:
tags = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "tags"}
)["data-value"].split("|")
except:
print("Could not find or load any tags for this article")
tags = []
# Build " #CamelCase" hashtags from the scraped tags.
additional_tag_string = ""
for tag in tags:
# TODO: Do a check for tags in the config file that we don't want to tag posts with
additional_tag_string += " #{}".format(tag.title().replace(" ", ""))
# Post description + link + hashtags publicly; remember the status id.
try:
article_description = (
(article_soup.find("meta", attrs={"property": "description"}))
.attrs["content"]
.strip()
)
mastodon_post_result = mastodon.status_post(
status=article_description
+ "\n"
+ kuow_base_url
+ article_link
+ "\n#KUOW #News{}".format(additional_tag_string),
visibility="public",
)
article_record.post_id = mastodon_post_result["id"]
except:
print("Could not load a description/post this article")
# Upsert: update fields on the looked-up record, add it only if new.
article_record.article_link = article_link
article_record.last_updated_time = last_updated_time
if is_new_article:
session.add(article_record)
# NOTE(review): commit appears to run once per article inside the loop —
# exact nesting is lost in this rendering; confirm against the repo.
session.commit()

View file

@@ -3,6 +3,7 @@ blurhash==1.1.4
certifi==2023.11.17
charset-normalizer==3.3.2
decorator==5.1.1
greenlet==3.0.3
idna==3.6
Mastodon.py==1.8.1
python-dateutil==2.8.2
@@ -11,4 +12,6 @@ PyYAML==6.0.1
requests==2.31.0
six==1.16.0
soupsieve==2.5
SQLAlchemy==2.0.25
typing_extensions==4.9.0
urllib3==2.1.0