Compare commits

..

No commits in common. "main" and "improve-tagging" have entirely different histories.

4 changed files with 27 additions and 41 deletions

View file

@@ -2,9 +2,9 @@ when:
branch: main
steps:
- name: lint
image: python:3-alpine
image: python:3-slim
commands:
- python -m pip install --upgrade pip
- python -m pip install -r requirements.txt
- python -m pip install ruff
- ruff check .
- ruff check .

View file

@@ -1,12 +1,6 @@
# KUOW Mastodon Bot
[![status-badge](https://ci.gruezi.net/api/badges/1/status.svg)](https://ci.gruezi.net/repos/1)
This bot scrapes the KUOW website, looks for news stories that haven't already been seen before, then posts them to Mastodon.
This bot scrapes the KUOW website, looks for links to news stories that haven't already been seen/posted, then posts them to Mastodon.
PRs welcome!
## Features
- Tags based on article topics
- Matching for already-seen articles by both article link, and "Pageview ID"
- Re-posting articles when the published time changes
- Detecting and including metadata about the language of the posts
PRs welcome!

View file

@@ -8,7 +8,7 @@ import yaml
from bs4 import BeautifulSoup
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
from mastodon import Mastodon
from sqlalchemy import create_engine, select, or_
from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
@@ -75,6 +75,11 @@ with Session(engine) as session:
for article in articles:
article_link = article.find("a").attrs["href"]
print("Checking {}".format(article_link))
lookup_statement = select(KuowStory).where(
KuowStory.article_link == article_link
)
lookup_result = session.scalars(lookup_statement)
is_new_article = False
article_lookup = requests.get(kuow_base_url + article_link)
@@ -100,30 +105,15 @@ with Session(engine) as session:
except NameError:
print("Could not find or load the meta published time for this post")
last_updated_time = datetime.now()
try:
pageview_story_id = article_soup.find(
"script", {"class": "pageview_story"}
)["data-id"]
except (NameError, TypeError):
print(
"Could not find or load a Pageview story ID, skipping additional processing on this post"
)
continue
try:
lookup_statement = select(KuowStory).where(
or_(
KuowStory.article_link == article_link,
KuowStory.pageview_story_id == pageview_story_id,
)
)
lookup_result = session.scalars(lookup_statement)
article_record = lookup_result.one()
# Only process existing articles if the last updated time doesn't match
process_article = (
article_record.last_updated_time.astimezone() != last_updated_time
)
except NoResultFound:
# Is a new article
# Is a new article, or at least one that doesn't match based on the link
article_record = KuowStory()
process_article = True
is_new_article = True
@@ -132,6 +122,9 @@ with Session(engine) as session:
print("Processing {}".format(article_link))
try:
article_record.pageview_story_id = article_soup.find(
"script", {"class": "pageview_story"}
)["data-id"]
article_record.dfp_targeting_id = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "id"}
)["data-value"]
@@ -157,7 +150,7 @@ with Session(engine) as session:
except (NameError, TypeError):
print("Could not find or load any tags from the 'tags' property")
# Remove duplicate tags
# Remove duplicates
tags = list(set(tags))
additional_tag_string = ""
@@ -199,7 +192,6 @@ with Session(engine) as session:
except Exception:
print("Could not load a description/post this article")
article_record.pageview_story_id = pageview_story_id
article_record.article_link = article_link
article_record.last_updated_time = last_updated_time

View file

@@ -1,18 +1,18 @@
beautifulsoup4==4.12.3
blurhash==1.1.4
certifi==2024.8.30
charset-normalizer==3.4.0
certifi==2024.2.2
charset-normalizer==3.3.2
decorator==5.1.1
greenlet==3.1.1
idna==3.10
greenlet==3.0.3
idna==3.6
lingua-language-detector==2.0.2
Mastodon.py==1.8.1
python-dateutil==2.9.0.post0
python-dateutil==2.8.2
python-magic==0.4.27
PyYAML==6.0.2
requests==2.32.3
PyYAML==6.0.1
requests==2.31.0
six==1.16.0
soupsieve==2.6
SQLAlchemy==2.0.36
typing_extensions==4.12.2
urllib3==2.2.3
soupsieve==2.5
SQLAlchemy==2.0.27
typing_extensions==4.9.0
urllib3==2.2.0