diff --git a/.woodpecker/lint.yml b/.woodpecker/lint.yml index 31f6ca1..1eead62 100644 --- a/.woodpecker/lint.yml +++ b/.woodpecker/lint.yml @@ -2,9 +2,9 @@ when: branch: main steps: - name: lint - image: python:3-alpine + image: python:3-slim commands: - python -m pip install --upgrade pip - python -m pip install -r requirements.txt - python -m pip install ruff - - ruff check . + - ruff check . \ No newline at end of file diff --git a/README.md b/README.md index 1798342..811b797 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,6 @@ # KUOW Mastodon Bot [![status-badge](https://ci.gruezi.net/api/badges/1/status.svg)](https://ci.gruezi.net/repos/1) -This bot scrapes the KUOW website, looks for news stories that haven't already been seen before, then posts them to Mastodon. +This bot scrapes the KUOW website, looks for links to news stories that haven't already been seen/posted, then posts them to Mastodon. -PRs welcome! - -## Features -- Tags based on article topics -- Matching for already-seen articles by both article link, and "Pageview ID" -- Re-posting articles when the published time changes -- Detecting and including metadata about the language of the posts +PRs welcome! \ No newline at end of file diff --git a/kuow_fetcher.py b/kuow_fetcher.py index df8b75f..9413d0b 100644 --- a/kuow_fetcher.py +++ b/kuow_fetcher.py @@ -8,7 +8,7 @@ import yaml from bs4 import BeautifulSoup from lingua import IsoCode639_1, Language, LanguageDetectorBuilder from mastodon import Mastodon -from sqlalchemy import create_engine, select, or_ +from sqlalchemy import create_engine, select from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column @@ -75,6 +75,11 @@ with Session(engine) as session: for article in articles: article_link = article.find("a").attrs["href"] print("Checking {}".format(article_link)) + + lookup_statement = select(KuowStory).where( + KuowStory.article_link == article_link + ) + lookup_result = session.scalars(lookup_statement) is_new_article = False article_lookup = requests.get(kuow_base_url + article_link) @@ -100,30 +105,15 @@ with Session(engine) as session: except NameError: print("Could not find or load the meta published time for this post") last_updated_time = datetime.now() + try: - pageview_story_id = article_soup.find( - "script", {"class": "pageview_story"} - )["data-id"] - except (NameError, TypeError): - print( - "Could not find or load a Pageview story ID, skipping additional processing on this post" - ) - continue - try: - lookup_statement = select(KuowStory).where( - or_( - KuowStory.article_link == article_link, - KuowStory.pageview_story_id == pageview_story_id, - ) - ) - lookup_result = session.scalars(lookup_statement) article_record = lookup_result.one() # Only process existing articles if the last updated time doesn't match process_article = ( article_record.last_updated_time.astimezone() != last_updated_time ) except NoResultFound: - # Is a new article + # Is a new article, or at least one that doesn't match based on the link article_record = KuowStory() process_article = True is_new_article = True @@ -132,6 +122,9 @@ with Session(engine) as session: print("Processing {}".format(article_link)) try: + article_record.pageview_story_id = article_soup.find( + "script", {"class": "pageview_story"} + )["data-id"] article_record.dfp_targeting_id = article_soup.find( "script", {"class": "dfp_targeting", "data-key": "id"} )["data-value"] @@ -157,7 +150,7 @@ with Session(engine) as session: except (NameError, TypeError): print("Could not find or load any tags from the 'tags' property") - # Remove duplicate tags + # Remove duplicates tags = list(set(tags)) additional_tag_string = "" @@ -199,7 +192,6 @@ with Session(engine) as session: except Exception: print("Could not load a description/post this article") - article_record.pageview_story_id = pageview_story_id article_record.article_link = article_link article_record.last_updated_time = last_updated_time diff --git a/requirements.txt b/requirements.txt index 0cdb935..1dff066 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ beautifulsoup4==4.12.3 blurhash==1.1.4 -certifi==2024.12.14 -charset-normalizer==3.4.1 +certifi==2024.2.2 +charset-normalizer==3.3.2 decorator==5.1.1 -greenlet==3.1.1 -idna==3.10 +greenlet==3.0.3 +idna==3.6 lingua-language-detector==2.0.2 Mastodon.py==1.8.1 -python-dateutil==2.9.0.post0 +python-dateutil==2.8.2 python-magic==0.4.27 -PyYAML==6.0.2 -requests==2.32.3 -six==1.17.0 -soupsieve==2.6 -SQLAlchemy==2.0.37 -typing_extensions==4.12.2 -urllib3==2.3.0 +PyYAML==6.0.1 +requests==2.31.0 +six==1.16.0 +soupsieve==2.5 +SQLAlchemy==2.0.27 +typing_extensions==4.9.0 +urllib3==2.2.0