Compare commits

..

No commits in common. "main" and "improve-tagging" have entirely different histories.

4 changed files with 28 additions and 42 deletions

View file

@@ -2,9 +2,9 @@ when:
branch: main
steps:
- name: lint
image: python:3-alpine
image: python:3-slim
commands:
- python -m pip install --upgrade pip
- python -m pip install -r requirements.txt
- python -m pip install ruff
- ruff check .
- ruff check .

View file

@@ -1,12 +1,6 @@
# KUOW Mastodon Bot
[![status-badge](https://ci.gruezi.net/api/badges/1/status.svg)](https://ci.gruezi.net/repos/1)
This bot scrapes the KUOW website, looks for news stories that haven't been seen before, then posts them to Mastodon.
This bot scrapes the KUOW website, looks for links to news stories that haven't already been seen/posted, then posts them to Mastodon.
PRs welcome!
## Features
- Tags based on article topics
- Matching of already-seen articles by both article link and "Pageview ID"
- Re-posting articles when the published time changes
- Detecting and including metadata about the language of the posts
PRs welcome!

View file

@@ -8,7 +8,7 @@ import yaml
from bs4 import BeautifulSoup
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
from mastodon import Mastodon
from sqlalchemy import create_engine, select, or_
from sqlalchemy import create_engine, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
@@ -75,6 +75,11 @@ with Session(engine) as session:
for article in articles:
article_link = article.find("a").attrs["href"]
print("Checking {}".format(article_link))
lookup_statement = select(KuowStory).where(
KuowStory.article_link == article_link
)
lookup_result = session.scalars(lookup_statement)
is_new_article = False
article_lookup = requests.get(kuow_base_url + article_link)
@@ -100,30 +105,15 @@ with Session(engine) as session:
except NameError:
print("Could not find or load the meta published time for this post")
last_updated_time = datetime.now()
try:
pageview_story_id = article_soup.find(
"script", {"class": "pageview_story"}
)["data-id"]
except (NameError, TypeError):
print(
"Could not find or load a Pageview story ID, skipping additional processing on this post"
)
continue
try:
lookup_statement = select(KuowStory).where(
or_(
KuowStory.article_link == article_link,
KuowStory.pageview_story_id == pageview_story_id,
)
)
lookup_result = session.scalars(lookup_statement)
article_record = lookup_result.one()
# Only process existing articles if the last updated time doesn't match
process_article = (
article_record.last_updated_time.astimezone() != last_updated_time
)
except NoResultFound:
# Is a new article
# Is a new article, or at least one that doesn't match based on the link
article_record = KuowStory()
process_article = True
is_new_article = True
@@ -132,6 +122,9 @@ with Session(engine) as session:
print("Processing {}".format(article_link))
try:
article_record.pageview_story_id = article_soup.find(
"script", {"class": "pageview_story"}
)["data-id"]
article_record.dfp_targeting_id = article_soup.find(
"script", {"class": "dfp_targeting", "data-key": "id"}
)["data-value"]
@@ -157,7 +150,7 @@ with Session(engine) as session:
except (NameError, TypeError):
print("Could not find or load any tags from the 'tags' property")
# Remove duplicate tags
# Remove duplicates
tags = list(set(tags))
additional_tag_string = ""
@@ -199,7 +192,6 @@ with Session(engine) as session:
except Exception:
print("Could not load a description/post this article")
article_record.pageview_story_id = pageview_story_id
article_record.article_link = article_link
article_record.last_updated_time = last_updated_time

View file

@@ -1,18 +1,18 @@
beautifulsoup4==4.12.3
blurhash==1.1.4
certifi==2024.12.14
charset-normalizer==3.4.1
certifi==2024.2.2
charset-normalizer==3.3.2
decorator==5.1.1
greenlet==3.1.1
idna==3.10
greenlet==3.0.3
idna==3.6
lingua-language-detector==2.0.2
Mastodon.py==1.8.1
python-dateutil==2.9.0.post0
python-dateutil==2.8.2
python-magic==0.4.27
PyYAML==6.0.2
requests==2.32.3
six==1.17.0
soupsieve==2.6
SQLAlchemy==2.0.37
typing_extensions==4.12.2
urllib3==2.3.0
PyYAML==6.0.1
requests==2.31.0
six==1.16.0
soupsieve==2.5
SQLAlchemy==2.0.27
typing_extensions==4.9.0
urllib3==2.2.0