Compare commits
No commits in common. "main" and "improve-tagging" have entirely different histories.
main
...
improve-tagging
4 changed files with 27 additions and 41 deletions
|
@@ -2,9 +2,9 @@ when:
|
|||
branch: main
|
||||
steps:
|
||||
- name: lint
|
||||
image: python:3-alpine
|
||||
image: python:3-slim
|
||||
commands:
|
||||
- python -m pip install --upgrade pip
|
||||
- python -m pip install -r requirements.txt
|
||||
- python -m pip install ruff
|
||||
- ruff check .
|
||||
- ruff check .
|
10
README.md
10
README.md
|
@@ -1,12 +1,6 @@
|
|||
# KUOW Mastodon Bot
|
||||
[![status-badge](https://ci.gruezi.net/api/badges/1/status.svg)](https://ci.gruezi.net/repos/1)
|
||||
|
||||
This bot scrapes the KUOW website, looks for news stories that haven't already been seen before, then posts them to Mastodon.
|
||||
This bot scrapes the KUOW website, looks for links to news stories that haven't already been seen/posted, then posts them to Mastodon.
|
||||
|
||||
PRs welcome!
|
||||
|
||||
## Features
|
||||
- Tags based on article topics
|
||||
- Matching for already-seen articles by both article link, and "Pageview ID"
|
||||
- Re-posting articles when the published time changes
|
||||
- Detecting and including metadata about the language of the posts
|
||||
PRs welcome!
|
|
@@ -8,7 +8,7 @@ import yaml
|
|||
from bs4 import BeautifulSoup
|
||||
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
|
||||
from mastodon import Mastodon
|
||||
from sqlalchemy import create_engine, select, or_
|
||||
from sqlalchemy import create_engine, select
|
||||
from sqlalchemy.exc import NoResultFound
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
|
||||
|
||||
|
@@ -75,6 +75,11 @@ with Session(engine) as session:
|
|||
for article in articles:
|
||||
article_link = article.find("a").attrs["href"]
|
||||
print("Checking {}".format(article_link))
|
||||
|
||||
lookup_statement = select(KuowStory).where(
|
||||
KuowStory.article_link == article_link
|
||||
)
|
||||
lookup_result = session.scalars(lookup_statement)
|
||||
is_new_article = False
|
||||
|
||||
article_lookup = requests.get(kuow_base_url + article_link)
|
||||
|
@@ -100,30 +105,15 @@ with Session(engine) as session:
|
|||
except NameError:
|
||||
print("Could not find or load the meta published time for this post")
|
||||
last_updated_time = datetime.now()
|
||||
|
||||
try:
|
||||
pageview_story_id = article_soup.find(
|
||||
"script", {"class": "pageview_story"}
|
||||
)["data-id"]
|
||||
except (NameError, TypeError):
|
||||
print(
|
||||
"Could not find or load a Pageview story ID, skipping additional processing on this post"
|
||||
)
|
||||
continue
|
||||
try:
|
||||
lookup_statement = select(KuowStory).where(
|
||||
or_(
|
||||
KuowStory.article_link == article_link,
|
||||
KuowStory.pageview_story_id == pageview_story_id,
|
||||
)
|
||||
)
|
||||
lookup_result = session.scalars(lookup_statement)
|
||||
article_record = lookup_result.one()
|
||||
# Only process existing articles if the last updated time doesn't match
|
||||
process_article = (
|
||||
article_record.last_updated_time.astimezone() != last_updated_time
|
||||
)
|
||||
except NoResultFound:
|
||||
# Is a new article
|
||||
# Is a new article, or at least one that doesn't match based on the link
|
||||
article_record = KuowStory()
|
||||
process_article = True
|
||||
is_new_article = True
|
||||
|
@@ -132,6 +122,9 @@ with Session(engine) as session:
|
|||
print("Processing {}".format(article_link))
|
||||
|
||||
try:
|
||||
article_record.pageview_story_id = article_soup.find(
|
||||
"script", {"class": "pageview_story"}
|
||||
)["data-id"]
|
||||
article_record.dfp_targeting_id = article_soup.find(
|
||||
"script", {"class": "dfp_targeting", "data-key": "id"}
|
||||
)["data-value"]
|
||||
|
@@ -157,7 +150,7 @@ with Session(engine) as session:
|
|||
except (NameError, TypeError):
|
||||
print("Could not find or load any tags from the 'tags' property")
|
||||
|
||||
# Remove duplicate tags
|
||||
# Remove duplicates
|
||||
tags = list(set(tags))
|
||||
|
||||
additional_tag_string = ""
|
||||
|
@@ -199,7 +192,6 @@ with Session(engine) as session:
|
|||
except Exception:
|
||||
print("Could not load a description/post this article")
|
||||
|
||||
article_record.pageview_story_id = pageview_story_id
|
||||
article_record.article_link = article_link
|
||||
article_record.last_updated_time = last_updated_time
|
||||
|
||||
|
|
|
@@ -1,18 +1,18 @@
|
|||
beautifulsoup4==4.12.3
|
||||
blurhash==1.1.4
|
||||
certifi==2024.8.30
|
||||
charset-normalizer==3.4.0
|
||||
certifi==2024.2.2
|
||||
charset-normalizer==3.3.2
|
||||
decorator==5.1.1
|
||||
greenlet==3.1.1
|
||||
idna==3.10
|
||||
greenlet==3.0.3
|
||||
idna==3.6
|
||||
lingua-language-detector==2.0.2
|
||||
Mastodon.py==1.8.1
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dateutil==2.8.2
|
||||
python-magic==0.4.27
|
||||
PyYAML==6.0.2
|
||||
requests==2.32.3
|
||||
PyYAML==6.0.1
|
||||
requests==2.31.0
|
||||
six==1.16.0
|
||||
soupsieve==2.6
|
||||
SQLAlchemy==2.0.36
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.3
|
||||
soupsieve==2.5
|
||||
SQLAlchemy==2.0.27
|
||||
typing_extensions==4.9.0
|
||||
urllib3==2.2.0
|
||||
|
|
Loading…
Reference in a new issue