Compare commits

..

3 commits

2 changed files with 121 additions and 27 deletions


@@ -1,14 +1,40 @@
+import json
+from datetime import datetime
+from typing import Optional
+
 import requests
 import yaml
 from bs4 import BeautifulSoup
 from mastodon import Mastodon
+from sqlalchemy import create_engine, select
+from sqlalchemy.exc import NoResultFound
+from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
 
+
+class Base(DeclarativeBase):
+    pass
+
+
+class KuowStory(Base):
+    __tablename__ = "kuow_stories"
+    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
+    dfp_targeting_id: Mapped[str] = mapped_column()
+    article_link: Mapped[str] = mapped_column()
+    last_updated_time: Mapped[datetime] = mapped_column()
+    post_id: Mapped[Optional[str]] = mapped_column()
+
+    def __repr__(self) -> str:
+        return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
+
+
+engine = create_engine("sqlite:///kuow.db")
+Base.metadata.create_all(engine)
+
 kuow_base_url = "https://www.kuow.org"
 url = (
     kuow_base_url
     + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
 )
-log_file = "kuow_bot_logfile.txt"
 config = yaml.safe_load(open("config.yml"))
 mastodon = Mastodon(
     client_id=config["mastodon"]["client_id"],
@@ -23,36 +49,101 @@ articles = soup.find_all("span", class_="txt")
 # Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
 articles.reverse()
-for article in articles:
-    article_link = article.find("a").attrs["href"]
-    print("Checking {}".format(article_link))
-    is_new_article = True
-    with open(log_file, "r") as fp:
-        lines = fp.readlines()
-    for row in lines:
-        if row == article_link + "\n":
-            print("Article " + article_link + " has already been seen")
-            is_new_article = False
-    if is_new_article:
-        print(article_link + " has not been seen, posting")
-        article_lookup = requests.get(kuow_base_url + article_link)
-        article_soup = BeautifulSoup(article_lookup.content, "html.parser")
+with Session(engine) as session:
+    for article in articles:
+        article_link = article.find("a").attrs["href"]
+
+        lookup_statement = select(KuowStory).where(
+            KuowStory.article_link == article_link
+        )
+        lookup_result = session.scalars(lookup_statement)
+
+        is_new_article = False
+
+        article_lookup = requests.get(kuow_base_url + article_link)
+        article_soup = BeautifulSoup(article_lookup.content, "html.parser")
+
+        try:
+            schema_org_scriptblock = article_soup.find(
+                "script", {"type": "application/ld+json"}
+            )
+            schema_org = json.loads(schema_org_scriptblock.text)
+            assert schema_org["@context"] == "http://schema.org"
+            last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
+        except:
+            print(
+                "Could not find or load schema.org data for this post, looking up the meta published time"
+            )
+            try:
+                last_updated_time = datetime.fromisoformat(
+                    article_soup.find(
+                        "meta", attrs={"property": "article:published_time"}
+                    )["content"]
+                )
+            except:
+                print("Could not find or load the meta published time for this post")
+                last_updated_time = datetime.now()
+
+        try:
+            article_record = lookup_result.one()
+            # Only process existing articles if the last updated time doesn't match
+            process_article = (
+                article_record.last_updated_time.astimezone() != last_updated_time
+            )
+        except NoResultFound:
+            # Is a new article, or at least one that doesn't match based on the link
+            article_record = KuowStory()
+            process_article = True
+            is_new_article = True
+
+        if process_article:
+            print("Processing {}".format(article_link))
+
+            try:
+                article_record.pageview_story_id = article_soup.find(
+                    "script", {"class": "pageview_story"}
+                )["data-id"]
+                article_record.dfp_targeting_id = article_soup.find(
+                    "script", {"class": "dfp_targeting", "data-key": "id"}
+                )["data-value"]
+            except:
+                print("Could not find or load IDs for this post")
+
+            try:
+                tags = article_soup.find(
+                    "script", {"class": "dfp_targeting", "data-key": "tags"}
+                )["data-value"].split("|")
+            except:
+                print("Could not find or load any tags for this article")
+                tags = []
+
+            additional_tag_string = ""
+            for tag in tags:
+                # TODO: Do a check for tags in the config file that we don't want to tag posts with
+                additional_tag_string += " #{}".format(tag.title().replace(" ", ""))
+
             try:
                 article_description = (
                     (article_soup.find("meta", attrs={"property": "description"}))
                     .attrs["content"]
                     .strip()
                 )
-                mastodon.status_post(
-                    status=article_description
-                    + "\n"
-                    + kuow_base_url
-                    + article_link
-                    + "\n#KUOW #News",
-                    visibility="public",
-                )
-                with open(log_file, "a") as fp:
-                    fp.write(article_link + "\n")
+                if not article_record.post_id:
+                    mastodon_post_result = mastodon.status_post(
+                        status=article_description
+                        + "\n"
+                        + kuow_base_url
+                        + article_link
+                        + "\n#KUOW #News{}".format(additional_tag_string),
+                        visibility="public",
+                    )
+                    article_record.post_id = mastodon_post_result["id"]
             except:
                 print("Could not load a description/post this article")
+
+            article_record.article_link = article_link
+            article_record.last_updated_time = last_updated_time
+
+            if is_new_article:
+                session.add(article_record)
+
+            session.commit()
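
Note that this change replaces the kuow_bot_logfile.txt seen-links log with the kuow_stories SQLite table, so links recorded by the old version would otherwise look new and be re-posted. A minimal one-time backfill sketch, assuming the old logfile holds one article link per line and that the model, engine, and kuow_base_url defined above live in an importable module (the kuow_bot module name here is hypothetical):

from datetime import datetime

import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session

from kuow_bot import Base, KuowStory, engine, kuow_base_url  # hypothetical module name

Base.metadata.create_all(engine)

with Session(engine) as session, open("kuow_bot_logfile.txt") as fp:
    for line in fp:
        article_link = line.strip()
        if not article_link:
            continue
        soup = BeautifulSoup(
            requests.get(kuow_base_url + article_link).content, "html.parser"
        )
        try:
            # Same selector the bot uses to fill its primary key
            story_id = soup.find("script", {"class": "pageview_story"})["data-id"]
        except (TypeError, KeyError):
            continue  # page no longer exposes the ID; skip this link
        session.merge(  # upsert by primary key, so the script is safe to re-run
            KuowStory(
                pageview_story_id=story_id,
                dfp_targeting_id="",
                article_link=article_link,
                last_updated_time=datetime.now(),  # placeholder; real value is refreshed on the next run
                post_id="backfilled",  # any non-empty value makes the bot skip status_post()
            )
        )
    session.commit()

On the next run the bot will still reprocess these rows (the placeholder timestamp won't match the article's dateModified), but the non-empty post_id keeps it from posting them again.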


@@ -3,6 +3,7 @@ blurhash==1.1.4
 certifi==2023.11.17
 charset-normalizer==3.3.2
 decorator==5.1.1
+greenlet==3.0.3
 idna==3.6
 Mastodon.py==1.8.1
 python-dateutil==2.8.2
@@ -11,4 +12,6 @@ PyYAML==6.0.1
 requests==2.31.0
 six==1.16.0
 soupsieve==2.5
+SQLAlchemy==2.0.25
+typing_extensions==4.9.0
 urllib3==2.1.0
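
For a quick look at what the bot has recorded after a run, the __repr__ defined on the model makes rows easy to print; a short sketch under the same hypothetical import assumption as above:

from sqlalchemy import select
from sqlalchemy.orm import Session

from kuow_bot import KuowStory, engine  # hypothetical module name

with Session(engine) as session:
    # Most recently updated stories first
    statement = select(KuowStory).order_by(KuowStory.last_updated_time.desc())
    for story in session.scalars(statement):
        print(story)  # formatted by the model's __repr__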