Compare commits


No commits in common. "943af2f10b9360549aab3d98005b68712e2a3b56" and "03308124d982a8e5794dce7eb946393272d1c351" have entirely different histories.

2 changed files with 27 additions and 121 deletions

View file

@@ -1,40 +1,14 @@
-import json
-from datetime import datetime
-from typing import Optional
 import requests
 import yaml
 from bs4 import BeautifulSoup
 from mastodon import Mastodon
-from sqlalchemy import create_engine, select
-from sqlalchemy.exc import NoResultFound
-from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
-
-
-class Base(DeclarativeBase):
-    pass
-
-
-class KuowStory(Base):
-    __tablename__ = "kuow_stories"
-
-    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
-    dfp_targeting_id: Mapped[str] = mapped_column()
-    article_link: Mapped[str] = mapped_column()
-    last_updated_time: Mapped[datetime] = mapped_column()
-    post_id: Mapped[Optional[str]] = mapped_column()
-
-    def __repr__(self) -> str:
-        return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"
-
-
-engine = create_engine("sqlite:///kuow.db")
-Base.metadata.create_all(engine)
 
 kuow_base_url = "https://www.kuow.org"
 url = (
     kuow_base_url
     + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
 )
+log_file = "kuow_bot_logfile.txt"
 config = yaml.safe_load(open("config.yml"))
 mastodon = Mastodon(
     client_id=config["mastodon"]["client_id"],
@@ -49,101 +23,36 @@ articles = soup.find_all("span", class_="txt")
 # Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
 articles.reverse()
 
-with Session(engine) as session:
-    for article in articles:
-        article_link = article.find("a").attrs["href"]
-        print("Checking {}".format(article_link))
-
-        lookup_statement = select(KuowStory).where(
-            KuowStory.article_link == article_link
-        )
-        lookup_result = session.scalars(lookup_statement)
-        is_new_article = False
-
+for article in articles:
+    article_link = article.find("a").attrs["href"]
+
+    is_new_article = True
+    with open(log_file, "r") as fp:
+        lines = fp.readlines()
+        for row in lines:
+            if row == article_link + "\n":
+                print("Article " + article_link + " has already been seen")
+                is_new_article = False
+
+    if is_new_article:
+        print(article_link + " has not been seen, posting")
         article_lookup = requests.get(kuow_base_url + article_link)
         article_soup = BeautifulSoup(article_lookup.content, "html.parser")
         try:
-            schema_org_scriptblock = article_soup.find(
-                "script", {"type": "application/ld+json"}
+            article_description = (
+                (article_soup.find("meta", attrs={"property": "description"}))
+                .attrs["content"]
+                .strip()
             )
-            schema_org = json.loads(schema_org_scriptblock.text)
-            assert schema_org["@context"] == "http://schema.org"
-            last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
+            mastodon.status_post(
+                status=article_description
+                + "\n"
+                + kuow_base_url
+                + article_link
+                + "\n#KUOW #News",
+                visibility="public",
+            )
+            with open(log_file, "a") as fp:
+                fp.write(article_link + "\n")
         except:
-            print(
-                "Could not find or load schema.org data for this post, looking up the meta published time"
-            )
-            try:
-                last_updated_time = datetime.fromisoformat(
-                    article_soup.find(
-                        "meta", attrs={"property": "article:published_time"}
-                    )["content"]
-                )
-            except:
-                print("Could not find or load the meta published time for this post")
-                last_updated_time = datetime.now()
-
-        try:
-            article_record = lookup_result.one()
-            # Only process existing articles if the last updated time doesn't match
-            process_article = (
-                article_record.last_updated_time.astimezone() != last_updated_time
-            )
-        except NoResultFound:
-            # Is a new article, or at least one that doesn't match based on the link
-            article_record = KuowStory()
-            process_article = True
-            is_new_article = True
-
-        if process_article:
-            print("Processing {}".format(article_link))
-            try:
-                article_record.pageview_story_id = article_soup.find(
-                    "script", {"class": "pageview_story"}
-                )["data-id"]
-                article_record.dfp_targeting_id = article_soup.find(
-                    "script", {"class": "dfp_targeting", "data-key": "id"}
-                )["data-value"]
-            except:
-                print("Could not find or load IDs for this post")
-
-            try:
-                tags = article_soup.find(
-                    "script", {"class": "dfp_targeting", "data-key": "tags"}
-                )["data-value"].split("|")
-            except:
-                print("Could not find or load any tags for this article")
-                tags = []
-            additional_tag_string = ""
-            for tag in tags:
-                # TODO: Do a check for tags in the config file that we don't want to tag posts with
-                additional_tag_string += " #{}".format(tag.title().replace(" ", ""))
-
-            try:
-                article_description = (
-                    (article_soup.find("meta", attrs={"property": "description"}))
-                    .attrs["content"]
-                    .strip()
-                )
-                if not article_record.post_id:
-                    mastodon_post_result = mastodon.status_post(
-                        status=article_description
-                        + "\n"
-                        + kuow_base_url
-                        + article_link
-                        + "\n#KUOW #News{}".format(additional_tag_string),
-                        visibility="public",
-                    )
-                    article_record.post_id = mastodon_post_result["id"]
-            except:
-                print("Could not load a description/post this article")
-
-            article_record.article_link = article_link
-            article_record.last_updated_time = last_updated_time
-            if is_new_article:
-                session.add(article_record)
-            session.commit()
+            print("Could not load a description/post this article")
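
Note that the head commit's duplicate check re-reads the whole log file once per article on the page. As a minimal sketch only (not part of this diff), the same idea with the log loaded into a set up front, assuming the one-link-per-line format shown above and that the file may not exist on a first run:

import os

log_file = "kuow_bot_logfile.txt"

# Load every previously posted link once; membership tests are then O(1)
# instead of a full file scan per article.
if os.path.exists(log_file):
    with open(log_file) as fp:
        seen_links = {line.rstrip("\n") for line in fp}
else:
    # First run: no log yet, so nothing has been posted.
    seen_links = set()

def mark_posted(article_link):
    # Record a link both in memory and in the append-only log,
    # keeping the on-disk format the bot already writes.
    seen_links.add(article_link)
    with open(log_file, "a") as fp:
        fp.write(article_link + "\n")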

View file

@@ -3,7 +3,6 @@ blurhash==1.1.4
 certifi==2023.11.17
 charset-normalizer==3.3.2
 decorator==5.1.1
-greenlet==3.0.3
 idna==3.6
 Mastodon.py==1.8.1
 python-dateutil==2.8.2
@@ -12,6 +11,4 @@ PyYAML==6.0.1
 requests==2.31.0
 six==1.16.0
 soupsieve==2.5
-SQLAlchemy==2.0.25
-typing_extensions==4.9.0
 urllib3==2.1.0