Merge branch 'improved_logging'
This commit is contained in:
commit
3c7dc2e299
2 changed files with 120 additions and 27 deletions
144
kuow_fetcher.py
144
kuow_fetcher.py
|
@ -1,14 +1,40 @@
|
|||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
from bs4 import BeautifulSoup
|
||||
from mastodon import Mastodon
|
||||
from sqlalchemy import create_engine, select
|
||||
from sqlalchemy.exc import NoResultFound
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this file inherit from."""
|
||||
|
||||
|
||||
class KuowStory(Base):
    """ORM model for one row of the ``kuow_stories`` table.

    Each row describes a single article: its two site identifiers, its
    link, the last-updated timestamp recorded for it, and (optionally) the
    id of the post that was published for it.
    """

    __tablename__ = "kuow_stories"

    # Primary key of the table.
    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
    dfp_targeting_id: Mapped[str] = mapped_column()
    article_link: Mapped[str] = mapped_column()
    last_updated_time: Mapped[datetime] = mapped_column()
    # Optional: stays empty when no post id was stored for the article.
    post_id: Mapped[Optional[str]] = mapped_column()

    def __repr__(self) -> str:
        """Return a debug string listing every column as ``name=value``."""
        shown = (
            "pageview_story_id",
            "dfp_targeting_id",
            "article_link",
            "last_updated_time",
            "post_id",
        )
        rendered = ", ".join(f"{name}={getattr(self, name)!r}" for name in shown)
        return f"KuowStory({rendered})"
|
||||
|
||||
|
||||
# Database engine backed by the local SQLite file kuow.db, followed by
# creation of the tables declared on Base.
engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine)


kuow_base_url = "https://www.kuow.org"
# Story-grid fragment of the homepage; the query string asks for page 1
# with per_page=12.
url = (
    kuow_base_url
    + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
)
log_file = "kuow_bot_logfile.txt"
# Parse the config inside a context manager so the file handle is closed
# as soon as the YAML has been read instead of being left open.
with open("config.yml") as config_file:
    config = yaml.safe_load(config_file)
|
||||
mastodon = Mastodon(
|
||||
client_id=config["mastodon"]["client_id"],
|
||||
|
@ -23,36 +49,100 @@ articles = soup.find_all("span", class_="txt")
|
|||
# Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
|
||||
articles.reverse()
|
||||
|
||||
for article in articles:
|
||||
article_link = article.find("a").attrs["href"]
|
||||
with Session(engine) as session:
|
||||
for article in articles:
|
||||
article_link = article.find("a").attrs["href"]
|
||||
print("Checking {}".format(article_link))
|
||||
|
||||
is_new_article = True
|
||||
with open(log_file, "r") as fp:
|
||||
lines = fp.readlines()
|
||||
for row in lines:
|
||||
if row == article_link + "\n":
|
||||
print("Article " + article_link + " has already been seen")
|
||||
is_new_article = False
|
||||
lookup_statement = select(KuowStory).where(
|
||||
KuowStory.article_link == article_link
|
||||
)
|
||||
lookup_result = session.scalars(lookup_statement)
|
||||
is_new_article = False
|
||||
|
||||
if is_new_article:
|
||||
print(article_link + " has not been seen, posting")
|
||||
article_lookup = requests.get(kuow_base_url + article_link)
|
||||
article_soup = BeautifulSoup(article_lookup.content, "html.parser")
|
||||
|
||||
try:
|
||||
article_description = (
|
||||
(article_soup.find("meta", attrs={"property": "description"}))
|
||||
.attrs["content"]
|
||||
.strip()
|
||||
schema_org_scriptblock = article_soup.find(
|
||||
"script", {"type": "application/ld+json"}
|
||||
)
|
||||
mastodon.status_post(
|
||||
status=article_description
|
||||
+ "\n"
|
||||
+ kuow_base_url
|
||||
+ article_link
|
||||
+ "\n#KUOW #News",
|
||||
visibility="public",
|
||||
)
|
||||
with open(log_file, "a") as fp:
|
||||
fp.write(article_link + "\n")
|
||||
schema_org = json.loads(schema_org_scriptblock.text)
|
||||
assert schema_org["@context"] == "http://schema.org"
|
||||
last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
|
||||
except:
|
||||
print("Could not load a description/post this article")
|
||||
print(
|
||||
"Could not find or load schema.org data for this post, looking up the meta published time"
|
||||
)
|
||||
try:
|
||||
last_updated_time = datetime.fromisoformat(
|
||||
article_soup.find(
|
||||
"meta", attrs={"property": "article:published_time"}
|
||||
)["content"]
|
||||
)
|
||||
except:
|
||||
print("Could not find or load the meta published time for this post")
|
||||
last_updated_time = datetime.now()
|
||||
|
||||
try:
|
||||
article_record = lookup_result.one()
|
||||
# Only process existing articles if the last updated time doesn't match
|
||||
process_article = (
|
||||
article_record.last_updated_time.astimezone() != last_updated_time
|
||||
)
|
||||
except NoResultFound:
|
||||
# Is a new article, or at least one that doesn't match based on the link
|
||||
article_record = KuowStory()
|
||||
process_article = True
|
||||
is_new_article = True
|
||||
|
||||
if process_article:
|
||||
print("Processing {}".format(article_link))
|
||||
|
||||
try:
|
||||
article_record.pageview_story_id = article_soup.find(
|
||||
"script", {"class": "pageview_story"}
|
||||
)["data-id"]
|
||||
article_record.dfp_targeting_id = article_soup.find(
|
||||
"script", {"class": "dfp_targeting", "data-key": "id"}
|
||||
)["data-value"]
|
||||
except:
|
||||
print("Could not find or load IDs for this post")
|
||||
|
||||
try:
|
||||
tags = article_soup.find(
|
||||
"script", {"class": "dfp_targeting", "data-key": "tags"}
|
||||
)["data-value"].split("|")
|
||||
except:
|
||||
print("Could not find or load any tags for this article")
|
||||
tags = []
|
||||
|
||||
additional_tag_string = ""
|
||||
for tag in tags:
|
||||
# TODO: Do a check for tags in the config file that we don't want to tag posts with
|
||||
additional_tag_string += " #{}".format(tag.title().replace(" ", ""))
|
||||
|
||||
try:
|
||||
article_description = (
|
||||
(article_soup.find("meta", attrs={"property": "description"}))
|
||||
.attrs["content"]
|
||||
.strip()
|
||||
)
|
||||
mastodon_post_result = mastodon.status_post(
|
||||
status=article_description
|
||||
+ "\n"
|
||||
+ kuow_base_url
|
||||
+ article_link
|
||||
+ "\n#KUOW #News{}".format(additional_tag_string),
|
||||
visibility="public",
|
||||
)
|
||||
article_record.post_id = mastodon_post_result["id"]
|
||||
except:
|
||||
print("Could not load a description/post this article")
|
||||
|
||||
article_record.article_link = article_link
|
||||
article_record.last_updated_time = last_updated_time
|
||||
|
||||
if is_new_article:
|
||||
session.add(article_record)
|
||||
session.commit()
|
||||
|
|
|
@ -3,6 +3,7 @@ blurhash==1.1.4
|
|||
certifi==2023.11.17
|
||||
charset-normalizer==3.3.2
|
||||
decorator==5.1.1
|
||||
greenlet==3.0.3
|
||||
idna==3.6
|
||||
Mastodon.py==1.8.1
|
||||
python-dateutil==2.8.2
|
||||
|
@ -11,4 +12,6 @@ PyYAML==6.0.1
|
|||
requests==2.31.0
|
||||
six==1.16.0
|
||||
soupsieve==2.5
|
||||
SQLAlchemy==2.0.25
|
||||
typing_extensions==4.9.0
|
||||
urllib3==2.1.0
|
||||
|
|
Loading…
Add table
Reference in a new issue