kuow-mastodon-bot/kuow_fetcher.py

import json
from datetime import datetime
from json.decoder import JSONDecodeError
from typing import Optional

import requests
import yaml
from bs4 import BeautifulSoup
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
from mastodon import Mastodon
from sqlalchemy import create_engine, select, or_
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this file inherit from."""


class KuowStory(Base):
    """ORM model for one row of the ``kuow_stories`` table."""

    __tablename__ = "kuow_stories"

    # Primary key of the table
    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
    # Required string column
    dfp_targeting_id: Mapped[str] = mapped_column()
    # Required string column holding the link of the article
    article_link: Mapped[str] = mapped_column()
    # Optional string column
    article_language: Mapped[Optional[str]] = mapped_column()
    # Required datetime column
    last_updated_time: Mapped[datetime] = mapped_column()
    # Optional string column
    post_id: Mapped[Optional[str]] = mapped_column()

    def __repr__(self) -> str:
        """Return a ``KuowStory(field=value, ...)`` string listing every column."""
        column_names = (
            "pageview_story_id",
            "dfp_targeting_id",
            "article_link",
            "article_language",
            "last_updated_time",
            "post_id",
        )
        rendered_columns = ", ".join(
            f"{column_name}={getattr(self, column_name)!r}"
            for column_name in column_names
        )
        return f"KuowStory({rendered_columns})"


def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:
    """Return the lingua ``Language`` matching an ISO 639-1 code string.

    The string is upper-cased and looked up as an attribute of
    ``IsoCode639_1``, so a code that has no such attribute raises
    ``AttributeError``.
    """
    iso_code_member_name = iso_code_639_1_str.upper()
    return Language.from_iso_code_639_1(getattr(IsoCode639_1, iso_code_member_name))


def detect_article_language(article_description: str) -> str:
    """Return the name of the ISO 639-1 code detected for ``article_description``.

    The detector is built from the module-level ``languages`` list. When the
    detector does not return a language, the code name of the module-level
    ``default_language`` is returned instead.
    """
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    language = detector.detect_language_of(article_description)
    # Check for "no result" explicitly rather than catching AttributeError, which
    # would also hide unrelated attribute errors raised during detection
    if language is None:
        return default_language.iso_code_639_1.name
    return language.iso_code_639_1.name


# SQLite database file used to remember which stories have been seen; the
# tables declared on Base are created if they do not exist yet
engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine)

kuow_base_url = "https://www.kuow.org"
# Story grid fragment of the homepage (first page, 12 entries)
url = (
    kuow_base_url
    + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
)
# Read the configuration through a context manager so the file handle is closed
# again, and decode it as UTF-8 instead of the locale's default encoding
with open("config.yml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)
mastodon = Mastodon(
    client_id=config["mastodon"]["client_id"],
    client_secret=config["mastodon"]["client_secret"],
    access_token=config["mastodon"]["access_token"],
    api_base_url=config["mastodon"]["api_base_url"],
)

# Languages the detector may choose from, and the one to fall back to
languages = [
    get_language_from_iso_code_639_1_str(language) for language in config["languages"]
]
default_language = get_language_from_iso_code_639_1_str(config["default_language"])

# Fetch the story grid and collect its entries. requests applies no timeout by
# default, so set one to keep a stalled connection from hanging the run forever.
kuow_response = requests.get(url, timeout=30)
soup = BeautifulSoup(kuow_response.content, "html.parser")
articles = soup.find_all("span", class_="txt")
# Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published
articles.reverse()

# Walk the scraped story entries: look each one up in the database, post the
# ones that have no Mastodon post yet, and store what was found.
with Session(engine) as session:
    for article in articles:
        # Skip entries without a usable link instead of crashing the whole run
        article_anchor = article.find("a")
        article_link = (
            article_anchor.get("href") if article_anchor is not None else None
        )
        if not article_link:
            print("Could not find a link for this story, skipping it")
            continue
        print("Checking {}".format(article_link))
        is_new_article = False

        try:
            # requests applies no timeout by default; without one a stalled
            # connection would block the run forever
            article_lookup = requests.get(kuow_base_url + article_link, timeout=30)
        except requests.RequestException as error:
            print("Could not fetch {}, skipping it: {}".format(article_link, error))
            continue
        article_soup = BeautifulSoup(article_lookup.content, "html.parser")

        # Prefer the schema.org modification time, fall back to the meta
        # published time, and finally to the current time
        try:
            schema_org_scriptblock = article_soup.find(
                "script", {"type": "application/ld+json"}
            )
            schema_org = json.loads(schema_org_scriptblock.text)
            # Explicit check instead of assert, which is stripped under -O
            if schema_org["@context"] != "http://schema.org":
                raise ValueError("Unexpected schema.org context")
            last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
        except (AttributeError, JSONDecodeError, KeyError, TypeError, ValueError):
            # AttributeError: no script block; JSONDecodeError: invalid JSON;
            # KeyError/TypeError: unexpected JSON shape; ValueError: wrong
            # context or unparsable date
            print(
                "Could not find or load schema.org data for this post, looking up the meta published time"
            )
            try:
                last_updated_time = datetime.fromisoformat(
                    article_soup.find(
                        "meta", attrs={"property": "article:published_time"}
                    )["content"]
                )
            except (KeyError, TypeError, ValueError):
                # TypeError: no meta tag; KeyError: no content attribute;
                # ValueError: unparsable date
                print("Could not find or load the meta published time for this post")
                # Timezone-aware, like the value it is compared against below
                last_updated_time = datetime.now().astimezone()
        try:
            pageview_story_id = article_soup.find(
                "script", {"class": "pageview_story"}
            )["data-id"]
        except (KeyError, TypeError):
            print(
                "Could not find or load a Pageview story ID, skipping additional processing on this post"
            )
            continue
        try:
            lookup_statement = select(KuowStory).where(
                or_(
                    KuowStory.article_link == article_link,
                    KuowStory.pageview_story_id == pageview_story_id,
                )
            )
            lookup_result = session.scalars(lookup_statement)
            article_record = lookup_result.one()
            # Only process existing articles if the last updated time doesn't match
            process_article = (
                article_record.last_updated_time.astimezone() != last_updated_time
            )
        except NoResultFound:
            # Is a new article
            article_record = KuowStory()
            process_article = True
            is_new_article = True

        if process_article:
            print("Processing {}".format(article_link))

            try:
                article_record.dfp_targeting_id = article_soup.find(
                    "script", {"class": "dfp_targeting", "data-key": "id"}
                )["data-value"]
            except (KeyError, TypeError):
                print("Could not find or load IDs for this post")
                if is_new_article:
                    # The column is declared non-nullable; leaving it unset
                    # would make the commit below fail after the article has
                    # already been posted, so it would be posted again next run
                    article_record.dfp_targeting_id = ""

            # Collect the raw tag names from both targeting properties
            raw_tags: list[str] = []
            for data_key in ("tags", "topics"):
                try:
                    raw_tags.extend(
                        article_soup.find(
                            "script", {"class": "dfp_targeting", "data-key": data_key}
                        )["data-value"].split("|")
                    )
                except (KeyError, TypeError):
                    print(
                        "Could not find or load any tags from the '{}' property".format(
                            data_key
                        )
                    )

            # Normalise first and de-duplicate afterwards, so differently
            # written duplicates collapse, empty entries are dropped and the
            # hashtags keep the order in which they were found
            hashtags: list[str] = []
            seen_tags: set[str] = set()
            for raw_tag in raw_tags:
                tag = raw_tag.title().replace(" ", "").replace("&", "And")
                if not tag or tag in seen_tags:
                    continue
                seen_tags.add(tag)
                if tag.casefold() in config["exclude_tags"]:
                    print(
                        "Tag {} was found in the article, but won't be included in the post".format(
                            tag
                        )
                    )
                else:
                    hashtags.append(tag)
            additional_tag_string = "".join(" #{}".format(tag) for tag in hashtags)

            try:
                article_description = (
                    (article_soup.find("meta", attrs={"property": "description"}))
                    .attrs["content"]
                    .strip()
                )
                if not article_record.post_id:
                    print("Posting to Mastodon")

                    article_language = detect_article_language(article_description)
                    article_record.article_language = article_language

                    mastodon_post_result = mastodon.status_post(
                        status=article_description
                        + "\n"
                        + kuow_base_url
                        + article_link
                        + "\n#KUOW #News{}".format(additional_tag_string),
                        visibility="public",
                        language=article_language,
                    )
                    # post_id is a string column
                    article_record.post_id = str(mastodon_post_result["id"])
                else:
                    print("Article has already been posted")
            except Exception as error:
                # Best effort: a failed post must not stop the remaining
                # articles, but report what went wrong
                print(
                    "Could not load a description/post this article: {}".format(error)
                )

            article_record.pageview_story_id = pageview_story_id
            article_record.article_link = article_link
            article_record.last_updated_time = last_updated_time

            if is_new_article:
                session.add(article_record)
            session.commit()
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`import json`
			`from datetime import datetime`
Fix bare excepts 2024-02-10 20:22:10 -08:00			`from json.decoder import JSONDecodeError`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`from typing import Optional`

Initial version 2024-01-14 14:48:09 -08:00			`import requests`
Switch to yaml config 2024-01-14 15:12:33 -08:00			`import yaml`
Initial version 2024-01-14 14:48:09 -08:00			`from bs4 import BeautifulSoup`
Initial langage detection 2024-02-11 13:45:54 -08:00			`from lingua import IsoCode639_1, Language, LanguageDetectorBuilder`
Initial version 2024-01-14 14:48:09 -08:00			`from mastodon import Mastodon`
Also match posts by Pageview ID 2024-02-22 06:38:08 -08:00			`from sqlalchemy import create_engine, select, or_`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`from sqlalchemy.exc import NoResultFound`
			`from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column`


			`class Base(DeclarativeBase):`
			`pass`


			`class KuowStory(Base):`
			`__tablename__ = "kuow_stories"`
			`pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)`
			`dfp_targeting_id: Mapped[str] = mapped_column()`
			`article_link: Mapped[str] = mapped_column()`
Initial langage detection 2024-02-11 13:45:54 -08:00			`article_language: Mapped[Optional[str]] = mapped_column()`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`last_updated_time: Mapped[datetime] = mapped_column()`
			`post_id: Mapped[Optional[str]] = mapped_column()`

			`def __repr__(self) -> str:`
Initial langage detection 2024-02-11 13:45:54 -08:00			`return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, article_language={self.article_language!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00

Initial langage detection 2024-02-11 13:45:54 -08:00			`def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:`
			`iso_code_369_1 = getattr(IsoCode639_1, iso_code_639_1_str.upper())`
			`return Language.from_iso_code_639_1(iso_code_369_1)`


			`def detect_article_language(article_description: str) -> str:`
			`detector = LanguageDetectorBuilder.from_languages(*languages).build()`
			`try:`
			`language = detector.detect_language_of(article_description)`
			`return language.iso_code_639_1.name`
			`except AttributeError:`
			`return default_language.iso_code_639_1.name`

Improve tagging 2024-02-14 08:18:08 -08:00
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`engine = create_engine("sqlite:///kuow.db")`
			`Base.metadata.create_all(engine)`
Fix formatting 2024-01-14 14:48:09 -08:00
			`kuow_base_url = "https://www.kuow.org"`
			`url = (`
			`kuow_base_url`
			`+ "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"`
			`)`
Switch to yaml config 2024-01-14 15:12:33 -08:00			`config = yaml.safe_load(open("config.yml"))`
			`mastodon = Mastodon(`
			`client_id=config["mastodon"]["client_id"],`
			`client_secret=config["mastodon"]["client_secret"],`
			`access_token=config["mastodon"]["access_token"],`
			`api_base_url=config["mastodon"]["api_base_url"],`
			`)`
Initial version 2024-01-14 14:48:09 -08:00
Initial langage detection 2024-02-11 13:45:54 -08:00			`languages = [`
			`get_language_from_iso_code_639_1_str(language) for language in config["languages"]`
			`]`
			`default_language = get_language_from_iso_code_639_1_str(config["default_language"])`

Switch to yaml config 2024-01-14 15:12:33 -08:00			`kuow_response = requests.get(url)`
			`soup = BeautifulSoup(kuow_response.content, "html.parser")`
Fix formatting 2024-01-14 14:48:09 -08:00			`articles = soup.find_all("span", class_="txt")`
Reverse articles before processing 2024-01-14 14:48:09 -08:00			`# Reverse articles, so that if multiple new ones have been found, they'll be posted in order of when published`
			`articles.reverse()`

Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`with Session(engine) as session:`
			`for article in articles:`
			`article_link = article.find("a").attrs["href"]`
			`print("Checking {}".format(article_link))`
			`is_new_article = False`
Initial version 2024-01-14 14:48:09 -08:00
Fix formatting 2024-01-14 14:48:09 -08:00			`article_lookup = requests.get(kuow_base_url + article_link)`
			`article_soup = BeautifulSoup(article_lookup.content, "html.parser")`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00
Initial version 2024-01-14 14:48:09 -08:00			`try:`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`schema_org_scriptblock = article_soup.find(`
			`"script", {"type": "application/ld+json"}`
Fix formatting 2024-01-14 14:48:09 -08:00			`)`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`schema_org = json.loads(schema_org_scriptblock.text)`
			`assert schema_org["@context"] == "http://schema.org"`
			`last_updated_time = datetime.fromisoformat(schema_org["dateModified"])`
Fix bare excepts 2024-02-10 20:22:10 -08:00			`except (AssertionError, JSONDecodeError):`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`print(`
			`"Could not find or load schema.org data for this post, looking up the meta published time"`
			`)`
			`try:`
			`last_updated_time = datetime.fromisoformat(`
			`article_soup.find(`
			`"meta", attrs={"property": "article:published_time"}`
			`)["content"]`
			`)`
Fix bare excepts 2024-02-10 20:22:10 -08:00			`except NameError:`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`print("Could not find or load the meta published time for this post")`
			`last_updated_time = datetime.now()`
Fix indent on pageview lookup 2024-02-22 06:41:44 -08:00			`try:`
			`pageview_story_id = article_soup.find(`
			`"script", {"class": "pageview_story"}`
			`)["data-id"]`
			`except (NameError, TypeError):`
			`print(`
			`"Could not find or load a Pageview story ID, skipping additional processing on this post"`
			`)`
			`continue`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`try:`
Also match posts by Pageview ID 2024-02-22 06:38:08 -08:00			`lookup_statement = select(KuowStory).where(`
			`or_(`
			`KuowStory.article_link == article_link,`
			`KuowStory.pageview_story_id == pageview_story_id,`
			`)`
			`)`
			`lookup_result = session.scalars(lookup_statement)`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`article_record = lookup_result.one()`
			`# Only process existing articles if the last updated time doesn't match`
			`process_article = (`
			`article_record.last_updated_time.astimezone() != last_updated_time`
			`)`
			`except NoResultFound:`
Update comments 2024-02-22 07:14:29 -08:00			`# Is a new article`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`article_record = KuowStory()`
			`process_article = True`
			`is_new_article = True`

			`if process_article:`
			`print("Processing {}".format(article_link))`

			`try:`
			`article_record.dfp_targeting_id = article_soup.find(`
			`"script", {"class": "dfp_targeting", "data-key": "id"}`
			`)["data-value"]`
Update metadata exception types 2024-02-14 07:38:07 -08:00			`except (NameError, TypeError):`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`print("Could not find or load IDs for this post")`

Improve tagging 2024-02-14 08:18:08 -08:00			`tags: list[str] = []`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`try:`
Improve tagging 2024-02-14 08:18:08 -08:00			`tags.extend(`
			`article_soup.find(`
			`"script", {"class": "dfp_targeting", "data-key": "tags"}`
			`)["data-value"].split("\|")`
			`)`
Update metadata exception types 2024-02-14 07:38:07 -08:00			`except (NameError, TypeError):`
Improve tagging 2024-02-14 08:18:08 -08:00			`print("Could not find or load any tags from the 'tags' property")`

			`try:`
			`tags.extend(`
			`article_soup.find(`
			`"script", {"class": "dfp_targeting", "data-key": "topics"}`
			`)["data-value"].split("\|")`
			`)`
			`except (NameError, TypeError):`
			`print("Could not find or load any tags from the 'tags' property")`

Update comments 2024-02-22 07:14:29 -08:00			`# Remove duplicate tags`
Improve tagging 2024-02-14 08:18:08 -08:00			`tags = list(set(tags))`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00
			`additional_tag_string = ""`
			`for tag in tags:`
Improve tagging 2024-02-14 08:18:08 -08:00			`tag = tag.title().replace(" ", "").replace("&", "And")`
			`if tag.casefold() in config["exclude_tags"]:`
			`print(`
			`"Tag {} was found in the article, but won't be included in the post".format(`
			`tag`
			`)`
			`)`
			`else:`
			`additional_tag_string += " #{}".format(tag)`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00
			`try:`
			`article_description = (`
			`(article_soup.find("meta", attrs={"property": "description"}))`
			`.attrs["content"]`
			`.strip()`
			`)`
Only post the article if it hasn't been already 2024-01-15 08:52:24 -08:00			`if not article_record.post_id:`
Print if anything is going to be posted or not 2024-01-16 15:28:47 -08:00			`print("Posting to Mastodon")`
Initial langage detection 2024-02-11 13:45:54 -08:00
			`article_language = detect_article_language(article_description)`
			`article_record.article_language = article_language`

Only post the article if it hasn't been already 2024-01-15 08:52:24 -08:00			`mastodon_post_result = mastodon.status_post(`
			`status=article_description`
			`+ "\n"`
			`+ kuow_base_url`
			`+ article_link`
			`+ "\n#KUOW #News{}".format(additional_tag_string),`
			`visibility="public",`
Initial langage detection 2024-02-11 13:45:54 -08:00			`language=article_language,`
Only post the article if it hasn't been already 2024-01-15 08:52:24 -08:00			`)`
			`article_record.post_id = mastodon_post_result["id"]`
Print if anything is going to be posted or not 2024-01-16 15:28:47 -08:00			`else:`
			`print("Article has already been posted")`
Fix bare excepts 2024-02-10 20:22:10 -08:00			`except Exception:`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`print("Could not load a description/post this article")`

Also match posts by Pageview ID 2024-02-22 06:38:08 -08:00			`article_record.pageview_story_id = pageview_story_id`
Migrate to SQLAlchemy for storage 2024-01-15 08:47:46 -08:00			`article_record.article_link = article_link`
			`article_record.last_updated_time = last_updated_time`

			`if is_new_article:`
			`session.add(article_record)`
			`session.commit()`