"""Scrape the KUOW homepage story grid and cross-post new stories to Mastodon.

State lives in a local SQLite database (kuow.db) so each story is posted at
most once and is re-processed only when its last-updated time changes.
Configuration (Mastodon credentials, language list, excluded tags) is read
from config.yml.
"""

import json
from datetime import datetime
from json.decoder import JSONDecodeError
from typing import Optional

import requests
import yaml
from bs4 import BeautifulSoup
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
from mastodon import Mastodon
from sqlalchemy import create_engine, or_, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class KuowStory(Base):
    """One row per KUOW story we have seen (and possibly posted)."""

    __tablename__ = "kuow_stories"

    # KUOW's "pageview story" ID — the stable identifier for a story.
    pageview_story_id: Mapped[str] = mapped_column(primary_key=True, unique=True)
    dfp_targeting_id: Mapped[str] = mapped_column()
    article_link: Mapped[str] = mapped_column()
    # ISO 639-1 code detected from the article description; None if never posted.
    article_language: Mapped[Optional[str]] = mapped_column()
    last_updated_time: Mapped[datetime] = mapped_column()
    # Mastodon status ID once posted; None until then.
    post_id: Mapped[Optional[str]] = mapped_column()

    def __repr__(self) -> str:
        return f"KuowStory(pageview_story_id={self.pageview_story_id!r}, dfp_targeting_id={self.dfp_targeting_id!r}, article_link={self.article_link!r}, article_language={self.article_language!r}, last_updated_time={self.last_updated_time!r}, post_id={self.post_id!r})"


def get_language_from_iso_code_639_1_str(iso_code_639_1_str: str) -> Language:
    """Map a two-letter ISO 639-1 code (e.g. "en") to a lingua Language."""
    # Fixed local-name typo (was iso_code_369_1).
    iso_code_639_1 = getattr(IsoCode639_1, iso_code_639_1_str.upper())
    return Language.from_iso_code_639_1(iso_code_639_1)


def detect_article_language(article_description: str) -> str:
    """Detect the language of *article_description*.

    Returns the ISO 639-1 code name (e.g. "EN") of the detected language,
    restricted to the configured ``languages`` list; falls back to the
    configured ``default_language`` when lingua cannot decide.
    """
    detector = LanguageDetectorBuilder.from_languages(*languages).build()
    language = detector.detect_language_of(article_description)
    # detect_language_of() returns None when no language is reliably
    # detected; the original reached the same fallback via AttributeError.
    if language is None:
        return default_language.iso_code_639_1.name
    return language.iso_code_639_1.name


engine = create_engine("sqlite:///kuow.db")
Base.metadata.create_all(engine)

kuow_base_url = "https://www.kuow.org"
url = (
    kuow_base_url
    + "/fragments?name=story_grid&source=homepage&view_id=1&page=1&per_page=12"
)

# Fix: close the config file handle (the original called open() with no
# context manager and leaked it).
with open("config.yml") as config_file:
    config = yaml.safe_load(config_file)

mastodon = Mastodon(
    client_id=config["mastodon"]["client_id"],
    client_secret=config["mastodon"]["client_secret"],
    access_token=config["mastodon"]["access_token"],
    api_base_url=config["mastodon"]["api_base_url"],
)

languages = [
    get_language_from_iso_code_639_1_str(language) for language in config["languages"]
]
default_language = get_language_from_iso_code_639_1_str(config["default_language"])

kuow_response = requests.get(url)
soup = BeautifulSoup(kuow_response.content, "html.parser")
articles = soup.find_all("span", class_="txt")
# Reverse articles, so that if multiple new ones have been found, they'll be
# posted in order of when published
articles.reverse()

with Session(engine) as session:
    for article in articles:
        article_link = article.find("a").attrs["href"]
        print("Checking {}".format(article_link))
        is_new_article = False

        article_lookup = requests.get(kuow_base_url + article_link)
        article_soup = BeautifulSoup(article_lookup.content, "html.parser")

        # Prefer the schema.org JSON-LD block for the last-updated time.
        try:
            schema_org_scriptblock = article_soup.find(
                "script", {"type": "application/ld+json"}
            )
            schema_org = json.loads(schema_org_scriptblock.text)
            assert schema_org["@context"] == "http://schema.org"
            last_updated_time = datetime.fromisoformat(schema_org["dateModified"])
        # AttributeError: no JSON-LD <script> at all (find() returned None).
        # KeyError: JSON present but missing "@context"/"dateModified".
        # The original caught only AssertionError/JSONDecodeError and crashed
        # on those two cases instead of falling back.
        except (AssertionError, AttributeError, JSONDecodeError, KeyError):
            print(
                "Could not find or load schema.org data for this post, looking up the meta published time"
            )
            try:
                last_updated_time = datetime.fromisoformat(
                    article_soup.find(
                        "meta", attrs={"property": "article:published_time"}
                    )["content"]
                )
            # find() returns None when the meta tag is absent; subscripting
            # None raises TypeError. The original caught NameError, which can
            # never occur here, so this fallback was unreachable.
            except TypeError:
                print("Could not find or load the meta published time for this post")
                # NOTE(review): naive local time; compared against
                # .astimezone() below — confirm this mix is intended.
                last_updated_time = datetime.now()

        try:
            pageview_story_id = article_soup.find(
                "script", {"class": "pageview_story"}
            )["data-id"]
        except TypeError:  # find() returned None: no pageview_story script
            print(
                "Could not find or load a Pageview story ID, skipping additional processing on this post"
            )
            continue

        try:
            # Match on either identifier so link changes don't duplicate rows.
            lookup_statement = select(KuowStory).where(
                or_(
                    KuowStory.article_link == article_link,
                    KuowStory.pageview_story_id == pageview_story_id,
                )
            )
            lookup_result = session.scalars(lookup_statement)
            article_record = lookup_result.one()
            # Only process existing articles if the last updated time doesn't match
            process_article = (
                article_record.last_updated_time.astimezone() != last_updated_time
            )
        except NoResultFound:
            # Is a new article, or at least one that doesn't match based on the link
            article_record = KuowStory()
            process_article = True
            is_new_article = True

        if process_article:
            print("Processing {}".format(article_link))

            try:
                article_record.dfp_targeting_id = article_soup.find(
                    "script", {"class": "dfp_targeting", "data-key": "id"}
                )["data-value"]
            except TypeError:  # find() returned None
                print("Could not find or load IDs for this post")

            # Hashtags come from two pipe-delimited DFP targeting properties.
            tags: list[str] = []
            try:
                tags.extend(
                    article_soup.find(
                        "script", {"class": "dfp_targeting", "data-key": "tags"}
                    )["data-value"].split("|")
                )
            except TypeError:
                print("Could not find or load any tags from the 'tags' property")
            try:
                tags.extend(
                    article_soup.find(
                        "script", {"class": "dfp_targeting", "data-key": "topics"}
                    )["data-value"].split("|")
                )
            except TypeError:
                # Fix: the original printed "'tags'" here too (copy-paste).
                print("Could not find or load any tags from the 'topics' property")

            # Remove duplicates
            tags = list(set(tags))
            additional_tag_string = ""
            for tag in tags:
                # CamelCase the tag and drop characters hashtags can't carry.
                tag = tag.title().replace(" ", "").replace("&", "And")
                if tag.casefold() in config["exclude_tags"]:
                    print(
                        "Tag {} was found in the article, but won't be included in the post".format(
                            tag
                        )
                    )
                else:
                    additional_tag_string += " #{}".format(tag)

            try:
                article_description = (
                    (article_soup.find("meta", attrs={"property": "description"}))
                    .attrs["content"]
                    .strip()
                )
                if not article_record.post_id:
                    print("Posting to Mastodon")
                    article_language = detect_article_language(article_description)
                    article_record.article_language = article_language
                    mastodon_post_result = mastodon.status_post(
                        status=article_description
                        + "\n"
                        + kuow_base_url
                        + article_link
                        + "\n#KUOW #News{}".format(additional_tag_string),
                        visibility="public",
                        language=article_language,
                    )
                    article_record.post_id = mastodon_post_result["id"]
                else:
                    print("Article has already been posted")
            except Exception as post_error:
                # Fix: surface the actual failure instead of swallowing it
                # silently. Still best-effort: one failed post must not abort
                # the whole run.
                print(
                    "Could not load a description/post this article: {}".format(
                        post_error
                    )
                )

            article_record.pageview_story_id = pageview_story_id
            article_record.article_link = article_link
            article_record.last_updated_time = last_updated_time

            if is_new_article:
                session.add(article_record)
            session.commit()