From 1928e00d3d67a3f4472227a7943686bb830de5d9 Mon Sep 17 00:00:00 2001 From: Liam Steckler Date: Thu, 22 Feb 2024 06:38:08 -0800 Subject: [PATCH] Also match posts by Pageview ID --- kuow_fetcher.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/kuow_fetcher.py b/kuow_fetcher.py index 9413d0b..670377b 100644 --- a/kuow_fetcher.py +++ b/kuow_fetcher.py @@ -8,7 +8,7 @@ import yaml from bs4 import BeautifulSoup from lingua import IsoCode639_1, Language, LanguageDetectorBuilder from mastodon import Mastodon -from sqlalchemy import create_engine, select +from sqlalchemy import create_engine, select, or_ from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column @@ -75,11 +75,6 @@ with Session(engine) as session: for article in articles: article_link = article.find("a").attrs["href"] print("Checking {}".format(article_link)) - - lookup_statement = select(KuowStory).where( - KuowStory.article_link == article_link - ) - lookup_result = session.scalars(lookup_statement) is_new_article = False article_lookup = requests.get(kuow_base_url + article_link) @@ -105,8 +100,23 @@ with Session(engine) as session: except NameError: print("Could not find or load the meta published time for this post") last_updated_time = datetime.now() - + try: + pageview_story_id = article_soup.find( + "script", {"class": "pageview_story"} + )["data-id"] + except (NameError, TypeError): + print( + "Could not find or load a Pageview story ID, skipping additional processing on this post" + ) + continue try: + lookup_statement = select(KuowStory).where( + or_( + KuowStory.article_link == article_link, + KuowStory.pageview_story_id == pageview_story_id, + ) + ) + lookup_result = session.scalars(lookup_statement) article_record = lookup_result.one() # Only process existing articles if the last updated time doesn't match process_article = ( @@ -122,9 +132,6 @@ with Session(engine) as session: print("Processing {}".format(article_link)) try: - article_record.pageview_story_id = article_soup.find( - "script", {"class": "pageview_story"} - )["data-id"] article_record.dfp_targeting_id = article_soup.find( "script", {"class": "dfp_targeting", "data-key": "id"} )["data-value"] @@ -192,6 +199,7 @@ with Session(engine) as session: except Exception: print("Could not load a description/post this article") + article_record.pageview_story_id = pageview_story_id article_record.article_link = article_link article_record.last_updated_time = last_updated_time