diff --git a/kuow_fetcher.py b/kuow_fetcher.py index 670377b..9413d0b 100644 --- a/kuow_fetcher.py +++ b/kuow_fetcher.py @@ -8,7 +8,7 @@ import yaml from bs4 import BeautifulSoup from lingua import IsoCode639_1, Language, LanguageDetectorBuilder from mastodon import Mastodon -from sqlalchemy import create_engine, select, or_ +from sqlalchemy import create_engine, select from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column @@ -75,6 +75,11 @@ with Session(engine) as session: for article in articles: article_link = article.find("a").attrs["href"] print("Checking {}".format(article_link)) + + lookup_statement = select(KuowStory).where( + KuowStory.article_link == article_link + ) + lookup_result = session.scalars(lookup_statement) is_new_article = False article_lookup = requests.get(kuow_base_url + article_link) @@ -100,23 +105,8 @@ with Session(engine) as session: except NameError: print("Could not find or load the meta published time for this post") last_updated_time = datetime.now() - try: - pageview_story_id = article_soup.find( - "script", {"class": "pageview_story"} - )["data-id"] - except (NameError, TypeError): - print( - "Could not find or load a Pageview story ID, skipping additional processing on this post" - ) - continue + try: - lookup_statement = select(KuowStory).where( - or_( - KuowStory.article_link == article_link, - KuowStory.pageview_story_id == pageview_story_id, - ) - ) - lookup_result = session.scalars(lookup_statement) article_record = lookup_result.one() # Only process existing articles if the last updated time doesn't match process_article = ( @@ -132,6 +122,9 @@ with Session(engine) as session: print("Processing {}".format(article_link)) try: + article_record.pageview_story_id = article_soup.find( + "script", {"class": "pageview_story"} + )["data-id"] article_record.dfp_targeting_id = article_soup.find( "script", {"class": "dfp_targeting", "data-key": "id"} )["data-value"] @@ -199,7 +192,6 @@ with Session(engine) as session: except Exception: print("Could not load a description/post this article") - article_record.pageview_story_id = pageview_story_id article_record.article_link = article_link article_record.last_updated_time = last_updated_time