Also match posts by Pageview ID #23

Merged
buckbanzai merged 1 commit from fix-duplicate-posts into main 2024-02-22 06:38:58 -08:00
Showing only changes of commit 1928e00d3d


@@ -8,7 +8,7 @@ import yaml
 from bs4 import BeautifulSoup
 from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
 from mastodon import Mastodon
-from sqlalchemy import create_engine, select
+from sqlalchemy import create_engine, select, or_
 from sqlalchemy.exc import NoResultFound
 from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
@@ -75,11 +75,6 @@ with Session(engine) as session:
     for article in articles:
         article_link = article.find("a").attrs["href"]
         print("Checking {}".format(article_link))
-        lookup_statement = select(KuowStory).where(
-            KuowStory.article_link == article_link
-        )
-        lookup_result = session.scalars(lookup_statement)
 
         is_new_article = False
         article_lookup = requests.get(kuow_base_url + article_link)
@@ -105,8 +100,23 @@ with Session(engine) as session:
         except NameError:
             print("Could not find or load the meta published time for this post")
             last_updated_time = datetime.now()
 
         try:
+            pageview_story_id = article_soup.find(
+                "script", {"class": "pageview_story"}
+            )["data-id"]
+        except (NameError, TypeError):
+            print(
+                "Could not find or load a Pageview story ID, skipping additional processing on this post"
+            )
+            continue
+        try:
+            lookup_statement = select(KuowStory).where(
+                or_(
+                    KuowStory.article_link == article_link,
+                    KuowStory.pageview_story_id == pageview_story_id,
+                )
+            )
+            lookup_result = session.scalars(lookup_statement)
             article_record = lookup_result.one()
             # Only process existing articles if the last updated time doesn't match
             process_article = (
@@ -122,9 +132,6 @@ with Session(engine) as session:
             print("Processing {}".format(article_link))
 
             try:
-                article_record.pageview_story_id = article_soup.find(
-                    "script", {"class": "pageview_story"}
-                )["data-id"]
                 article_record.dfp_targeting_id = article_soup.find(
                     "script", {"class": "dfp_targeting", "data-key": "id"}
                 )["data-value"]
@@ -192,6 +199,7 @@ with Session(engine) as session:
             except Exception:
                 print("Could not load a description/post this article")
 
+            article_record.pageview_story_id = pageview_story_id
             article_record.article_link = article_link
             article_record.last_updated_time = last_updated_time
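
For readers following along outside the diff, here is a minimal sketch of the Pageview story ID extraction this change relies on. The HTML below is illustrative only, not a real KUOW article page; it just mirrors the script tag shape the bot looks for.

from bs4 import BeautifulSoup

# Illustrative markup; the bot expects a script tag with class
# "pageview_story" whose data-id attribute carries the story ID.
sample_html = """
<html><head>
  <script class="pageview_story" data-id="2198765"></script>
</head><body></body></html>
"""

article_soup = BeautifulSoup(sample_html, "html.parser")
try:
    # find() returns None when the tag is missing, and subscripting None
    # raises TypeError; the bot treats that case as "skip this post".
    pageview_story_id = article_soup.find(
        "script", {"class": "pageview_story"}
    )["data-id"]
except (NameError, TypeError):
    pageview_story_id = None

print(pageview_story_id)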
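
And a standalone sketch of the new either-column lookup, with a simplified stand-in for the script's KuowStory model. The table name, engine URL, and sample values here are assumptions for illustration, not taken from the bot.

from sqlalchemy import create_engine, or_, select
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


# Simplified stand-in for the script's KuowStory model; only the columns
# used by the new lookup are included here.
class KuowStory(Base):
    __tablename__ = "kuow_stories"
    id: Mapped[int] = mapped_column(primary_key=True)
    article_link: Mapped[str] = mapped_column()
    pageview_story_id: Mapped[str] = mapped_column()


engine = create_engine("sqlite://")  # in-memory database for illustration
Base.metadata.create_all(engine)

article_link = "/stories/example-story"  # illustrative values
pageview_story_id = "2198765"

with Session(engine) as session:
    # Match an existing row on either the article link or the Pageview
    # story ID, so a story whose URL changes is not posted a second time.
    lookup_statement = select(KuowStory).where(
        or_(
            KuowStory.article_link == article_link,
            KuowStory.pageview_story_id == pageview_story_id,
        )
    )
    try:
        article_record = session.scalars(lookup_statement).one()
    except NoResultFound:
        # Neither column matched: treat this as a brand-new article.
        article_record = KuowStory()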