Merge pull request 'Also match posts by Pageview ID' (#23) from fix-duplicate-posts into main
All checks were successful
ci/woodpecker/push/lint Pipeline was successful
All checks were successful
ci/woodpecker/push/lint Pipeline was successful
Reviewed-on: #23
This commit is contained in:
commit
84ca53d320
1 changed file with 18 additions and 10 deletions
|
@@ -8,7 +8,7 @@ import yaml
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
|
from lingua import IsoCode639_1, Language, LanguageDetectorBuilder
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
from sqlalchemy import create_engine, select
|
from sqlalchemy import create_engine, select, or_
|
||||||
from sqlalchemy.exc import NoResultFound
|
from sqlalchemy.exc import NoResultFound
|
||||||
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
|
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column
|
||||||
|
|
||||||
|
@@ -75,11 +75,6 @@ with Session(engine) as session:
|
||||||
for article in articles:
|
for article in articles:
|
||||||
article_link = article.find("a").attrs["href"]
|
article_link = article.find("a").attrs["href"]
|
||||||
print("Checking {}".format(article_link))
|
print("Checking {}".format(article_link))
|
||||||
|
|
||||||
lookup_statement = select(KuowStory).where(
|
|
||||||
KuowStory.article_link == article_link
|
|
||||||
)
|
|
||||||
lookup_result = session.scalars(lookup_statement)
|
|
||||||
is_new_article = False
|
is_new_article = False
|
||||||
|
|
||||||
article_lookup = requests.get(kuow_base_url + article_link)
|
article_lookup = requests.get(kuow_base_url + article_link)
|
||||||
|
@@ -105,8 +100,23 @@ with Session(engine) as session:
|
||||||
except NameError:
|
except NameError:
|
||||||
print("Could not find or load the meta published time for this post")
|
print("Could not find or load the meta published time for this post")
|
||||||
last_updated_time = datetime.now()
|
last_updated_time = datetime.now()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
pageview_story_id = article_soup.find(
|
||||||
|
"script", {"class": "pageview_story"}
|
||||||
|
)["data-id"]
|
||||||
|
except (NameError, TypeError):
|
||||||
|
print(
|
||||||
|
"Could not find or load a Pageview story ID, skipping additional processing on this post"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
lookup_statement = select(KuowStory).where(
|
||||||
|
or_(
|
||||||
|
KuowStory.article_link == article_link,
|
||||||
|
KuowStory.pageview_story_id == pageview_story_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
lookup_result = session.scalars(lookup_statement)
|
||||||
article_record = lookup_result.one()
|
article_record = lookup_result.one()
|
||||||
# Only process existing articles if the last updated time doesn't match
|
# Only process existing articles if the last updated time doesn't match
|
||||||
process_article = (
|
process_article = (
|
||||||
|
@@ -122,9 +132,6 @@ with Session(engine) as session:
|
||||||
print("Processing {}".format(article_link))
|
print("Processing {}".format(article_link))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
article_record.pageview_story_id = article_soup.find(
|
|
||||||
"script", {"class": "pageview_story"}
|
|
||||||
)["data-id"]
|
|
||||||
article_record.dfp_targeting_id = article_soup.find(
|
article_record.dfp_targeting_id = article_soup.find(
|
||||||
"script", {"class": "dfp_targeting", "data-key": "id"}
|
"script", {"class": "dfp_targeting", "data-key": "id"}
|
||||||
)["data-value"]
|
)["data-value"]
|
||||||
|
@@ -192,6 +199,7 @@ with Session(engine) as session:
|
||||||
except Exception:
|
except Exception:
|
||||||
print("Could not load a description/post this article")
|
print("Could not load a description/post this article")
|
||||||
|
|
||||||
|
article_record.pageview_story_id = pageview_story_id
|
||||||
article_record.article_link = article_link
|
article_record.article_link = article_link
|
||||||
article_record.last_updated_time = last_updated_time
|
article_record.last_updated_time = last_updated_time
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue